diff --git a/.gitignore b/.gitignore index e4673a2851d13..a3578a69d20fa 100644 --- a/.gitignore +++ b/.gitignore @@ -23,9 +23,12 @@ flink-runtime-web/web-dashboard/node/ flink-runtime-web/web-dashboard/node_modules/ flink-runtime-web/web-dashboard/web/ flink-python/dist/ +flink-python/apache-flink-libraries/dist/ flink-python/build/ +flink-python/apache-flink-libraries/build flink-python/pyflink.egg-info/ flink-python/apache_flink.egg-info/ +flink-python/apache-flink-libraries/apache_flink_libraries.egg-info/ flink-python/docs/_build flink-python/.tox/ flink-python/dev/download @@ -34,12 +37,12 @@ flink-python/dev/log/ flink-python/dev/.stage.txt flink-python/.eggs/ flink-python/apache-flink-*.dev0/ +flink-python/apache-flink-libraries/apache_flink_libraries-*.dev0/ flink-python/**/*.c flink-python/**/*.so atlassian-ide-plugin.xml out/ /docs/api -/docs/content /docs/.bundle /docs/.rubydeps /docs/ruby2/.bundle diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..6cb7e5ce65305 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/themes/book"] + path = docs/themes/book + url = https://github.com/alex-shpak/hugo-book diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6d74a8567fcc5..cbe3b11650938 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -38,7 +38,7 @@ resources: containers: # Container with Maven 3.2.5, SSL to have the same environment everywhere. - container: flink-build-container - image: rmetzger/flink-ci:ubuntu-amd64-f009d96 + image: rmetzger/flink-ci:ubuntu-amd64-7ac4e28 # On AZP provided machines, set this flag to allow writing coredumps in docker options: --privileged @@ -49,6 +49,7 @@ resources: variables: MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository E2E_CACHE_FOLDER: $(Pipeline.Workspace)/e2e_cache + E2E_TARBALL_CACHE: $(Pipeline.Workspace)/e2e_artifact_cache MAVEN_OPTS: '-Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)' CACHE_KEY: maven | $(Agent.OS) | **/pom.xml, !**/target/** CACHE_FALLBACK_KEY: maven | $(Agent.OS) @@ -56,6 +57,8 @@ variables: SECRET_S3_BUCKET: $[variables.IT_CASE_S3_BUCKET] SECRET_S3_ACCESS_KEY: $[variables.IT_CASE_S3_ACCESS_KEY] SECRET_S3_SECRET_KEY: $[variables.IT_CASE_S3_SECRET_KEY] + SECRET_GLUE_SCHEMA_ACCESS_KEY: $[variables.IT_CASE_GLUE_SCHEMA_ACCESS_KEY] + SECRET_GLUE_SCHEMA_SECRET_KEY: $[variables.IT_CASE_GLUE_SCHEMA_SECRET_KEY] stages: @@ -79,16 +82,7 @@ stages: pool: vmImage: 'ubuntu-16.04' steps: - - task: UseRubyVersion@0 - inputs: - versionSpec: '= 2.4' - addToPath: true - script: ./tools/ci/docs.sh - # upload spider.log for debugging - - task: PublishPipelineArtifact@1 - inputs: - targetPath: ./docs/spider.log - artifact: spider.log # CI / Special stage for release, e.g. building PyFlink wheel packages, etc: - stage: ci_release displayName: "CI build (release)" diff --git a/docs/.gitignore b/docs/.gitignore index 3d6212de50701..270dd7f731332 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -2,7 +2,7 @@ .jekyll-metadata .jekyll-cache/ .rubydeps/ -content/ -content_*/ ruby2/.bundle/ ruby2/.rubydeps/ +public/ +resources/ diff --git a/docs/404.md b/docs/404.md deleted file mode 100644 index 5a25bad7426f0..0000000000000 --- a/docs/404.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "404" -permalink: /404.html -layout: 404_base ---- - - -The page you are looking for has been moved. This could be because of a recent reorganization of the -documentation. Redirecting to [Documentation Home Page](/) in 5 seconds. 
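The new `.gitmodules` entry above means the Hugo book theme is no longer vendored in the repository; a fresh checkout needs the submodule fetched before the docs can be built. A minimal sketch of the commands involved, assuming a standard clone of the Apache Flink GitHub repository (not shown in this diff):

```sh
# Clone Flink and pull the docs theme submodule in one step
git clone --recurse-submodules https://github.com/apache/flink.git

# Or, for an existing checkout, fetch only the theme referenced in .gitmodules
cd flink
git submodule update --init --recursive docs/themes/book
```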
diff --git a/docs/404.zh.md b/docs/404.zh.md deleted file mode 100644 index 5a25bad7426f0..0000000000000 --- a/docs/404.zh.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "404" -permalink: /404.html -layout: 404_base ---- - - -The page you are looking for has been moved. This could be because of a recent reorganization of the -documentation. Redirecting to [Documentation Home Page](/) in 5 seconds. diff --git a/docs/Gemfile b/docs/Gemfile deleted file mode 100644 index 7dfc2ae0e3a4c..0000000000000 --- a/docs/Gemfile +++ /dev/null @@ -1,35 +0,0 @@ -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -source 'https://rubygems.org' - -ruby '>= 2.4.0' - -gem 'jekyll', '4.0.1' -gem 'addressable', '2.7.0' -gem 'octokit', '4.14.0' -gem 'therubyracer', '0.12.3' -gem 'json', '2.2.0' -gem 'jekyll-multiple-languages', '2.0.3' -gem 'jekyll-paginate', '1.1.0' -gem 'liquid-c', '4.0.0' # speed-up site generation -gem 'sassc', '2.2.1' # speed-up site generation - -# group :jekyll_plugins do -# gem 'hawkins' -# end diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock deleted file mode 100644 index 7d55d5cc6d63c..0000000000000 --- a/docs/Gemfile.lock +++ /dev/null @@ -1,96 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - addressable (2.7.0) - public_suffix (>= 2.0.2, < 5.0) - colorator (1.1.0) - concurrent-ruby (1.1.6) - em-websocket (0.5.1) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - eventmachine (1.2.7) - faraday (0.17.0) - multipart-post (>= 1.2, < 3) - ffi (1.11.2) - forwardable-extended (2.6.0) - http_parser.rb (0.6.0) - i18n (1.8.3) - concurrent-ruby (~> 1.0) - jekyll (4.0.1) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (>= 0.9.5, < 2) - jekyll-sass-converter (~> 2.0) - jekyll-watch (~> 2.0) - kramdown (~> 2.1) - kramdown-parser-gfm (~> 1.0) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (~> 3.0) - safe_yaml (~> 1.0) - terminal-table (~> 1.8) - jekyll-multiple-languages (2.0.3) - jekyll-paginate (1.1.0) - jekyll-sass-converter (2.1.0) - sassc (> 2.0.1, < 3.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - json (2.2.0) - kramdown (2.2.1) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - libv8 (3.16.14.19) - liquid (4.0.3) - liquid-c (4.0.0) - liquid (>= 3.0.0) - listen (3.2.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - multipart-post (2.1.1) - octokit (4.14.0) - sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.1) - rb-fsevent (0.10.4) - rb-inotify (0.10.1) - ffi (~> 1.0) - ref (2.0.0) - rexml (3.2.4) - rouge (3.20.0) - safe_yaml (1.0.5) - sassc 
(2.2.1) - ffi (~> 1.9) - sawyer (0.8.2) - addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - therubyracer (0.12.3) - libv8 (~> 3.16.14.15) - ref - unicode-display_width (1.7.0) - -PLATFORMS - ruby - -DEPENDENCIES - addressable (= 2.7.0) - jekyll (= 4.0.1) - jekyll-multiple-languages (= 2.0.3) - jekyll-paginate (= 1.1.0) - json (= 2.2.0) - liquid-c (= 4.0.0) - octokit (= 4.14.0) - sassc (= 2.2.1) - therubyracer (= 0.12.3) - -RUBY VERSION - ruby 2.6.3p62 - -BUNDLED WITH - 1.17.2 diff --git a/docs/README.md b/docs/README.md index 509ba6db825ef..7aab7b52b8452 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,55 +6,23 @@ https://flink.apache.org/ is also generated from the files found here. # Requirements -The dependencies are declared in the Gemfile in this directory. We use Markdown -to write and Jekyll to translate the documentation to static HTML. All required -dependencies are installed locally when you build the documentation through the -`build_docs.sh` script. If you want to install the software manually, use Ruby's -Bundler Gem to install all dependencies: +### Build the site locally - gem install bundler -v 1.16.1 - bundle install +Make sure you have installed [Hugo](https://gohugo.io/getting-started/installing/) on your +system. To build the Flink docs, you need the *extended version* of Hugo with Sass/SCSS support. -Note that in Ubuntu based systems, it may be necessary to install the following -packages: `rubygems ruby-dev libssl-dev build-essential`. +From this directory: -# Using Dockerized Jekyll + * Fetch the theme submodule + ```sh + git submodule update --init --recursive + ``` + * Start local server + ```sh + hugo -b "" serve + ``` -We dockerized the jekyll environment above. If you have [docker](https://docs.docker.com/), -you can run the following command to start the container. - -``` -cd flink/docs/docker -./run.sh -``` - -It takes a few moments to build the image for the first time but will be a second from the second time. -The run.sh command brings you in a bash session where you run the `./build_docs.sh` script mentioned above. - - -# Build - -The `docs/build_docs.sh` script installs dependencies locally, calls Jekyll, and -generates the documentation in `docs/content`. You can then point your browser -to `docs/content/index.html` and start reading. - -If you call the script with the preview flag `build_docs.sh -p`, Jekyll will -start a web server at `localhost:4000` and watch the docs directory for -updates. Use this mode to preview changes locally. - -You can call the script with the incremental flag `build_docs.sh -i`. -Jekyll will then serve a live preview at `localhost:4000`, -and it will be much faster because it will only rebuild the pages corresponding -to files that are modified. Note that if you are making changes that affect -the sidebar navigation, you'll have to build the entire site to see -those changes reflected on every page. - -| Flag | Action | -| -----| -------| -| -p | Run interactive preview | -| -i | Incremental builds | -| -e | Build only English docs | -| -z | Build only Chinese docs | +The site can be viewed at http://localhost:1313/ ## Generate configuration tables @@ -64,11 +32,11 @@ Configuration descriptions are auto generated from code. To trigger the generati mvn -Pgenerate-config-docs install ``` -The resulting html files will be written to `_includes/generated`. Tables are regenerated each time the command is invoked. 
+The resulting HTML files will be written to `layouts/shortcodes/generated`. Tables are regenerated each time the command is invoked. These tables can be directly included in the documentation: ``` -{% include generated/file_name.html %} +{{< generated/file_name >}} ``` # Contribute @@ -85,11 +53,13 @@ In addition to Markdown, every page contains a Jekyll front matter, which specif title: "Title of the Page" --- -Furthermore, you can access the variables found in `docs/_config.yml` as follows: - - {{ site.NAME }} - -This will be replaced with the value of the variable called `NAME` when generating the docs. + --- + title: "Title of the Page" <-- Title rendered in the side nav + weight: 1 <-- Weight controls the ordering of pages in the side nav. + type: docs <-- required + aliases: <-- Alias to set up a redirect from a removed page to this one + - /alias/to/removed/page.html + --- ## Structure @@ -100,8 +70,8 @@ This will be replaced with the value of the variable called `NAME` when generati All documents are structured with headings. From these headings, you can automatically generate a page table of contents (see below). ``` -# Level-1 Heading <- Used for the title of the page (don't use this) -## Level-2 Heading <- Start with this one +# Level-1 Heading <- Used for the title of the page +## Level-2 Heading <- Start with this one for content ### Level-3 heading #### Level-4 heading ##### Level-5 heading @@ -111,47 +81,149 @@ Please stick to the "logical order" when using the headlines, e.g. start with le #### Table of Contents - * This will be replaced by the TOC - {:toc} +A table of contents is added automatically to every page, based on heading levels 2 - 4. +The ToC can be omitted by adding the following to the front matter of the page: + + --- + bookToc: false + --- + +### ShortCodes + +Flink uses [shortcodes](https://gohugo.io/content-management/shortcodes/) to add custom functionality +to its documentation markdown. The following are available for use: +#### Flink Artifact -Add this markup (both lines) to the document in order to generate a table of contents for the page. Headings until level 3 headings are included. + {{< artifact flink-streaming-java withScalaVersion >}} -You can exclude a heading from the table of contents: +This will be replaced by the Maven artifact for flink-streaming-java that users should copy into their pom.xml file. It will render out to: - # Excluded heading - {:.no_toc} +```xml +<dependency> +    <groupId>org.apache.flink</groupId> +    <artifactId>flink-streaming-java_2.11</artifactId> +    <version><!-- current Flink version --></version> +</dependency> +``` + +It includes a number of optional flags: + +* withScalaVersion: Appends the Scala version suffix to the artifact id +* withTestScope: Adds the `test` scope to the dependency. Useful for marking test dependencies. +* withTestClassifier: Adds the `tests` classifier. Useful when users should pull in Flink's test dependencies. This is mostly for the test harnesses and probably not what you want. #### Back to Top - {% top %} + {{< top >}} This will be replaced by a back to top link. It is recommended to use these links at least at the end of each level-2 section. -#### Labels +#### Info Hints + + {{< hint info >}} + Some interesting information + {{< /hint >}} + +The hint will be rendered in a blue box. This hint is useful when providing +additional information for the user that does not fit into the flow of the documentation. + +#### Info Warning + + {{< hint warning >}} + Something to watch out for. + {{< /hint >}} + +The hint will be rendered in a yellow box.
This hint is useful when highlighting +information users should watch out for to prevent errors. + +#### Info Danger + + {{< hint danger >}} + Something to avoid + {{< /hint >}} + +The hint will be rendered in a red box. This hint is useful when highlighting +information users need to know to avoid data loss or to point out broken +functionality. + +#### Label + + {{< label "My Label" >}} + +The label will be rendered in an inline blue box. This is useful for labeling functionality +such as whether a SQL feature works for only batch or streaming execution. + +#### Flink version + + {{< version >}} + +Interpolates the current Flink version. + +#### Scala Version + + {{< scala_version >}} + +Interpolates the default Scala version. + +#### Stable + + {{< stable >}} + Some content + {{< /stable >}} + +This shortcode will only render its content if the site is marked as stable. + +#### Unstable - {% info %} - {% warn %} + {{< unstable >}} + Some content + {{< /unstable >}} + +This shortcode will only render its content if the site is marked as unstable. -These will be replaced by an info or warning label. You can change the text of the label by providing an argument: +#### Query State Warning - {% info Recommendation %} + {{< query_state_warning >}} + +Will render a warning that the current SQL feature may have unbounded state requirements. -### Documentation +#### Tabs -#### Navigation + {{< tabs "sometab" >}} + {{< tab "Java" >}} + ```java + System.out.println("Hello World!"); + ``` + {{< /tab >}} + {{< tab "Scala" >}} + ```scala + println("Hello World!"); + ``` + {{< /tab >}} + {{< /tabs >}} + +Prints the content in tabs. IMPORTANT: The label in the outermost "tabs" shortcode must +be unique for the page. -The navigation on the left side of the docs is automatically generated when building the docs. You can modify the markup in `_include/sidenav.html`. +#### GitHub Repo -The structure of the navigation is determined by the front matter of all pages. The fields used to determine the structure are: + {{< github_repo >}} + +Renders a link to the Apache Flink repo. -- `nav-id` => ID of this page. Other pages can use this ID as their parent ID. -- `nav-parent_id` => ID of the parent. This page will be listed under the page with id `nav-parent_id`. +#### GitHub Link -Level 0 is made up of all pages, which have nav-parent_id set to `root`. There is no limitation on how many levels you can nest. + {{< gh_link file="/some/file.java" name="Some file" >}} + +Renders a link to a file in the Apache Flink repo with a given name. + +#### JavaDocs Link + {{< javadoc file="some/file" name="Some file" >}} -The `title` of the page is used as the default link text. You can override this via `nav-title`. The relative position per navigational level is determined by `nav-pos`. +Renders a link to a file in the Apache Flink Java Documentation. -If you have a page with sub pages, the link target will be used to expand the sub level navigation. If you want to actually add a link to the page as well, you can add the `nav-show_overview: true` field to the front matter. This will then add an `Overview` sub page to the expanded list. -The nesting is also used for the breadcrumbs like `Application Development > Libraries > Machine Learning > Optimization`. +#### PythonDocs Link + {{< pythondoc file="some/file" name="Some file" >}} +Renders a link to a file in the Apache Flink Python Documentation.
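For reference alongside the README changes above, here is a hedged sketch of a one-off (non-serve) build. Hugo writing its output to `public/` and its asset cache to `resources/` is standard Hugo behavior and matches the new entries in `docs/.gitignore`; the exact flags used by Flink's CI are not shown in this diff:

```sh
cd docs
# Fetch the theme first (see the README instructions above)
git submodule update --init --recursive
# One-off static build; output lands in docs/public/, generated assets in docs/resources/
hugo -b ""
```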
diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 019b0345f6d0d..0000000000000 --- a/docs/_config.yml +++ /dev/null @@ -1,112 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#------------------------------------------------------------------------------ -# VARIABLES -#------------------------------------------------------------------------------ -# Variables specified in this file can be used in the documentation via: -# {{ site.CONFIG_KEY }} -#------------------------------------------------------------------------------ - -# This are the version referenced in the docs. Please only use these variables -# to reference a specific Flink version, because this is the only place where -# we change the version for the complete docs when forking of a release branch -# etc. -# The full version string as referenced in Maven (e.g. 1.2.1) -version: "1.13-SNAPSHOT" -# For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot -# release this should be the same as the regular version -version_title: "1.13-SNAPSHOT" -# Branch on Github for this version -github_branch: "master" - -# Plain Scala version is needed for e.g. the Gradle quickstart. -scala_version: "2.11" -# This suffix is appended to the Scala-dependent Maven artifact names -scala_version_suffix: "_2.11" - -# Some commonly linked pages (this was more important to have as a variable -# during incubator; by now it should also be fine to hardcode these.) -website_url: "https://flink.apache.org" -jira_url: "https://issues.apache.org/jira/browse/FLINK" -github_url: "https://github.com/apache/flink" -download_url: "https://flink.apache.org/downloads.html" -zh_download_url: "https://flink.apache.org/zh/downloads.html" - -# please use a protocol relative URL here -baseurl: //ci.apache.org/projects/flink/flink-docs-master -stable_baseurl: //ci.apache.org/projects/flink/flink-docs-stable - -javadocs_baseurl: //ci.apache.org/projects/flink/flink-docs-master -pythondocs_baseurl: //ci.apache.org/projects/flink/flink-docs-master - -statefundocs_baseurl: //ci.apache.org/projects/flink/flink-statefun-docs-master -statefundocs_stable_baseurl: //ci.apache.org/projects/flink/flink-statefun-docs-stable - -# Flag whether this is a stable version or not. Used for the quickstart page. -is_stable: false - -# Flag to indicate whether an outdated warning should be shown. 
-show_outdated_warning: false - -previous_docs: - '1.12': http://ci.apache.org/projects/flink/flink-docs-release-1.12 - '1.11': http://ci.apache.org/projects/flink/flink-docs-release-1.11 - '1.10': http://ci.apache.org/projects/flink/flink-docs-release-1.10 - '1.9': http://ci.apache.org/projects/flink/flink-docs-release-1.9 - '1.8': http://ci.apache.org/projects/flink/flink-docs-release-1.8 - '1.7': http://ci.apache.org/projects/flink/flink-docs-release-1.7 - '1.6': http://ci.apache.org/projects/flink/flink-docs-release-1.6 - '1.5': http://ci.apache.org/projects/flink/flink-docs-release-1.5 - '1.4': http://ci.apache.org/projects/flink/flink-docs-release-1.4 - '1.3': http://ci.apache.org/projects/flink/flink-docs-release-1.3 - '1.2': http://ci.apache.org/projects/flink/flink-docs-release-1.2 - '1.1': http://ci.apache.org/projects/flink/flink-docs-release-1.1 - '1.0': http://ci.apache.org/projects/flink/flink-docs-release-1.0 - -#------------------------------------------------------------------------------ -# BUILD CONFIG -#------------------------------------------------------------------------------ -# These variables configure the jekyll build (./build_docs.sh). You don't need -# to change anything here. -#------------------------------------------------------------------------------ - -exclude: - - "build_docs.sh" - - "check_links.sh" - - "spider.log" - -# Used in some documents to initialize arrays. Don't delete. -array: [] - -defaults: - - - scope: - path: "" - values: - layout: plain - nav-pos: 99999 # Move to end if no pos specified - -host: 0.0.0.0 - -kramdown: - toc_levels: 1..3 # Include h1-h3 for ToC - -# The all languages used -languages: ['en', 'zh'] - -plugins: ['jekyll-paginate', 'jekyll-multiple-languages'] diff --git a/docs/_config_dev_en.yml b/docs/_config_dev_en.yml deleted file mode 100644 index eb11fbf39f319..0000000000000 --- a/docs/_config_dev_en.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License - -exclude: - - "*.zh.md" - - "build_docs.sh" - - "check_links.sh" - - "content" - - "content_en" - - "content_zh" diff --git a/docs/_config_dev_zh.yml b/docs/_config_dev_zh.yml deleted file mode 100644 index 8b9ddeb88caaf..0000000000000 --- a/docs/_config_dev_zh.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License - -exclude: - - "*.md" - - "build_docs.sh" - - "check_links.sh" - - "content" - - "content_en" - - "content_zh" - -include: - - "*.zh.md" diff --git a/docs/_data/sql-connectors.yml b/docs/_data/sql-connectors.yml deleted file mode 100644 index 46a94256a6482..0000000000000 --- a/docs/_data/sql-connectors.yml +++ /dev/null @@ -1,154 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License - -# INSTRUCTIONS: -# -# In order to add a new connector/format add a new entry to this file. -# You need specify a name that will be used in e.g. the description of the connector/format and -# a category (either "format" or "connector"). The category determines which table will the entry -# end up in on the Download page. The "maven" parameter describes the name of the maven module. The -# three parameters are required. -# -# If you specify "built-in=true" the corresponding table on the connector/format will not contain -# a link, but just a "Built-in" entry. If the built-in is set to true you do not need to provide the -# sql-url. -# -# If a connector comes with different versions for the external system, you can put those under a -# "versions" property. Each entry in the "versions" section should have a "version", which -# determines name for the version and "maven" and "sql-url" entries for that particular version. -# If you use the "versions" property, "maven" and "sql-url" should not be present in the top level -# section of the connector. (Multiple versions are supported only for the connector for now. If you -# need multiple versions support for formats, please update downloads.md) -# -# NOTE: You can use liquid variables in "sql-url" and "maven" properties. 
- -avro: - name: Avro - maven: flink-sql-avro - category: format - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro/{{site.version}}/flink-sql-avro-{{site.version}}.jar - -avro-confluent: - name: Avro Schema Registry - maven: flink-avro-confluent-registry - category: format - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro-confluent-registry/{{site.version}}/flink-sql-avro-confluent-registry-{{site.version}}.jar - -orc: - name: ORC - maven: flink-orc{{site.scala_version_suffix}} - category: format - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-orc{{site.scala_version_suffix}}/{{site.version}}/flink-sql-orc{{site.scala_version_suffix}}-{{site.version}}.jar - -parquet: - name: Parquet - maven: flink-parquet{{site.scala_version_suffix}} - category: format - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-parquet{{site.scala_version_suffix}}/{{site.version}}/flink-sql-parquet{{site.scala_version_suffix}}-{{site.version}}.jar - -debezium-avro-confluent: - name: Debezium - maven: flink-avro-confluent-registry - category: format - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro-confluent-registry/{{site.version}}/flink-sql-avro-confluent-registry-{{site.version}}.jar - -debezium-json: - name: Debezium - maven: flink-json - category: format - built-in: true - -canal: - name: Canal - maven: flink-json - category: format - built-in: true - -maxwell: - name: Maxwell - maven: flink-json - category: format - built-in: true - -csv: - name: CSV - maven: flink-csv - category: format - built-in: true - -json: - name: Json - maven: flink-json - category: format - built-in: true - -raw: - name: RAW - maven: - category: format - built-in: true - -elastic: - name: Elasticsearch - category: connector - versions: - - version: 6.x - maven: flink-connector-elasticsearch6{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch6{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-elasticsearch6{{site.scala_version_suffix}}-{{site.version}}.jar - - version: 7.x and later versions - maven: flink-connector-elasticsearch7{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-elasticsearch7{{site.scala_version_suffix}}-{{site.version}}.jar - -hbase: - name: HBase - category: connector - versions: - - version: 1.4.x - maven: flink-connector-hbase-1.4{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hbase-1.4{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-hbase-1.4{{site.scala_version_suffix}}-{{site.version}}.jar - - version: 2.2.x - maven: flink-connector-hbase-2.2{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hbase-2.2{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-hbase-2.2{{site.scala_version_suffix}}-{{site.version}}.jar - -jdbc: - name: JDBC - category: connector - maven: flink-connector-jdbc{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc{{site.scala_version_suffix}}/{{site.version}}/flink-connector-jdbc{{site.scala_version_suffix}}-{{site.version}}.jar - -kafka: - name: Kafka - category: connector - versions: - - 
version: universal - maven: flink-connector-kafka{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kafka{{site.scala_version_suffix}}-{{site.version}}.jar - -upsert-kafka: - name: Upsert Kafka - category: connector - versions: - - version: universal - maven: flink-connector-kafka{{site.scala_version_suffix}} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kafka{{site.scala_version_suffix}}-{{site.version}}.jar - -kinesis: - name: Kinesis - category: connector - maven: flink-connector-kinesis{{ site.scala_version_suffix }} - sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kinesis{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kinesis{{site.scala_version_suffix}}-{{site.version}}.jar - diff --git a/docs/_includes/generated/all_jobmanager_section.html b/docs/_includes/generated/all_jobmanager_section.html deleted file mode 100644 index 3cf1e6116c14f..0000000000000 --- a/docs/_includes/generated/all_jobmanager_section.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
jobmanager.archive.fs.dir
(none)StringDictionary for JobManager to store the archives of completed jobs.
jobmanager.execution.attempts-history-size
16IntegerThe maximum number of prior execution attempts kept in history.
jobmanager.execution.failover-strategy
"region"StringThis option specifies how the job computation recovers from task failures. Accepted values are:
  • 'full': Restarts all tasks to recover the job.
  • 'region': Restarts all tasks that could be affected by the task failure. More details can be found here.
jobmanager.retrieve-taskmanager-hostname
trueBooleanFlag indicating whether JobManager would retrieve canonical host name of TaskManager during registration. If the option is set to "false", TaskManager registration with JobManager could be faster, since no reverse DNS lookup is performed. However, local input split assignment (such as for HDFS files) may be impacted.
jobmanager.rpc.address
(none)StringThe config parameter defining the network address to connect to for communication with the job manager. This value is only interpreted in setups where a single JobManager with static name or address exists (simple standalone setups, or container setups with dynamic service name resolution). It is not used in many high-availability setups, when a leader-election service (like ZooKeeper) is used to elect and discover the JobManager leader from potentially multiple standby JobManagers.
jobmanager.rpc.port
6123IntegerThe config parameter defining the network port to connect to for communication with the job manager. Like jobmanager.rpc.address, this value is only interpreted in setups where a single JobManager with static name/address and port exists (simple standalone setups, or container setups with dynamic service name resolution). This config option is not used in many high-availability setups, when a leader-election service (like ZooKeeper) is used to elect and discover the JobManager leader from potentially multiple standby JobManagers.
jobstore.cache-size
52428800LongThe job store cache size in bytes which is used to keep completed jobs in memory.
jobstore.expiration-time
3600LongThe time in seconds after which a completed job expires and is purged from the job store.
jobstore.max-capacity
2147483647IntegerThe max number of completed jobs that can be kept in the job store.
diff --git a/docs/_includes/generated/common_state_backends_section.html b/docs/_includes/generated/common_state_backends_section.html deleted file mode 100644 index c24b0ceb915da..0000000000000 --- a/docs/_includes/generated/common_state_backends_section.html +++ /dev/null @@ -1,54 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
state.backend
(none)StringThe state backend to be used to store and checkpoint state.
state.checkpoints.dir
(none)StringThe default directory used for storing the data files and meta data of checkpoints in a Flink supported filesystem. The storage path must be accessible from all participating processes/nodes(i.e. all TaskManagers and JobManagers).
state.savepoints.dir
(none)StringThe default directory for savepoints. Used by the state backends that write savepoints to file systems (MemoryStateBackend, FsStateBackend, RocksDBStateBackend).
state.backend.incremental
falseBooleanOption whether the state backend should create incremental checkpoints, if possible. For an incremental checkpoint, only a diff from the previous checkpoint is stored, rather than the complete checkpoint state. Once enabled, the state size shown in web UI or fetched from rest API only represents the delta checkpoint size instead of full checkpoint size. Some state backends may not support incremental checkpoints and ignore this option.
state.backend.local-recovery
falseBooleanThis option configures local recovery for this state backend. By default, local recovery is deactivated. Local recovery currently only covers keyed state backends. Currently, MemoryStateBackend does not support local recovery and ignore this option.
state.checkpoints.num-retained
1IntegerThe maximum number of completed checkpoints to retain.
taskmanager.state.local.root-dirs
(none)StringThe config parameter defining the root directories for storing file-based state for local recovery. Local recovery currently only covers keyed state backends. Currently, MemoryStateBackend does not support local recovery and ignore this option
diff --git a/docs/_includes/generated/execution_checkpointing_configuration.html b/docs/_includes/generated/execution_checkpointing_configuration.html deleted file mode 100644 index 719e4b9510b34..0000000000000 --- a/docs/_includes/generated/execution_checkpointing_configuration.html +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
execution.checkpointing.alignment-timeout
0 msDurationOnly relevant if `execution.checkpointing.unaligned` is enabled.

If timeout is 0, checkpoints will always start unaligned.

If timeout has a positive value, checkpoints will start aligned. If during checkpointing, checkpoint start delay exceeds this timeout, alignment will timeout and checkpoint barrier will start working as unaligned checkpoint.
execution.checkpointing.externalized-checkpoint-retention
(none)

Enum

Possible values: [DELETE_ON_CANCELLATION, RETAIN_ON_CANCELLATION]
Externalized checkpoints write their meta data out to persistent storage and are not automatically cleaned up when the owning job fails or is suspended (terminating with job status `JobStatus#FAILED` or `JobStatus#SUSPENDED`. In this case, you have to manually clean up the checkpoint state, both the meta data and actual program state.

The mode defines how an externalized checkpoint should be cleaned up on job cancellation. If you choose to retain externalized checkpoints on cancellation you have to handle checkpoint clean up manually when you cancel the job as well (terminating with job status `JobStatus#CANCELED`).

The target directory for externalized checkpoints is configured via `state.checkpoints.dir`.
execution.checkpointing.interval
(none)DurationGets the interval in which checkpoints are periodically scheduled.

This setting defines the base interval. Checkpoint triggering may be delayed by the settings `execution.checkpointing.max-concurrent-checkpoints` and `execution.checkpointing.min-pause`
execution.checkpointing.max-concurrent-checkpoints
1IntegerThe maximum number of checkpoint attempts that may be in progress at the same time. If this value is n, then no checkpoints will be triggered while n checkpoint attempts are currently in flight. For the next checkpoint to be triggered, one checkpoint attempt would need to finish or expire.
execution.checkpointing.min-pause
0 msDurationThe minimal pause between checkpointing attempts. This setting defines how soon thecheckpoint coordinator may trigger another checkpoint after it becomes possible to triggeranother checkpoint with respect to the maximum number of concurrent checkpoints(see `execution.checkpointing.max-concurrent-checkpoints`).

If the maximum number of concurrent checkpoints is set to one, this setting makes effectively sure that a minimum amount of time passes where no checkpoint is in progress at all.
execution.checkpointing.mode
EXACTLY_ONCE

Enum

Possible values: [EXACTLY_ONCE, AT_LEAST_ONCE]
The checkpointing mode (exactly-once vs. at-least-once).
execution.checkpointing.prefer-checkpoint-for-recovery
falseBooleanIf enabled, a job recovery should fallback to checkpoint when there is a more recent savepoint.
execution.checkpointing.timeout
10 minDurationThe maximum time that a checkpoint may take before being discarded.
execution.checkpointing.tolerable-failed-checkpoints
(none)IntegerThe tolerable checkpoint failure number. If set to 0, that means we do not tolerance any checkpoint failure.
execution.checkpointing.unaligned
falseBooleanEnables unaligned checkpoints, which greatly reduce checkpointing times under backpressure.

Unaligned checkpoints contain data stored in buffers as part of the checkpoint state, which allows checkpoint barriers to overtake these buffers. Thus, the checkpoint duration becomes independent of the current throughput as checkpoint barriers are effectively not embedded into the stream of data anymore.

Unaligned checkpoints can only be enabled if `execution.checkpointing.mode` is `EXACTLY_ONCE` and if `execution.checkpointing.max-concurrent-checkpoints` is 1
diff --git a/docs/_includes/generated/expert_high_availability_zk_section.html b/docs/_includes/generated/expert_high_availability_zk_section.html deleted file mode 100644 index d7774e22560e9..0000000000000 --- a/docs/_includes/generated/expert_high_availability_zk_section.html +++ /dev/null @@ -1,84 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
high-availability.zookeeper.client.acl
"open"StringDefines the ACL (open|creator) to be configured on ZK node. The configuration value can be set to “creator” if the ZooKeeper server configuration has the “authProvider” property mapped to use SASLAuthenticationProvider and the cluster is configured to run in secure mode (Kerberos).
high-availability.zookeeper.client.connection-timeout
15000IntegerDefines the connection timeout for ZooKeeper in ms.
high-availability.zookeeper.client.max-retry-attempts
3IntegerDefines the number of connection retries before the client gives up.
high-availability.zookeeper.client.retry-wait
5000IntegerDefines the pause between consecutive retries in ms.
high-availability.zookeeper.client.session-timeout
60000IntegerDefines the session timeout for the ZooKeeper session in ms.
high-availability.zookeeper.path.checkpoint-counter
"/checkpoint-counter"StringZooKeeper root path (ZNode) for checkpoint counters.
high-availability.zookeeper.path.checkpoints
"/checkpoints"StringZooKeeper root path (ZNode) for completed checkpoints.
high-availability.zookeeper.path.jobgraphs
"/jobgraphs"StringZooKeeper root path (ZNode) for job graphs
high-availability.zookeeper.path.latch
"/leaderlatch"StringDefines the znode of the leader latch which is used to elect the leader.
high-availability.zookeeper.path.leader
"/leader"StringDefines the znode of the leader which contains the URL to the leader and the current leader session ID.
high-availability.zookeeper.path.mesos-workers
"/mesos-workers"StringThe ZooKeeper root path for persisting the Mesos worker information.
high-availability.zookeeper.path.running-registry
"/running_job_registry/"String
diff --git a/docs/_includes/generated/expert_rest_section.html b/docs/_includes/generated/expert_rest_section.html deleted file mode 100644 index ab1a70cb67e99..0000000000000 --- a/docs/_includes/generated/expert_rest_section.html +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
rest.await-leader-timeout
30000LongThe time in ms that the client waits for the leader address, e.g., Dispatcher or WebMonitorEndpoint
rest.client.max-content-length
104857600IntegerThe maximum content length in bytes that the client will handle.
rest.connection-timeout
15000LongThe maximum time in ms for the client to establish a TCP connection.
rest.idleness-timeout
300000LongThe maximum time in ms for a connection to stay idle before failing.
rest.retry.delay
3000LongThe time in ms that the client waits between retries (See also `rest.retry.max-attempts`).
rest.retry.max-attempts
20IntegerThe number of retries the client will attempt if a retryable operations fails.
rest.server.max-content-length
104857600IntegerThe maximum content length in bytes that the server will handle.
rest.server.numThreads
4IntegerThe number of threads for the asynchronous processing of requests.
rest.server.thread-priority
5IntegerThread priority of the REST server's executor for processing asynchronous requests. Lowering the thread priority will give Flink's main components more CPU time whereas increasing will allocate more time for the REST server's processing.
diff --git a/docs/_includes/generated/expert_scheduling_section.html b/docs/_includes/generated/expert_scheduling_section.html deleted file mode 100644 index 9020268e0e869..0000000000000 --- a/docs/_includes/generated/expert_scheduling_section.html +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
cluster.evenly-spread-out-slots
falseBooleanEnable the slot spread out allocation strategy. This strategy tries to spread out the slots evenly across all available `TaskExecutors`.
slot.idle.timeout
50000LongThe timeout in milliseconds for a idle slot in Slot Pool.
slot.request.timeout
300000LongThe timeout in milliseconds for requesting a slot from Slot Pool.
slotmanager.number-of-slots.max
2147483647IntegerDefines the maximum number of slots that the Flink cluster allocates. This configuration option is meant for limiting the resource consumption for batch workloads. It is not recommended to configure this option for streaming workloads, which may fail if there are not enough slots. Note that this configuration option does not take effect for standalone clusters, where how many slots are allocated is not controlled by Flink.
diff --git a/docs/_includes/generated/expert_security_ssl_section.html b/docs/_includes/generated/expert_security_ssl_section.html deleted file mode 100644 index 290667823bca9..0000000000000 --- a/docs/_includes/generated/expert_security_ssl_section.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
security.ssl.internal.close-notify-flush-timeout
-1IntegerThe timeout (in ms) for flushing the `close_notify` that was triggered by closing a channel. If the `close_notify` was not flushed in the given timeout the channel will be closed forcibly. (-1 = use system default)
security.ssl.internal.handshake-timeout
-1IntegerThe timeout (in ms) during SSL handshake. (-1 = use system default)
security.ssl.internal.session-cache-size
-1IntegerThe size of the cache used for storing SSL session objects. According to here, you should always set this to an appropriate number to not run into a bug with stalling IO threads during garbage collection. (-1 = use system default).
security.ssl.internal.session-timeout
-1IntegerThe timeout (in ms) for the cached SSL session objects. (-1 = use system default)
security.ssl.provider
"JDK"StringThe SSL engine provider to use for the ssl transport:
  • `JDK`: default Java-based SSL engine
  • `OPENSSL`: openSSL-based SSL engine using system libraries
`OPENSSL` is based on netty-tcnative and comes in two flavours:
  • dynamically linked: This will use your system's openSSL libraries (if compatible) and requires `opt/flink-shaded-netty-tcnative-dynamic-*.jar` to be copied to `lib/`
  • statically linked: Due to potential licensing issues with openSSL (see LEGAL-393), we cannot ship pre-built libraries. However, you can build the required library yourself and put it into `lib/`:
    `git clone https://github.com/apache/flink-shaded.git && cd flink-shaded && mvn clean package -Pinclude-netty-tcnative-static -pl flink-shaded-netty-tcnative-static`
diff --git a/docs/_includes/generated/expert_state_backends_section.html b/docs/_includes/generated/expert_state_backends_section.html deleted file mode 100644 index 0fed8674dbb67..0000000000000 --- a/docs/_includes/generated/expert_state_backends_section.html +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
state.backend.async
trueBooleanOption whether the state backend should use an asynchronous snapshot method where possible and configurable. Some state backends may not support asynchronous snapshots, or only support asynchronous snapshots, and ignore this option.
state.backend.fs.memory-threshold
20 kbMemorySizeThe minimum size of state data files. All state chunks smaller than that are stored inline in the root checkpoint metadata file. The max memory threshold for this configuration is 1MB.
state.backend.fs.write-buffer-size
4096IntegerThe default size of the write buffer for the checkpoint streams that write to file systems. The actual write buffer size is determined to be the maximum of the value of this option and option 'state.backend.fs.memory-threshold'.
diff --git a/docs/_includes/generated/exponential_delay_restart_strategy_configuration.html b/docs/_includes/generated/exponential_delay_restart_strategy_configuration.html deleted file mode 100644 index a0b51c700a776..0000000000000 --- a/docs/_includes/generated/exponential_delay_restart_strategy_configuration.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
restart-strategy.exponential-delay.backoff-multiplier
2.0DoubleBackoff value is multiplied by this value after every failure,until max backoff is reached if `restart-strategy` has been set to `exponential-delay`.
restart-strategy.exponential-delay.initial-backoff
1 sDurationStarting duration between restarts if `restart-strategy` has been set to `exponential-delay`. It can be specified using notation: "1 min", "20 s"
restart-strategy.exponential-delay.jitter-factor
0.1DoubleJitter specified as a portion of the backoff if `restart-strategy` has been set to `exponential-delay`. It represents how large random value will be added or subtracted to the backoff. Useful when you want to avoid restarting multiple jobs at the same time.
restart-strategy.exponential-delay.max-backoff
5 minDurationThe highest possible duration between restarts if `restart-strategy` has been set to `exponential-delay`. It can be specified using notation: "1 min", "20 s"
restart-strategy.exponential-delay.reset-backoff-threshold
1 hDurationThreshold when the backoff is reset to its initial value if `restart-strategy` has been set to `exponential-delay`. It specifies how long the job must be running without failure to reset the exponentially increasing backoff to its initial value. It can be specified using notation: "1 min", "20 s"
diff --git a/docs/_includes/generated/failure_rate_restart_strategy_configuration.html b/docs/_includes/generated/failure_rate_restart_strategy_configuration.html deleted file mode 100644 index 93bcd22bbdf00..0000000000000 --- a/docs/_includes/generated/failure_rate_restart_strategy_configuration.html +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
restart-strategy.failure-rate.delay
1 sDurationDelay between two consecutive restart attempts if `restart-strategy` has been set to `failure-rate`. It can be specified using notation: "1 min", "20 s"
restart-strategy.failure-rate.failure-rate-interval
1 minDurationTime interval for measuring failure rate if `restart-strategy` has been set to `failure-rate`. It can be specified using notation: "1 min", "20 s"
restart-strategy.failure-rate.max-failures-per-interval
1IntegerMaximum number of restarts in given time interval before failing a job if `restart-strategy` has been set to `failure-rate`.
diff --git a/docs/_includes/generated/fixed_delay_restart_strategy_configuration.html b/docs/_includes/generated/fixed_delay_restart_strategy_configuration.html deleted file mode 100644 index d4d74fe070135..0000000000000 --- a/docs/_includes/generated/fixed_delay_restart_strategy_configuration.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
restart-strategy.fixed-delay.attempts
1IntegerThe number of times that Flink retries the execution before the job is declared as failed if `restart-strategy` has been set to `fixed-delay`.
restart-strategy.fixed-delay.delay
1 sDurationDelay between two consecutive restart attempts if `restart-strategy` has been set to `fixed-delay`. Delaying the retries can be helpful when the program interacts with external systems where for example connections or pending transactions should reach a timeout before re-execution is attempted. It can be specified using notation: "1 min", "20 s"
diff --git a/docs/_includes/generated/rest_configuration.html b/docs/_includes/generated/rest_configuration.html deleted file mode 100644 index bcef4f6a8ae02..0000000000000 --- a/docs/_includes/generated/rest_configuration.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
KeyDefaultTypeDescription
rest.address
(none)StringThe address that should be used by clients to connect to the server. Attention: This option is respected only if the high-availability configuration is NONE.
rest.await-leader-timeout
30000LongThe time in ms that the client waits for the leader address, e.g., Dispatcher or WebMonitorEndpoint
rest.bind-address
(none)StringThe address that the server binds itself.
rest.bind-port
"8081"StringThe port that the server binds itself. Accepts a list of ports (“50100,50101”), ranges (“50100-50200”) or a combination of both. It is recommended to set a range of ports to avoid collisions when multiple Rest servers are running on the same machine.
rest.client.max-content-length
104857600IntegerThe maximum content length in bytes that the client will handle.
rest.connection-timeout
15000LongThe maximum time in ms for the client to establish a TCP connection.
rest.idleness-timeout
300000LongThe maximum time in ms for a connection to stay idle before failing.
rest.port
8081IntegerThe port that the client connects to. If rest.bind-port has not been specified, then the REST server will bind to this port. Attention: This option is respected only if the high-availability configuration is NONE.
rest.retry.delay
3000LongThe time in ms that the client waits between retries (See also `rest.retry.max-attempts`).
rest.retry.max-attempts
20IntegerThe number of retries the client will attempt if a retryable operations fails.
rest.server.max-content-length
104857600IntegerThe maximum content length in bytes that the server will handle.
rest.server.numThreads
4IntegerThe number of threads for the asynchronous processing of requests.
rest.server.thread-priority
5IntegerThread priority of the REST server's executor for processing asynchronous requests. Lowering the thread priority will give Flink's main components more CPU time whereas increasing will allocate more time for the REST server's processing.
diff --git a/docs/_includes/generated/rest_v1_dispatcher.html b/docs/_includes/generated/rest_v1_dispatcher.html deleted file mode 100644 index 071d1bbb159b3..0000000000000 --- a/docs/_includes/generated/rest_v1_dispatcher.html +++ /dev/null @@ -1,4758 +0,0 @@ - - - - - - - - - - - - - - - - - - - -
/cluster
Verb: DELETEResponse code: 200 OK
Shuts down the cluster
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/config
Verb: GETResponse code: 200 OK
Returns the configuration of the WebUI.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:DashboardConfiguration",
-  "properties" : {
-    "features" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:DashboardConfiguration:Features",
-      "properties" : {
-        "web-submit" : {
-          "type" : "boolean"
-        }
-      }
-    },
-    "flink-revision" : {
-      "type" : "string"
-    },
-    "flink-version" : {
-      "type" : "string"
-    },
-    "refresh-interval" : {
-      "type" : "integer"
-    },
-    "timezone-name" : {
-      "type" : "string"
-    },
-    "timezone-offset" : {
-      "type" : "integer"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/datasets
Verb: GETResponse code: 200 OK
Returns all cluster data sets.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:dataset:ClusterDataSetListResponseBody",
-  "properties" : {
-    "dataSets" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:dataset:ClusterDataSetEntry",
-        "properties" : {
-          "id" : {
-            "type" : "string"
-          },
-          "isComplete" : {
-            "type" : "boolean"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/datasets/delete/:triggerid
Verb: GETResponse code: 200 OK
Returns the status for the delete operation of a cluster data set.
Path parameters
-
    -
  • triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned when the operation was triggered.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationResult",
-  "properties" : {
-    "operation" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationInfo",
-      "properties" : {
-        "failure-cause" : {
-          "type" : "any"
-        }
-      }
-    },
-    "status" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:queue:QueueStatus",
-      "properties" : {
-        "id" : {
-          "type" : "string",
-          "required" : true,
-          "enum" : [ "IN_PROGRESS", "COMPLETED" ]
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/datasets/:datasetid
Verb: DELETEResponse code: 202 Accepted
Triggers the deletion of a cluster data set. This asynchronous operation returns a 'triggerid' that can be used to query its status.
Path parameters
-
    -
  • datasetid - 32-character hexadecimal string value that identifies a cluster data set.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:TriggerResponse",
-  "properties" : {
-    "request-id" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
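The two cluster data set endpoints above pair a trigger call with a status poll. A minimal sketch, assuming the REST endpoint on localhost:8081 and using placeholder IDs:

```sh
# Trigger deletion of a cluster data set; the response contains a "request-id"
# (the trigger ID) of the asynchronous operation.
curl -s -X DELETE http://localhost:8081/datasets/<datasetid>

# Poll the status of the delete operation until "status.id" reports COMPLETED.
curl -s http://localhost:8081/datasets/delete/<triggerid>
```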
- - - - - - - - - - - - - - - - - - - -
/jars
Verb: GETResponse code: 200 OK
Returns a list of all jars previously uploaded via '/jars/upload'.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarListInfo",
-  "properties" : {
-    "address" : {
-      "type" : "string"
-    },
-    "files" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarListInfo:JarFileInfo",
-        "properties" : {
-          "entry" : {
-            "type" : "array",
-            "items" : {
-              "type" : "object",
-              "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarListInfo:JarEntryInfo",
-              "properties" : {
-                "description" : {
-                  "type" : "string"
-                },
-                "name" : {
-                  "type" : "string"
-                }
-              }
-            }
-          },
-          "id" : {
-            "type" : "string"
-          },
-          "name" : {
-            "type" : "string"
-          },
-          "uploaded" : {
-            "type" : "integer"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/jars/upload
Verb: POSTResponse code: 200 OK
Uploads a jar to the cluster. The jar must be sent as multi-part data. Make sure that the "Content-Type" header is set to "application/x-java-archive", as some http libraries do not add the header by default. Using 'curl' you can upload a jar via 'curl -X POST -H "Expect:" -F "jarfile=@path/to/flink-job.jar" http://hostname:port/jars/upload'.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarUploadResponseBody",
-  "properties" : {
-    "filename" : {
-      "type" : "string"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "success" ]
-    }
-  }
-}            
-          
-
-
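A slightly fuller sketch of the upload flow described above, assuming localhost:8081; the jar path is a placeholder.

```sh
# Upload a jar as multipart data. The "filename" field of the response contains
# a path whose last segment is the jar ID used by the other /jars endpoints.
curl -s -X POST -H "Expect:" \
  -F "jarfile=@path/to/flink-job.jar" \
  http://localhost:8081/jars/upload

# List the uploaded jars; the "id" field matches the filename returned above.
curl -s http://localhost:8081/jars
```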
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jars/:jarid
Verb: DELETEResponse code: 200 OK
Deletes a jar previously uploaded via '/jars/upload'.
Path parameters
-
    -
  • jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jars/:jarid/plan
Verb: GETResponse code: 200 OK
Returns the dataflow plan of a job contained in a jar previously uploaded via '/jars/upload'. Program arguments can be passed either via the JSON request (recommended) or via query parameters.
Path parameters
-
    -
  • jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
  • -
-
Query parameters
-
    -
  • program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
  • -
  • programArg (optional): Comma-separated list of program arguments.
  • -
  • entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
  • -
  • parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
  • -
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarPlanRequestBody",
-  "properties" : {
-    "entryClass" : {
-      "type" : "string"
-    },
-    "jobId" : {
-      "type" : "any"
-    },
-    "parallelism" : {
-      "type" : "integer"
-    },
-    "programArgs" : {
-      "type" : "string"
-    },
-    "programArgsList" : {
-      "type" : "array",
-      "items" : {
-        "type" : "string"
-      }
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobPlanInfo",
-  "properties" : {
-    "plan" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jars/:jarid/plan
Verb: POSTResponse code: 200 OK
Returns the dataflow plan of a job contained in a jar previously uploaded via '/jars/upload'. Program arguments can be passed either via the JSON request (recommended) or via query parameters.
Path parameters
-
    -
  • jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
  • -
-
Query parameters
-
    -
  • program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
  • -
  • programArg (optional): Comma-separated list of program arguments.
  • -
  • entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
  • -
  • parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
  • -
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarPlanRequestBody",
-  "properties" : {
-    "entryClass" : {
-      "type" : "string"
-    },
-    "jobId" : {
-      "type" : "any"
-    },
-    "parallelism" : {
-      "type" : "integer"
-    },
-    "programArgs" : {
-      "type" : "string"
-    },
-    "programArgsList" : {
-      "type" : "array",
-      "items" : {
-        "type" : "string"
-      }
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobPlanInfo",
-  "properties" : {
-    "plan" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jars/:jarid/run
Verb: POSTResponse code: 200 OK
Submits a job by running a jar previously uploaded via '/jars/upload'. Program arguments can be passed either via the JSON request (recommended) or via query parameters.
Path parameters
-
    -
  • jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
  • -
-
Query parameters
-
    -
  • allowNonRestoredState (optional): Boolean value that specifies whether the job submission should be rejected if the savepoint contains state that cannot be mapped back to the job.
  • -
  • savepointPath (optional): String value that specifies the path of the savepoint to restore the job from.
  • -
  • program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
  • -
  • programArg (optional): Comma-separated list of program arguments.
  • -
  • entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
  • -
  • parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
  • -
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarRunRequestBody",
-  "properties" : {
-    "allowNonRestoredState" : {
-      "type" : "boolean"
-    },
-    "entryClass" : {
-      "type" : "string"
-    },
-    "jobId" : {
-      "type" : "any"
-    },
-    "parallelism" : {
-      "type" : "integer"
-    },
-    "programArgs" : {
-      "type" : "string"
-    },
-    "programArgsList" : {
-      "type" : "array",
-      "items" : {
-        "type" : "string"
-      }
-    },
-    "savepointPath" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:webmonitor:handlers:JarRunResponseBody",
-  "properties" : {
-    "jobid" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
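A minimal sketch of submitting a job from a previously uploaded jar via the JSON request body recommended above; <jarid>, the entry class, and the program arguments are placeholders, not values from this page.

```sh
# Submit a job from an uploaded jar; every body field shown here is optional.
# entryClass overrides the class defined in the jar manifest.
curl -s -X POST http://localhost:8081/jars/<jarid>/run \
  -H "Content-Type: application/json" \
  -d '{
        "entryClass": "org.example.MyJob",
        "parallelism": 2,
        "programArgsList": ["--input", "/tmp/in", "--output", "/tmp/out"]
      }'
# The response body contains the "jobid" of the submitted job.
```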
- - - - - - - - - - - - - - - - - - - -
/jobmanager/config
Verb: GETResponse code: 200 OK
Returns the cluster configuration.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "array",
-  "items" : {
-    "type" : "object",
-    "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:ClusterConfigurationInfoEntry",
-    "properties" : {
-      "key" : {
-        "type" : "string"
-      },
-      "value" : {
-        "type" : "string"
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/jobmanager/logs
Verb: GETResponse code: 200 OK
Returns the list of log files on the JobManager.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:LogListInfo",
-  "properties" : {
-    "logs" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:LogInfo",
-        "properties" : {
-          "name" : {
-            "type" : "string"
-          },
-          "size" : {
-            "type" : "integer"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobmanager/metrics
Verb: GETResponse code: 200 OK
Provides access to job manager metrics.
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/jobs
Verb: GETResponse code: 200 OK
Returns an overview of all jobs and their current state.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:messages:webmonitor:JobIdsWithStatusOverview",
-  "properties" : {
-    "jobs" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:messages:webmonitor:JobIdsWithStatusOverview:JobIdWithStatus",
-        "properties" : {
-          "id" : {
-            "type" : "any"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "INITIALIZING", "CREATED", "RUNNING", "FAILING", "FAILED", "CANCELLING", "CANCELED", "FINISHED", "RESTARTING", "SUSPENDED", "RECONCILING" ]
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/jobs
Verb: POSTResponse code: 202 Accepted
Submits a job. This call is primarily intended to be used by the Flink client. It expects a multipart/form-data request that consists of file uploads for the serialized JobGraph, jars and distributed cache artifacts, and an attribute named "request" for the JSON payload.
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobSubmitRequestBody",
-  "properties" : {
-    "jobArtifactFileNames" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobSubmitRequestBody:DistributedCacheFile",
-        "properties" : {
-          "entryName" : {
-            "type" : "string"
-          },
-          "fileName" : {
-            "type" : "string"
-          }
-        }
-      }
-    },
-    "jobGraphFileName" : {
-      "type" : "string"
-    },
-    "jobJarFileNames" : {
-      "type" : "array",
-      "items" : {
-        "type" : "string"
-      }
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobSubmitResponseBody",
-  "properties" : {
-    "jobUrl" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/metrics
Verb: GETResponse code: 200 OK
Provides access to aggregated job metrics.
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
  • agg (optional): Comma-separated list of aggregation modes which should be calculated. Available aggregations are: "min, max, sum, avg".
  • -
  • jobs (optional): Comma-separated list of 32-character hexadecimal strings to select specific jobs.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
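A minimal sketch of the aggregated metrics query above, assuming localhost:8081; the metric names and job IDs are examples, not values taken from this page.

```sh
# Fetch min/max/avg aggregates of two metrics across the selected jobs.
curl -s "http://localhost:8081/jobs/metrics?get=uptime,numRestarts&agg=min,max,avg&jobs=<jobid1>,<jobid2>"
```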
- - - - - - - - - - - - - - - - - - - -
/jobs/overview
Verb: GETResponse code: 200 OK
Returns an overview of all jobs.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:messages:webmonitor:MultipleJobsDetails",
-  "properties" : {
-    "jobs" : {
-      "type" : "array",
-      "items" : {
-        "type" : "any"
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid
Verb: GETResponse code: 200 OK
Returns details of a job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobDetailsInfo",
-  "properties" : {
-    "duration" : {
-      "type" : "integer"
-    },
-    "end-time" : {
-      "type" : "integer"
-    },
-    "isStoppable" : {
-      "type" : "boolean"
-    },
-    "jid" : {
-      "type" : "any"
-    },
-    "name" : {
-      "type" : "string"
-    },
-    "now" : {
-      "type" : "integer"
-    },
-    "plan" : {
-      "type" : "string"
-    },
-    "start-time" : {
-      "type" : "integer"
-    },
-    "state" : {
-      "type" : "string",
-      "enum" : [ "INITIALIZING", "CREATED", "RUNNING", "FAILING", "FAILED", "CANCELLING", "CANCELED", "FINISHED", "RESTARTING", "SUSPENDED", "RECONCILING" ]
-    },
-    "status-counts" : {
-      "type" : "object",
-      "additionalProperties" : {
-        "type" : "integer"
-      }
-    },
-    "timestamps" : {
-      "type" : "object",
-      "additionalProperties" : {
-        "type" : "integer"
-      }
-    },
-    "vertices" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobDetailsInfo:JobVertexDetailsInfo",
-        "properties" : {
-          "duration" : {
-            "type" : "integer"
-          },
-          "end-time" : {
-            "type" : "integer"
-          },
-          "id" : {
-            "type" : "any"
-          },
-          "metrics" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:metrics:IOMetricsInfo",
-            "properties" : {
-              "read-bytes" : {
-                "type" : "integer"
-              },
-              "read-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "read-records" : {
-                "type" : "integer"
-              },
-              "read-records-complete" : {
-                "type" : "boolean"
-              },
-              "write-bytes" : {
-                "type" : "integer"
-              },
-              "write-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "write-records" : {
-                "type" : "integer"
-              },
-              "write-records-complete" : {
-                "type" : "boolean"
-              }
-            }
-          },
-          "name" : {
-            "type" : "string"
-          },
-          "parallelism" : {
-            "type" : "integer"
-          },
-          "start-time" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "CREATED", "SCHEDULED", "DEPLOYING", "RUNNING", "FINISHED", "CANCELING", "CANCELED", "FAILED", "RECONCILING" ]
-          },
-          "tasks" : {
-            "type" : "object",
-            "additionalProperties" : {
-              "type" : "integer"
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid
Verb: PATCHResponse code: 202 Accepted
Terminates a job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
Query parameters
-
    -
  • mode (optional): String value that specifies the termination mode. The only supported value is: "cancel".
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{}            
-          
-
-
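A minimal sketch of terminating a job through the PATCH endpoint above; <jobid> is a placeholder.

```sh
# Cancel a running job. "cancel" is the only supported termination mode.
curl -s -X PATCH "http://localhost:8081/jobs/<jobid>?mode=cancel"
```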
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/accumulators
Verb: GETResponse code: 200 OK
Returns the accumulators for all tasks of a job, aggregated across the respective subtasks.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
Query parameters
-
    -
  • includeSerializedValue (optional): Boolean value that specifies whether serialized user task accumulators should be included in the response.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobAccumulatorsInfo",
-  "properties" : {
-    "job-accumulators" : {
-      "type" : "array",
-      "items" : {
-        "type" : "any"
-      }
-    },
-    "serialized-user-task-accumulators" : {
-      "type" : "object",
-      "additionalProperties" : {
-        "type" : "any"
-      }
-    },
-    "user-task-accumulators" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobAccumulatorsInfo:UserTaskAccumulator",
-        "properties" : {
-          "name" : {
-            "type" : "string"
-          },
-          "type" : {
-            "type" : "string"
-          },
-          "value" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/checkpoints
Verb: GETResponse code: 200 OK
Returns checkpointing statistics for a job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointingStatistics",
-  "properties" : {
-    "counts" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointingStatistics:Counts",
-      "properties" : {
-        "completed" : {
-          "type" : "integer"
-        },
-        "failed" : {
-          "type" : "integer"
-        },
-        "in_progress" : {
-          "type" : "integer"
-        },
-        "restored" : {
-          "type" : "integer"
-        },
-        "total" : {
-          "type" : "integer"
-        }
-      }
-    },
-    "history" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointStatistics",
-        "properties" : {
-          "alignment_buffered" : {
-            "type" : "integer"
-          },
-          "checkpoint_type" : {
-            "type" : "string",
-            "enum" : [ "CHECKPOINT", "SAVEPOINT", "SYNC_SAVEPOINT" ]
-          },
-          "end_to_end_duration" : {
-            "type" : "integer"
-          },
-          "id" : {
-            "type" : "integer"
-          },
-          "is_savepoint" : {
-            "type" : "boolean"
-          },
-          "latest_ack_timestamp" : {
-            "type" : "integer"
-          },
-          "num_acknowledged_subtasks" : {
-            "type" : "integer"
-          },
-          "num_subtasks" : {
-            "type" : "integer"
-          },
-          "persisted_data" : {
-            "type" : "integer"
-          },
-          "processed_data" : {
-            "type" : "integer"
-          },
-          "state_size" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-          },
-          "tasks" : {
-            "type" : "object",
-            "additionalProperties" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatistics"
-            }
-          },
-          "trigger_timestamp" : {
-            "type" : "integer"
-          }
-        }
-      }
-    },
-    "latest" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointingStatistics:LatestCheckpoints",
-      "properties" : {
-        "completed" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointStatistics:CompletedCheckpointStatistics",
-          "properties" : {
-            "alignment_buffered" : {
-              "type" : "integer"
-            },
-            "checkpoint_type" : {
-              "type" : "string",
-              "enum" : [ "CHECKPOINT", "SAVEPOINT", "SYNC_SAVEPOINT" ]
-            },
-            "discarded" : {
-              "type" : "boolean"
-            },
-            "end_to_end_duration" : {
-              "type" : "integer"
-            },
-            "external_path" : {
-              "type" : "string"
-            },
-            "id" : {
-              "type" : "integer"
-            },
-            "is_savepoint" : {
-              "type" : "boolean"
-            },
-            "latest_ack_timestamp" : {
-              "type" : "integer"
-            },
-            "num_acknowledged_subtasks" : {
-              "type" : "integer"
-            },
-            "num_subtasks" : {
-              "type" : "integer"
-            },
-            "persisted_data" : {
-              "type" : "integer"
-            },
-            "processed_data" : {
-              "type" : "integer"
-            },
-            "state_size" : {
-              "type" : "integer"
-            },
-            "status" : {
-              "type" : "string",
-              "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-            },
-            "tasks" : {
-              "type" : "object",
-              "additionalProperties" : {
-                "type" : "object",
-                "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatistics",
-                "properties" : {
-                  "alignment_buffered" : {
-                    "type" : "integer"
-                  },
-                  "end_to_end_duration" : {
-                    "type" : "integer"
-                  },
-                  "id" : {
-                    "type" : "integer"
-                  },
-                  "latest_ack_timestamp" : {
-                    "type" : "integer"
-                  },
-                  "num_acknowledged_subtasks" : {
-                    "type" : "integer"
-                  },
-                  "num_subtasks" : {
-                    "type" : "integer"
-                  },
-                  "persisted_data" : {
-                    "type" : "integer"
-                  },
-                  "processed_data" : {
-                    "type" : "integer"
-                  },
-                  "state_size" : {
-                    "type" : "integer"
-                  },
-                  "status" : {
-                    "type" : "string",
-                    "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-                  }
-                }
-              }
-            },
-            "trigger_timestamp" : {
-              "type" : "integer"
-            }
-          }
-        },
-        "failed" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointStatistics:FailedCheckpointStatistics",
-          "properties" : {
-            "alignment_buffered" : {
-              "type" : "integer"
-            },
-            "checkpoint_type" : {
-              "type" : "string",
-              "enum" : [ "CHECKPOINT", "SAVEPOINT", "SYNC_SAVEPOINT" ]
-            },
-            "end_to_end_duration" : {
-              "type" : "integer"
-            },
-            "failure_message" : {
-              "type" : "string"
-            },
-            "failure_timestamp" : {
-              "type" : "integer"
-            },
-            "id" : {
-              "type" : "integer"
-            },
-            "is_savepoint" : {
-              "type" : "boolean"
-            },
-            "latest_ack_timestamp" : {
-              "type" : "integer"
-            },
-            "num_acknowledged_subtasks" : {
-              "type" : "integer"
-            },
-            "num_subtasks" : {
-              "type" : "integer"
-            },
-            "persisted_data" : {
-              "type" : "integer"
-            },
-            "processed_data" : {
-              "type" : "integer"
-            },
-            "state_size" : {
-              "type" : "integer"
-            },
-            "status" : {
-              "type" : "string",
-              "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-            },
-            "tasks" : {
-              "type" : "object",
-              "additionalProperties" : {
-                "type" : "object",
-                "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatistics"
-              }
-            },
-            "trigger_timestamp" : {
-              "type" : "integer"
-            }
-          }
-        },
-        "restored" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointingStatistics:RestoredCheckpointStatistics",
-          "properties" : {
-            "external_path" : {
-              "type" : "string"
-            },
-            "id" : {
-              "type" : "integer"
-            },
-            "is_savepoint" : {
-              "type" : "boolean"
-            },
-            "restore_timestamp" : {
-              "type" : "integer"
-            }
-          }
-        },
-        "savepoint" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointStatistics:CompletedCheckpointStatistics"
-        }
-      }
-    },
-    "summary" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointingStatistics:Summary",
-      "properties" : {
-        "alignment_buffered" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "end_to_end_duration" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "persisted_data" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "processed_data" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "state_size" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics",
-          "properties" : {
-            "avg" : {
-              "type" : "integer"
-            },
-            "max" : {
-              "type" : "integer"
-            },
-            "min" : {
-              "type" : "integer"
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/checkpoints/config
Verb: GETResponse code: 200 OK
Returns the checkpointing configuration.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointConfigInfo",
-  "properties" : {
-    "externalization" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointConfigInfo:ExternalizedCheckpointInfo",
-      "properties" : {
-        "delete_on_cancellation" : {
-          "type" : "boolean"
-        },
-        "enabled" : {
-          "type" : "boolean"
-        }
-      }
-    },
-    "interval" : {
-      "type" : "integer"
-    },
-    "max_concurrent" : {
-      "type" : "integer"
-    },
-    "min_pause" : {
-      "type" : "integer"
-    },
-    "mode" : {
-      "type" : "any"
-    },
-    "state_backend" : {
-      "type" : "string"
-    },
-    "timeout" : {
-      "type" : "integer"
-    },
-    "tolerable_failed_checkpoints" : {
-      "type" : "integer"
-    },
-    "unaligned_checkpoints" : {
-      "type" : "boolean"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/checkpoints/details/:checkpointid
Verb: GETResponse code: 200 OK
Returns details for a checkpoint.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • checkpointid - Long value that identifies a checkpoint.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:CheckpointStatistics",
-  "properties" : {
-    "alignment_buffered" : {
-      "type" : "integer"
-    },
-    "checkpoint_type" : {
-      "type" : "string",
-      "enum" : [ "CHECKPOINT", "SAVEPOINT", "SYNC_SAVEPOINT" ]
-    },
-    "end_to_end_duration" : {
-      "type" : "integer"
-    },
-    "id" : {
-      "type" : "integer"
-    },
-    "is_savepoint" : {
-      "type" : "boolean"
-    },
-    "latest_ack_timestamp" : {
-      "type" : "integer"
-    },
-    "num_acknowledged_subtasks" : {
-      "type" : "integer"
-    },
-    "num_subtasks" : {
-      "type" : "integer"
-    },
-    "persisted_data" : {
-      "type" : "integer"
-    },
-    "processed_data" : {
-      "type" : "integer"
-    },
-    "state_size" : {
-      "type" : "integer"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-    },
-    "tasks" : {
-      "type" : "object",
-      "additionalProperties" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatistics",
-        "properties" : {
-          "alignment_buffered" : {
-            "type" : "integer"
-          },
-          "end_to_end_duration" : {
-            "type" : "integer"
-          },
-          "id" : {
-            "type" : "integer"
-          },
-          "latest_ack_timestamp" : {
-            "type" : "integer"
-          },
-          "num_acknowledged_subtasks" : {
-            "type" : "integer"
-          },
-          "num_subtasks" : {
-            "type" : "integer"
-          },
-          "persisted_data" : {
-            "type" : "integer"
-          },
-          "processed_data" : {
-            "type" : "integer"
-          },
-          "state_size" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-          }
-        }
-      }
-    },
-    "trigger_timestamp" : {
-      "type" : "integer"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/checkpoints/details/:checkpointid/subtasks/:vertexid
Verb: GETResponse code: 200 OK
Returns checkpoint statistics for a task and its subtasks.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • checkpointid - Long value that identifies a checkpoint.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatisticsWithSubtaskDetails",
-  "properties" : {
-    "alignment_buffered" : {
-      "type" : "integer"
-    },
-    "end_to_end_duration" : {
-      "type" : "integer"
-    },
-    "id" : {
-      "type" : "integer"
-    },
-    "latest_ack_timestamp" : {
-      "type" : "integer"
-    },
-    "num_acknowledged_subtasks" : {
-      "type" : "integer"
-    },
-    "num_subtasks" : {
-      "type" : "integer"
-    },
-    "persisted_data" : {
-      "type" : "integer"
-    },
-    "processed_data" : {
-      "type" : "integer"
-    },
-    "state_size" : {
-      "type" : "integer"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "IN_PROGRESS", "COMPLETED", "FAILED" ]
-    },
-    "subtasks" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:SubtaskCheckpointStatistics",
-        "properties" : {
-          "index" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string"
-          }
-        }
-      }
-    },
-    "summary" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatisticsWithSubtaskDetails:Summary",
-      "properties" : {
-        "alignment" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatisticsWithSubtaskDetails:CheckpointAlignment",
-          "properties" : {
-            "buffered" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            },
-            "duration" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            },
-            "persisted" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            },
-            "processed" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            }
-          }
-        },
-        "checkpoint_duration" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:TaskCheckpointStatisticsWithSubtaskDetails:CheckpointDuration",
-          "properties" : {
-            "async" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            },
-            "sync" : {
-              "type" : "object",
-              "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-            }
-          }
-        },
-        "end_to_end_duration" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "start_delay" : {
-          "type" : "object",
-          "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics"
-        },
-        "state_size" : {
-          "type" : "object",
-          "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:checkpoints:MinMaxAvgStatistics",
-          "properties" : {
-            "avg" : {
-              "type" : "integer"
-            },
-            "max" : {
-              "type" : "integer"
-            },
-            "min" : {
-              "type" : "integer"
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/config
Verb: GETResponse code: 200 OK
Returns the configuration of a job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/exceptions
Verb: GETResponse code: 200 OK
Returns the non-recoverable exceptions that have been observed by the job. The truncated flag indicates that more exceptions occurred but are not listed, because the response would otherwise become too large.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
Query parameters
-
    -
  • maxExceptions (optional): Comma-separated list of integer values that specifies the upper limit of exceptions to return.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfo",
-  "properties" : {
-    "all-exceptions" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobExceptionsInfo:ExecutionExceptionInfo",
-        "properties" : {
-          "exception" : {
-            "type" : "string"
-          },
-          "location" : {
-            "type" : "string"
-          },
-          "task" : {
-            "type" : "string"
-          },
-          "timestamp" : {
-            "type" : "integer"
-          }
-        }
-      }
-    },
-    "root-exception" : {
-      "type" : "string"
-    },
-    "timestamp" : {
-      "type" : "integer"
-    },
-    "truncated" : {
-      "type" : "boolean"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/execution-result
Verb: GETResponse code: 200 OK
Returns the result of a job execution. Gives access to the execution time of the job and to all accumulators created by this job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:JobExecutionResultResponseBody",
-  "properties" : {
-    "job-execution-result" : {
-      "type" : "any"
-    },
-    "status" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:queue:QueueStatus",
-      "required" : true,
-      "properties" : {
-        "id" : {
-          "type" : "string",
-          "required" : true,
-          "enum" : [ "IN_PROGRESS", "COMPLETED" ]
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/metrics
Verb: GETResponse code: 200 OK
Provides access to job metrics.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/plan
Verb: GETResponse code: 200 OK
Returns the dataflow plan of a job.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobPlanInfo",
-  "properties" : {
-    "plan" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/rescaling
Verb: PATCHResponse code: 200 OK
Triggers the rescaling of a job. This asynchronous operation returns a 'triggerid' that can be used to query its status.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
Query parameters
-
    -
  • parallelism (mandatory): Positive integer value that specifies the desired parallelism.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:TriggerResponse",
-  "properties" : {
-    "request-id" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/rescaling/:triggerid
Verb: GETResponse code: 200 OK
Returns the status of a rescaling operation.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned when the operation was triggered.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationResult",
-  "properties" : {
-    "operation" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationInfo",
-      "properties" : {
-        "failure-cause" : {
-          "type" : "any"
-        }
-      }
-    },
-    "status" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:queue:QueueStatus",
-      "properties" : {
-        "id" : {
-          "type" : "string",
-          "required" : true,
-          "enum" : [ "IN_PROGRESS", "COMPLETED" ]
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/savepoints
Verb: POSTResponse code: 202 Accepted
Triggers a savepoint, and optionally cancels the job afterwards. This asynchronous operation returns a 'triggerid' that can be used to query its status.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:savepoints:SavepointTriggerRequestBody",
-  "properties" : {
-    "cancel-job" : {
-      "type" : "boolean"
-    },
-    "target-directory" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:TriggerResponse",
-  "properties" : {
-    "request-id" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/savepoints/:triggerid
Verb: GETResponse code: 200 OK
Returns the status of a savepoint operation.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned when the operation was triggered.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationResult",
-  "properties" : {
-    "operation" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:savepoints:SavepointInfo",
-      "properties" : {
-        "failure-cause" : {
-          "type" : "any"
-        },
-        "location" : {
-          "type" : "string"
-        }
-      }
-    },
-    "status" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:queue:QueueStatus",
-      "properties" : {
-        "id" : {
-          "type" : "string",
-          "required" : true,
-          "enum" : [ "IN_PROGRESS", "COMPLETED" ]
-        }
-      }
-    }
-  }
-}            
-          
-
-
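A minimal sketch combining the two savepoint endpoints above: trigger a savepoint, then poll the returned trigger ID. The target directory and all IDs are placeholders.

```sh
# Trigger a savepoint without cancelling the job; the response carries a
# "request-id" that serves as the trigger ID.
curl -s -X POST http://localhost:8081/jobs/<jobid>/savepoints \
  -H "Content-Type: application/json" \
  -d '{ "target-directory": "s3://my-bucket/savepoints", "cancel-job": false }'

# Poll until "status.id" becomes COMPLETED; "operation.location" then holds
# the savepoint path.
curl -s http://localhost:8081/jobs/<jobid>/savepoints/<triggerid>
```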
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/stop
Verb: POSTResponse code: 202 Accepted
Stops a job with a savepoint. Optionally, it can also emit a MAX_WATERMARK before taking the savepoint to flush out any state waiting for timers to fire. This asynchronous operation returns a 'triggerid' that can be used to query its status.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:savepoints:stop:StopWithSavepointRequestBody",
-  "properties" : {
-    "drain" : {
-      "type" : "boolean"
-    },
-    "targetDirectory" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:TriggerResponse",
-  "properties" : {
-    "request-id" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
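A minimal sketch of stopping a job with a savepoint, as described above; the target directory is a placeholder and draining is shown as an explicit choice.

```sh
# Stop the job with a savepoint. "drain": true emits a MAX_WATERMARK before the
# savepoint is taken so that pending event-time timers fire and flush their state.
curl -s -X POST http://localhost:8081/jobs/<jobid>/stop \
  -H "Content-Type: application/json" \
  -d '{ "targetDirectory": "s3://my-bucket/savepoints", "drain": true }'
# The response again carries a "request-id" for the asynchronous operation.
```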
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid
Verb: GETResponse code: 200 OK
Returns details for a task, with a summary for each of its subtasks.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexDetailsInfo",
-  "properties" : {
-    "id" : {
-      "type" : "any"
-    },
-    "name" : {
-      "type" : "string"
-    },
-    "now" : {
-      "type" : "integer"
-    },
-    "parallelism" : {
-      "type" : "integer"
-    },
-    "subtasks" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtaskExecutionAttemptDetailsInfo",
-        "properties" : {
-          "attempt" : {
-            "type" : "integer"
-          },
-          "duration" : {
-            "type" : "integer"
-          },
-          "end-time" : {
-            "type" : "integer"
-          },
-          "host" : {
-            "type" : "string"
-          },
-          "metrics" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:metrics:IOMetricsInfo",
-            "properties" : {
-              "read-bytes" : {
-                "type" : "integer"
-              },
-              "read-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "read-records" : {
-                "type" : "integer"
-              },
-              "read-records-complete" : {
-                "type" : "boolean"
-              },
-              "write-bytes" : {
-                "type" : "integer"
-              },
-              "write-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "write-records" : {
-                "type" : "integer"
-              },
-              "write-records-complete" : {
-                "type" : "boolean"
-              }
-            }
-          },
-          "start-time" : {
-            "type" : "integer"
-          },
-          "start_time" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "CREATED", "SCHEDULED", "DEPLOYING", "RUNNING", "FINISHED", "CANCELING", "CANCELED", "FAILED", "RECONCILING" ]
-          },
-          "subtask" : {
-            "type" : "integer"
-          },
-          "taskmanager-id" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/accumulators
Verb: GETResponse code: 200 OK
Returns user-defined accumulators of a task, aggregated across all subtasks.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexAccumulatorsInfo",
-  "properties" : {
-    "id" : {
-      "type" : "string"
-    },
-    "user-accumulators" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:UserAccumulator",
-        "properties" : {
-          "name" : {
-            "type" : "string"
-          },
-          "type" : {
-            "type" : "string"
-          },
-          "value" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/backpressure
Verb: GETResponse code: 200 OK
Returns back-pressure information for a job, and may initiate back-pressure sampling if necessary.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexBackPressureInfo",
-  "properties" : {
-    "backpressure-level" : {
-      "type" : "string",
-      "enum" : [ "ok", "low", "high" ]
-    },
-    "end-timestamp" : {
-      "type" : "integer"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "deprecated", "ok" ]
-    },
-    "subtasks" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexBackPressureInfo:SubtaskBackPressureInfo",
-        "properties" : {
-          "backpressure-level" : {
-            "type" : "string",
-            "enum" : [ "ok", "low", "high" ]
-          },
-          "ratio" : {
-            "type" : "number"
-          },
-          "subtask" : {
-            "type" : "integer"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/metrics
Verb: GETResponse code: 200 OK
Provides access to task metrics.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/accumulators
Verb: GETResponse code: 200 OK
Returns all user-defined accumulators for all subtasks of a task.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtasksAllAccumulatorsInfo",
-  "properties" : {
-    "id" : {
-      "type" : "any"
-    },
-    "parallelism" : {
-      "type" : "integer"
-    },
-    "subtasks" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtasksAllAccumulatorsInfo:SubtaskAccumulatorsInfo",
-        "properties" : {
-          "attempt" : {
-            "type" : "integer"
-          },
-          "host" : {
-            "type" : "string"
-          },
-          "subtask" : {
-            "type" : "integer"
-          },
-          "user-accumulators" : {
-            "type" : "array",
-            "items" : {
-              "type" : "object",
-              "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:UserAccumulator",
-              "properties" : {
-                "name" : {
-                  "type" : "string"
-                },
-                "type" : {
-                  "type" : "string"
-                },
-                "value" : {
-                  "type" : "string"
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/metrics
Verb: GETResponse code: 200 OK
Provides access to aggregated subtask metrics.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
  • agg (optional): Comma-separated list of aggregation modes which should be calculated. Available aggregations are: "min, max, sum, avg".
  • -
  • subtasks (optional): Comma-separated list of integer ranges (e.g. "1,3,5-9") to select specific subtasks.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
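The same pattern applies to the aggregated variant; a sketch combining the get, agg, and subtasks parameters (same assumptions as above: REST API at http://localhost:8081, hypothetical IDs and an example metric name):

    import requests

    BASE = "http://localhost:8081"                      # assumed JobManager REST address
    JOB_ID = "0123456789abcdef0123456789abcdef"         # hypothetical job id
    VERTEX_ID = "fedcba9876543210fedcba9876543210"      # hypothetical vertex id

    # Request min/max/avg of one metric, restricted to subtasks 0-3 and 5.
    resp = requests.get(
        f"{BASE}/jobs/{JOB_ID}/vertices/{VERTEX_ID}/subtasks/metrics",
        params={
            "get": "numRecordsOutPerSecond",            # example metric name
            "agg": "min,max,avg",
            "subtasks": "0-3,5",
        },
    )
    resp.raise_for_status()
    print(resp.json())  # e.g. [{"id": "...", "min": ..., "max": ..., "avg": ...}]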
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/:subtaskindex
Verb: GET | Response code: 200 OK
Returns details of the current or latest execution attempt of a subtask.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
  • subtaskindex - Positive integer value that identifies a subtask.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtaskExecutionAttemptDetailsInfo",
-  "properties" : {
-    "attempt" : {
-      "type" : "integer"
-    },
-    "duration" : {
-      "type" : "integer"
-    },
-    "end-time" : {
-      "type" : "integer"
-    },
-    "host" : {
-      "type" : "string"
-    },
-    "metrics" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:metrics:IOMetricsInfo",
-      "properties" : {
-        "read-bytes" : {
-          "type" : "integer"
-        },
-        "read-bytes-complete" : {
-          "type" : "boolean"
-        },
-        "read-records" : {
-          "type" : "integer"
-        },
-        "read-records-complete" : {
-          "type" : "boolean"
-        },
-        "write-bytes" : {
-          "type" : "integer"
-        },
-        "write-bytes-complete" : {
-          "type" : "boolean"
-        },
-        "write-records" : {
-          "type" : "integer"
-        },
-        "write-records-complete" : {
-          "type" : "boolean"
-        }
-      }
-    },
-    "start-time" : {
-      "type" : "integer"
-    },
-    "start_time" : {
-      "type" : "integer"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "CREATED", "SCHEDULED", "DEPLOYING", "RUNNING", "FINISHED", "CANCELING", "CANCELED", "FAILED", "RECONCILING" ]
-    },
-    "subtask" : {
-      "type" : "integer"
-    },
-    "taskmanager-id" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
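A short sketch of reading the execution attempt details of a single subtask and its I/O metrics, following the SubtaskExecutionAttemptDetailsInfo schema above (assumed REST address and hypothetical IDs as before):

    import requests

    BASE = "http://localhost:8081"                      # assumed JobManager REST address
    JOB_ID = "0123456789abcdef0123456789abcdef"         # hypothetical job id
    VERTEX_ID = "fedcba9876543210fedcba9876543210"      # hypothetical vertex id
    SUBTASK_INDEX = 0                                   # hypothetical subtask index

    detail = requests.get(
        f"{BASE}/jobs/{JOB_ID}/vertices/{VERTEX_ID}/subtasks/{SUBTASK_INDEX}"
    ).json()

    io = detail["metrics"]  # IOMetricsInfo, see schema above
    print(detail["status"], "attempt", detail["attempt"], "on", detail["taskmanager-id"])
    print("records read   :", io["read-records"],
          "(complete)" if io["read-records-complete"] else "(partial)")
    print("records written:", io["write-records"])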
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/:subtaskindex/attempts/:attempt
Verb: GET | Response code: 200 OK
Returns details of an execution attempt of a subtask. Multiple execution attempts happen in case of failure/recovery.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
  • subtaskindex - Positive integer value that identifies a subtask.
  • -
  • attempt - Positive integer value that identifies an execution attempt.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtaskExecutionAttemptDetailsInfo",
-  "properties" : {
-    "attempt" : {
-      "type" : "integer"
-    },
-    "duration" : {
-      "type" : "integer"
-    },
-    "end-time" : {
-      "type" : "integer"
-    },
-    "host" : {
-      "type" : "string"
-    },
-    "metrics" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:metrics:IOMetricsInfo",
-      "properties" : {
-        "read-bytes" : {
-          "type" : "integer"
-        },
-        "read-bytes-complete" : {
-          "type" : "boolean"
-        },
-        "read-records" : {
-          "type" : "integer"
-        },
-        "read-records-complete" : {
-          "type" : "boolean"
-        },
-        "write-bytes" : {
-          "type" : "integer"
-        },
-        "write-bytes-complete" : {
-          "type" : "boolean"
-        },
-        "write-records" : {
-          "type" : "integer"
-        },
-        "write-records-complete" : {
-          "type" : "boolean"
-        }
-      }
-    },
-    "start-time" : {
-      "type" : "integer"
-    },
-    "start_time" : {
-      "type" : "integer"
-    },
-    "status" : {
-      "type" : "string",
-      "enum" : [ "CREATED", "SCHEDULED", "DEPLOYING", "RUNNING", "FINISHED", "CANCELING", "CANCELED", "FAILED", "RECONCILING" ]
-    },
-    "subtask" : {
-      "type" : "integer"
-    },
-    "taskmanager-id" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/:subtaskindex/attempts/:attempt/accumulators
Verb: GET | Response code: 200 OK
Returns the accumulators of an execution attempt of a subtask. Multiple execution attempts happen in case of failure/recovery.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
  • subtaskindex - Positive integer value that identifies a subtask.
  • -
  • attempt - Positive integer value that identifies an execution attempt.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:SubtaskExecutionAttemptAccumulatorsInfo",
-  "properties" : {
-    "attempt" : {
-      "type" : "integer"
-    },
-    "id" : {
-      "type" : "string"
-    },
-    "subtask" : {
-      "type" : "integer"
-    },
-    "user-accumulators" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:UserAccumulator",
-        "properties" : {
-          "name" : {
-            "type" : "string"
-          },
-          "type" : {
-            "type" : "string"
-          },
-          "value" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasks/:subtaskindex/metrics
Verb: GET | Response code: 200 OK
Provides access to subtask metrics.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
  • subtaskindex - Positive integer value that identifies a subtask.
  • -
-
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/subtasktimes
Verb: GET | Response code: 200 OK
Returns time-related information for all subtasks of a task.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:SubtasksTimesInfo",
-  "properties" : {
-    "id" : {
-      "type" : "string"
-    },
-    "name" : {
-      "type" : "string"
-    },
-    "now" : {
-      "type" : "integer"
-    },
-    "subtasks" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:SubtasksTimesInfo:SubtaskTimeInfo",
-        "properties" : {
-          "duration" : {
-            "type" : "integer"
-          },
-          "host" : {
-            "type" : "string"
-          },
-          "subtask" : {
-            "type" : "integer"
-          },
-          "timestamps" : {
-            "type" : "object",
-            "additionalProperties" : {
-              "type" : "integer"
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/taskmanagers
Verb: GET | Response code: 200 OK
Returns task information aggregated by task manager.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexTaskManagersInfo",
-  "properties" : {
-    "id" : {
-      "type" : "any"
-    },
-    "name" : {
-      "type" : "string"
-    },
-    "now" : {
-      "type" : "integer"
-    },
-    "taskmanagers" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:JobVertexTaskManagersInfo:TaskManagersInfo",
-        "properties" : {
-          "duration" : {
-            "type" : "integer"
-          },
-          "end-time" : {
-            "type" : "integer"
-          },
-          "host" : {
-            "type" : "string"
-          },
-          "metrics" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:metrics:IOMetricsInfo",
-            "properties" : {
-              "read-bytes" : {
-                "type" : "integer"
-              },
-              "read-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "read-records" : {
-                "type" : "integer"
-              },
-              "read-records-complete" : {
-                "type" : "boolean"
-              },
-              "write-bytes" : {
-                "type" : "integer"
-              },
-              "write-bytes-complete" : {
-                "type" : "boolean"
-              },
-              "write-records" : {
-                "type" : "integer"
-              },
-              "write-records-complete" : {
-                "type" : "boolean"
-              }
-            }
-          },
-          "start-time" : {
-            "type" : "integer"
-          },
-          "status" : {
-            "type" : "string",
-            "enum" : [ "CREATED", "SCHEDULED", "DEPLOYING", "RUNNING", "FINISHED", "CANCELING", "CANCELED", "FAILED", "RECONCILING" ]
-          },
-          "status-counts" : {
-            "type" : "object",
-            "additionalProperties" : {
-              "type" : "integer"
-            }
-          },
-          "taskmanager-id" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/jobs/:jobid/vertices/:vertexid/watermarks
Verb: GET | Response code: 200 OK
Returns the watermarks for all subtasks of a task.
Path parameters
-
    -
  • jobid - 32-character hexadecimal string value that identifies a job.
  • -
  • vertexid - 32-character hexadecimal string value that identifies a job vertex.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - -
/overview
Verb: GET | Response code: 200 OK
Returns an overview of the Flink cluster.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:legacy:messages:ClusterOverviewWithVersion",
-  "properties" : {
-    "flink-commit" : {
-      "type" : "string"
-    },
-    "flink-version" : {
-      "type" : "string"
-    },
-    "jobs-cancelled" : {
-      "type" : "integer"
-    },
-    "jobs-failed" : {
-      "type" : "integer"
-    },
-    "jobs-finished" : {
-      "type" : "integer"
-    },
-    "jobs-running" : {
-      "type" : "integer"
-    },
-    "slots-available" : {
-      "type" : "integer"
-    },
-    "slots-total" : {
-      "type" : "integer"
-    },
-    "taskmanagers" : {
-      "type" : "integer"
-    }
-  }
-}            
-          
-
-
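A minimal sketch of reading the cluster overview; the field names follow the ClusterOverviewWithVersion schema above, and http://localhost:8081 is an assumed REST address:

    import requests

    BASE = "http://localhost:8081"  # assumed JobManager REST address

    overview = requests.get(f"{BASE}/overview").json()
    print("Flink", overview["flink-version"], "-",
          overview["taskmanagers"], "task managers,",
          overview["slots-available"], "of", overview["slots-total"], "slots free,",
          overview["jobs-running"], "jobs running")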
- - - - - - - - - - - - - - - - - - - -
/savepoint-disposal
Verb: POST | Response code: 200 OK
Triggers the disposal of a savepoint. This asynchronous operation returns a 'triggerid', which can be used to query the status of the disposal.
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:job:savepoints:SavepointDisposalRequest",
-  "properties" : {
-    "savepoint-path" : {
-      "type" : "string"
-    }
-  }
-}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:TriggerResponse",
-  "properties" : {
-    "request-id" : {
-      "type" : "any"
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/savepoint-disposal/:triggerid
Verb: GET | Response code: 200 OK
Returns the status of a savepoint disposal operation.
Path parameters
-
    -
  • triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned when the operation was triggered.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationResult",
-  "properties" : {
-    "operation" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:handler:async:AsynchronousOperationInfo",
-      "properties" : {
-        "failure-cause" : {
-          "type" : "any"
-        }
-      }
-    },
-    "status" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:queue:QueueStatus",
-      "properties" : {
-        "id" : {
-          "type" : "string",
-          "required" : true,
-          "enum" : [ "IN_PROGRESS", "COMPLETED" ]
-        }
-      }
-    }
-  }
-}            
-          
-
-
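Since savepoint disposal is asynchronous, a client first POSTs to /savepoint-disposal and then polls /savepoint-disposal/:triggerid until the returned status leaves IN_PROGRESS. A sketch of that flow (assumed REST address, hypothetical savepoint path):

    import time
    import requests

    BASE = "http://localhost:8081"  # assumed JobManager REST address

    # Trigger the disposal; the TriggerResponse carries the id to poll with.
    trigger = requests.post(
        f"{BASE}/savepoint-disposal",
        json={"savepoint-path": "hdfs:///flink/savepoints/savepoint-123abc"},  # hypothetical path
    ).json()
    trigger_id = trigger["request-id"]

    # Poll until the operation is no longer IN_PROGRESS.
    while True:
        result = requests.get(f"{BASE}/savepoint-disposal/{trigger_id}").json()
        if result["status"]["id"] != "IN_PROGRESS":
            break
        time.sleep(1)

    failure = result.get("operation", {}).get("failure-cause")
    print("disposal failed:" if failure else "disposal completed", failure or "")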
- - - - - - - - - - - - - - - - - - - -
/taskmanagers
Verb: GET | Response code: 200 OK
Returns an overview of all task managers.
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:TaskManagersInfo",
-  "properties" : {
-    "taskmanagers" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:TaskManagerInfo",
-        "properties" : {
-          "dataPort" : {
-            "type" : "integer"
-          },
-          "freeResource" : {
-            "type" : "object",
-            "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:ResourceProfileInfo"
-          },
-          "freeSlots" : {
-            "type" : "integer"
-          },
-          "hardware" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:instance:HardwareDescription",
-            "properties" : {
-              "cpuCores" : {
-                "type" : "integer"
-              },
-              "freeMemory" : {
-                "type" : "integer"
-              },
-              "managedMemory" : {
-                "type" : "integer"
-              },
-              "physicalMemory" : {
-                "type" : "integer"
-              }
-            }
-          },
-          "id" : {
-            "type" : "any"
-          },
-          "jmxPort" : {
-            "type" : "integer"
-          },
-          "memoryConfiguration" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:taskexecutor:TaskExecutorMemoryConfiguration",
-            "properties" : {
-              "frameworkHeap" : {
-                "type" : "integer"
-              },
-              "frameworkOffHeap" : {
-                "type" : "integer"
-              },
-              "jvmMetaspace" : {
-                "type" : "integer"
-              },
-              "jvmOverhead" : {
-                "type" : "integer"
-              },
-              "managedMemory" : {
-                "type" : "integer"
-              },
-              "networkMemory" : {
-                "type" : "integer"
-              },
-              "taskHeap" : {
-                "type" : "integer"
-              },
-              "taskOffHeap" : {
-                "type" : "integer"
-              },
-              "totalFlinkMemory" : {
-                "type" : "integer"
-              },
-              "totalProcessMemory" : {
-                "type" : "integer"
-              }
-            }
-          },
-          "path" : {
-            "type" : "string"
-          },
-          "slotsNumber" : {
-            "type" : "integer"
-          },
-          "timeSinceLastHeartbeat" : {
-            "type" : "integer"
-          },
-          "totalResource" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:ResourceProfileInfo",
-            "properties" : {
-              "cpuCores" : {
-                "type" : "number"
-              },
-              "extendedResources" : {
-                "type" : "object",
-                "additionalProperties" : {
-                  "type" : "number"
-                }
-              },
-              "managedMemory" : {
-                "type" : "integer"
-              },
-              "networkMemory" : {
-                "type" : "integer"
-              },
-              "taskHeapMemory" : {
-                "type" : "integer"
-              },
-              "taskOffHeapMemory" : {
-                "type" : "integer"
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
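A sketch of listing the registered task managers and their slot usage, following the TaskManagerInfo schema above (assumed REST address):

    import requests

    BASE = "http://localhost:8081"  # assumed JobManager REST address

    for tm in requests.get(f"{BASE}/taskmanagers").json()["taskmanagers"]:
        print(tm["id"], "-",
              tm["freeSlots"], "of", tm["slotsNumber"], "slots free,",
              "last heartbeat:", tm["timeSinceLastHeartbeat"])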
- - - - - - - - - - - - - - - - - - - - - - - - - -
/taskmanagers/metrics
Verb: GET | Response code: 200 OK
Provides access to aggregated task manager metrics.
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
  • agg (optional): Comma-separated list of aggregation modes which should be calculated. Available aggregations are: "min, max, sum, avg".
  • -
  • taskmanagers (optional): Comma-separated list of 32-character hexadecimal strings to select specific task managers.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/taskmanagers/:taskmanagerid
Verb: GET | Response code: 200 OK
Returns details for a task manager. "metrics.memorySegmentsAvailable" and "metrics.memorySegmentsTotal" are deprecated. Please use "metrics.nettyShuffleMemorySegmentsAvailable" and "metrics.nettyShuffleMemorySegmentsTotal" instead.
Path parameters
-
    -
  • taskmanagerid - 32-character hexadecimal string that identifies a task manager.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:TaskManagerDetailsInfo",
-  "properties" : {
-    "dataPort" : {
-      "type" : "integer"
-    },
-    "freeResource" : {
-      "type" : "object",
-      "$ref" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:ResourceProfileInfo"
-    },
-    "freeSlots" : {
-      "type" : "integer"
-    },
-    "hardware" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:instance:HardwareDescription",
-      "properties" : {
-        "cpuCores" : {
-          "type" : "integer"
-        },
-        "freeMemory" : {
-          "type" : "integer"
-        },
-        "managedMemory" : {
-          "type" : "integer"
-        },
-        "physicalMemory" : {
-          "type" : "integer"
-        }
-      }
-    },
-    "id" : {
-      "type" : "any"
-    },
-    "jmxPort" : {
-      "type" : "integer"
-    },
-    "memoryConfiguration" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:taskexecutor:TaskExecutorMemoryConfiguration",
-      "properties" : {
-        "frameworkHeap" : {
-          "type" : "integer"
-        },
-        "frameworkOffHeap" : {
-          "type" : "integer"
-        },
-        "jvmMetaspace" : {
-          "type" : "integer"
-        },
-        "jvmOverhead" : {
-          "type" : "integer"
-        },
-        "managedMemory" : {
-          "type" : "integer"
-        },
-        "networkMemory" : {
-          "type" : "integer"
-        },
-        "taskHeap" : {
-          "type" : "integer"
-        },
-        "taskOffHeap" : {
-          "type" : "integer"
-        },
-        "totalFlinkMemory" : {
-          "type" : "integer"
-        },
-        "totalProcessMemory" : {
-          "type" : "integer"
-        }
-      }
-    },
-    "metrics" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:TaskManagerMetricsInfo",
-      "properties" : {
-        "directCount" : {
-          "type" : "integer"
-        },
-        "directMax" : {
-          "type" : "integer"
-        },
-        "directUsed" : {
-          "type" : "integer"
-        },
-        "garbageCollectors" : {
-          "type" : "array",
-          "items" : {
-            "type" : "object",
-            "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:TaskManagerMetricsInfo:GarbageCollectorInfo",
-            "properties" : {
-              "count" : {
-                "type" : "integer"
-              },
-              "name" : {
-                "type" : "string"
-              },
-              "time" : {
-                "type" : "integer"
-              }
-            }
-          }
-        },
-        "heapCommitted" : {
-          "type" : "integer"
-        },
-        "heapMax" : {
-          "type" : "integer"
-        },
-        "heapUsed" : {
-          "type" : "integer"
-        },
-        "mappedCount" : {
-          "type" : "integer"
-        },
-        "mappedMax" : {
-          "type" : "integer"
-        },
-        "mappedUsed" : {
-          "type" : "integer"
-        },
-        "memorySegmentsAvailable" : {
-          "type" : "integer"
-        },
-        "memorySegmentsTotal" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemoryAvailable" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemorySegmentsAvailable" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemorySegmentsTotal" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemorySegmentsUsed" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemoryTotal" : {
-          "type" : "integer"
-        },
-        "nettyShuffleMemoryUsed" : {
-          "type" : "integer"
-        },
-        "nonHeapCommitted" : {
-          "type" : "integer"
-        },
-        "nonHeapMax" : {
-          "type" : "integer"
-        },
-        "nonHeapUsed" : {
-          "type" : "integer"
-        }
-      }
-    },
-    "path" : {
-      "type" : "string"
-    },
-    "slotsNumber" : {
-      "type" : "integer"
-    },
-    "timeSinceLastHeartbeat" : {
-      "type" : "integer"
-    },
-    "totalResource" : {
-      "type" : "object",
-      "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:ResourceProfileInfo",
-      "properties" : {
-        "cpuCores" : {
-          "type" : "number"
-        },
-        "extendedResources" : {
-          "type" : "object",
-          "additionalProperties" : {
-            "type" : "number"
-          }
-        },
-        "managedMemory" : {
-          "type" : "integer"
-        },
-        "networkMemory" : {
-          "type" : "integer"
-        },
-        "taskHeapMemory" : {
-          "type" : "integer"
-        },
-        "taskOffHeapMemory" : {
-          "type" : "integer"
-        }
-      }
-    }
-  }
-}            
-          
-
-
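A sketch of inspecting a single task manager, reading memory figures from the TaskManagerDetailsInfo schema above (assumed REST address; the task manager id is hypothetical and would normally be taken from the /taskmanagers listing):

    import requests

    BASE = "http://localhost:8081"                      # assumed JobManager REST address
    TM_ID = "abcdef0123456789abcdef0123456789"          # hypothetical task manager id

    tm = requests.get(f"{BASE}/taskmanagers/{TM_ID}").json()
    metrics = tm["metrics"]
    mem = tm["memoryConfiguration"]

    print("heap used/max        :", metrics["heapUsed"], "/", metrics["heapMax"])
    # Prefer the netty-shuffle counters; the plain memorySegments* fields are deprecated.
    print("shuffle segments free:", metrics["nettyShuffleMemorySegmentsAvailable"],
          "of", metrics["nettyShuffleMemorySegmentsTotal"])
    print("total Flink memory   :", mem["totalFlinkMemory"])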
- - - - - - - - - - - - - - - - - - - - - - - - - -
/taskmanagers/:taskmanagerid/logs
Verb: GET | Response code: 200 OK
Returns the list of log files on a TaskManager.
Path parameters
-
    -
  • taskmanagerid - 32-character hexadecimal string that identifies a task manager.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:LogListInfo",
-  "properties" : {
-    "logs" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:LogInfo",
-        "properties" : {
-          "name" : {
-            "type" : "string"
-          },
-          "size" : {
-            "type" : "integer"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
/taskmanagers/:taskmanagerid/metrics
Verb: GET | Response code: 200 OK
Provides access to task manager metrics.
Path parameters
-
    -
  • taskmanagerid - 32-character hexadecimal string that identifies a task manager.
  • -
-
Query parameters
-
    -
  • get (optional): Comma-separated list of string values to select specific metrics.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "any"
-}            
-          
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - -
/taskmanagers/:taskmanagerid/thread-dump
Verb: GET | Response code: 200 OK
Returns the thread dump of the requested TaskManager.
Path parameters
-
    -
  • taskmanagerid - 32-character hexadecimal string that identifies a task manager.
  • -
-
- -
-
-            
-{}            
-          
-
-
- -
-
-            
-{
-  "type" : "object",
-  "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:ThreadDumpInfo",
-  "properties" : {
-    "threadInfos" : {
-      "type" : "array",
-      "items" : {
-        "type" : "object",
-        "id" : "urn:jsonschema:org:apache:flink:runtime:rest:messages:taskmanager:ThreadDumpInfo:ThreadInfo",
-        "properties" : {
-          "stringifiedThreadInfo" : {
-            "type" : "string"
-          },
-          "threadName" : {
-            "type" : "string"
-          }
-        }
-      }
-    }
-  }
-}            
-          
-
-
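A sketch of fetching and printing a task manager thread dump, iterating over the ThreadDumpInfo entries shown above (assumed REST address, hypothetical task manager id):

    import requests

    BASE = "http://localhost:8081"                      # assumed JobManager REST address
    TM_ID = "abcdef0123456789abcdef0123456789"          # hypothetical task manager id

    dump = requests.get(f"{BASE}/taskmanagers/{TM_ID}/thread-dump").json()
    for thread in dump["threadInfos"]:
        print(thread["threadName"])
        print(thread["stringifiedThreadInfo"])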
diff --git a/docs/_includes/generated/restart_strategy_configuration.html b/docs/_includes/generated/restart_strategy_configuration.html deleted file mode 100644 index f690cbaa5b6ac..0000000000000 --- a/docs/_includes/generated/restart_strategy_configuration.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - -
Key | Default | Type | Description
restart-strategy
(none) | String | Defines the restart strategy to use in case of job failures.
Accepted values are:
  • `none`, `off`, `disable`: No restart strategy.
  • `fixeddelay`, `fixed-delay`: Fixed delay restart strategy. More details can be found here.
  • `failurerate`, `failure-rate`: Failure rate restart strategy. More details can be found here.
  • `exponentialdelay`, `exponential-delay`: Exponential delay restart strategy. More details can be found here.
If checkpointing is disabled, the default value is `none`. If checkpointing is enabled, the default value is `fixed-delay` with `Integer.MAX_VALUE` restart attempts and '`1 s`' delay.
diff --git a/docs/_includes/generated/security_auth_kerberos_section.html b/docs/_includes/generated/security_auth_kerberos_section.html deleted file mode 100644 index cf279171a54c9..0000000000000 --- a/docs/_includes/generated/security_auth_kerberos_section.html +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Key | Default | Type | Description
security.kerberos.login.contexts
(none) | String | A comma-separated list of login contexts to provide the Kerberos credentials to (for example, `Client,KafkaClient` to use the credentials for ZooKeeper authentication and for Kafka authentication).
security.kerberos.login.keytab
(none) | String | Absolute path to a Kerberos keytab file that contains the user credentials.
security.kerberos.login.principal
(none) | String | Kerberos principal name associated with the keytab.
security.kerberos.login.use-ticket-cache
true | Boolean | Indicates whether to read from your Kerberos ticket cache.
diff --git a/docs/_includes/generated/table_config_configuration.html b/docs/_includes/generated/table_config_configuration.html deleted file mode 100644 index d530df816f0c3..0000000000000 --- a/docs/_includes/generated/table_config_configuration.html +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Key | Default | Type | Description
table.dynamic-table-options.enabled

Batch Streaming
false | Boolean | Enable or disable the OPTIONS hint used to specify table options dynamically. If disabled, an exception is thrown if any OPTIONS hint is specified.
table.generated-code.max-length

Batch Streaming
64000 | Integer | Specifies a threshold where generated code will be split into sub-function calls. Java has a maximum method length of 64 KB. This setting allows for finer granularity if necessary.
table.local-time-zone

Batch Streaming
"default"StringThe local time zone defines current session time zone id. It is used when converting to/from <code>TIMESTAMP WITH LOCAL TIME ZONE</code>. Internally, timestamps with local time zone are always represented in the UTC time zone. However, when converting to data types that don't include a time zone (e.g. TIMESTAMP, TIME, or simply STRING), the session time zone is used during conversion. The input of option is either an abbreviation such as "PST", a full name such as "America/Los_Angeles", or a custom timezone id such as "GMT-8:00".
table.sql-dialect

Batch Streaming
"default"StringThe SQL dialect defines how to parse a SQL query. A different SQL dialect may support different SQL grammar. Currently supported dialects are: default and hive
diff --git a/docs/_includes/latex_commands.html b/docs/_includes/latex_commands.html deleted file mode 100644 index b2f8f08f2b5a4..0000000000000 --- a/docs/_includes/latex_commands.html +++ /dev/null @@ -1,38 +0,0 @@ - - - -$$ -\newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} -\newcommand{\x}{\mathbf{x}} -\newcommand{\y}{\mathbf{y}} -\newcommand{\wv}{\mathbf{w}} -\newcommand{\av}{\mathbf{\alpha}} -\newcommand{\bv}{\mathbf{b}} -\newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} -\newcommand{\zero}{\mathbf{0}} -\newcommand\rfrac[2]{^{#1}\!/_{#2}} -\newcommand{\norm}[1]{\left\lVert#1\right\rVert} -$$ \ No newline at end of file diff --git a/docs/_includes/note.html b/docs/_includes/note.html deleted file mode 100644 index a0ac41357bcbb..0000000000000 --- a/docs/_includes/note.html +++ /dev/null @@ -1,23 +0,0 @@ - - - diff --git a/docs/_includes/sidenav.html b/docs/_includes/sidenav.html deleted file mode 100644 index ed767f23a0f8c..0000000000000 --- a/docs/_includes/sidenav.html +++ /dev/null @@ -1,184 +0,0 @@ - - -{%- comment -%} -============================================================================== -Extract the active nav IDs. -============================================================================== -{%- endcomment -%} - -{%- assign active_nav_ids = site.array -%} -{%- assign parent_id = page.nav-parent_id -%} - -{%- for i in (1..10) -%} - {%- if parent_id -%} - {%- assign active_nav_ids = active_nav_ids | push: parent_id -%} - {%- assign current = (site.pages_by_language[page.language] | where: "nav-id" , parent_id | sort: "nav-pos") -%} - {%- if current.size > 0 -%} - {%- assign parent_id = current[0].nav-parent_id -%} - {%- else -%} - {%- break -%} - {%- endif -%} - {%- else -%} - {%- break -%} - {%- endif -%} -{%- endfor -%} - -{%- if page.language == "en" -%} - {%- capture baseurl_i18n -%}{{ site.baseurl }}{%- endcapture -%} -{%- else if page.language == "zh" -%} - {%- capture baseurl_i18n -%}{{ site.baseurl }}/{{ page.language }}{%- endcapture -%} -{%- endif -%} - -{%- comment -%} -============================================================================== -Build the nested list from nav-id and nav-parent_id relations. -============================================================================== -This builds a nested list from all pages. The fields used to determine the -structure are: - -- 'nav-id' => ID of this page. Other pages can use this ID as their - parent ID. -- 'nav-parent_id' => ID of the parent. This page will be listed under - the page with id 'nav-parent_id'. - -Level 0 is made up of all pages, which have nav-parent_id set to 'root'. - -The 'title' of the page is used as the default link text. You can -override this via 'nav-title'. The relative position per navigational -level is determined by 'nav-pos'. 
-{%- endcomment -%} - -{%- assign elementsPosStack = site.array -%} -{%- assign posStack = site.array -%} - -{%- assign elements = site.array -%} -{%- assign all_pages_by_nav_parent = (site.pages_by_language[page.language] | where_exp: "item", "item.nav-parent_id != nil" | group_by: "nav-parent_id") -%} -{%- assign children = (all_pages_by_nav_parent | where: "name" , "root") -%} -{%- assign children = (children[0].items | sort: "nav-pos") -%} -{%- if children.size > 0 -%} - {%- assign elements = elements | push: children -%} -{%- endif -%} - -{%- assign elementsPos = 0 -%} -{%- assign pos = 0 -%} - - - - {%- assign elementsPosStack = elementsPosStack | pop -%} - {%- assign posStack = posStack | pop -%} - {%- endif -%} - {%- else -%} - {%- assign this = elements[elementsPos][pos] -%} - - {%- if this.url == page.url -%} - {%- assign active = true -%} - {%- elsif this.nav-id and active_nav_ids contains this.nav-id -%} - {%- assign active = true -%} - {%- else -%} - {%- assign active = false -%} - {%- endif -%} - - {%- capture title -%}{%- if this.nav-title -%}{{ this.nav-title }}{%- else -%}{{ this.title }}{%- endif -%}{%- endcapture -%} - {%- capture target -%}"{{ site.baseurl }}{{ this.url }}"{%- if active %} class="active"{%- endif -%}{%- endcapture -%} - {%- capture overview_target -%}"{{ site.baseurl }}{{ this.url }}"{%- if this.url == page.url -%} class="active"{%- endif -%}{%- endcapture -%} - - {% if this.section-break %}
{% endif -%} - - {%- assign pos = pos | plus: 1 -%} - {%- if this.nav-id -%} - {%- assign children = (all_pages_by_nav_parent | where: "name" , this.nav-id) -%} - {%- if children.size > 0 -%} - {%- assign children = (children[0].items | sort: "nav-pos") -%} - {%- capture collapse_target -%}"#collapse-{{ i }}" data-toggle="collapse"{%- if active -%} class="active"{%- endif -%}{%- endcapture -%} - {%- capture expand -%}{%- unless active -%} {%- endunless -%}{%- endcapture %} -
  • {{ title }}{{ expand }}
    - - - -
    - -
    - -
    - {%- if page.is_default_language -%} - - - - - {%- else -%} - - - - {%- endif %} -
    diff --git a/docs/_includes/sql-connector-download-table.html b/docs/_includes/sql-connector-download-table.html deleted file mode 100644 index 981bca07b5c64..0000000000000 --- a/docs/_includes/sql-connector-download-table.html +++ /dev/null @@ -1,105 +0,0 @@ - - -

    In order to use the {{ include.connector.name }} {{ include.connector.category }} the following -dependencies are required for both projects using a build automation tool (such as Maven or SBT) -and SQL Client with SQL JAR bundles.

    - - -{% comment %} - The 'liquify' filter makes it possible to include liquid variables such as e.g. site.version. -{% endcomment %} - -{% if include.connector.versions == nil %} - - - - - - - - - - - - - {% if include.connector.built-in %} - - {% elsif site.is_stable %} - {% if include.connector.sql-url != nil %} - - {% else %} - - {% endif %} - {% else %} - - {% endif %} - - -
Maven dependency | SQL Client JAR
    - - <dependency>
    -   <groupId>org.apache.flink</groupId>
    -   <artifactId>{{ include.connector.maven | liquify }}</artifactId>
    -   <version>{{site.version}}</version>
    - </dependency> -
    -
Built-in | Download | There is no sql jar available yet. | Only available for stable releases.
    -{% else %} - - - - - - - - - - {% for version in include.connector.versions %} - - - - {% if include.connector.built-in %} - - {% elsif include.connector.no-sql-jar %} - {% elsif site.is_stable %} - {% if version.sql-url != nil %} - - {% else %} - - {% endif %} - {% else %} - - {% endif %} - - {% endfor %} - -
{{ include.connector.name }} version | Maven dependency | SQL Client JAR
    {{ version.version | liquify }} - - - <dependency>
    -   <groupId>org.apache.flink</groupId>
    -   <artifactId>{{ version.maven | liquify }}</artifactId>
    -   <version>{{site.version}}</version>
    - </dependency> - -
    - -
Built-in | Download | There is no sql jar available yet. | Only available for stable releases.
    -{% endif %} diff --git a/docs/_includes/sql-connector-download-table.zh.html b/docs/_includes/sql-connector-download-table.zh.html deleted file mode 100644 index 3ac2cd0bf52d2..0000000000000 --- a/docs/_includes/sql-connector-download-table.zh.html +++ /dev/null @@ -1,97 +0,0 @@ - - -

    为使用 {{ include.connector.name }} {{ include.connector.category }},以下依赖在使用自动化构建工具(如 Maven - 或 SBT)构建的工程和带有 SQL JAR 的 SQL 客户端时都必须提供。

    - -{% comment %} -The 'liquify' filter makes it possible to include liquid variables such as e.g. site.version. -{% endcomment %} - -{% if include.connector.versions == nil %} - - - - - - - - - - - {% if include.connector.built-in %} - - {% elsif site.is_stable %} - {% if include.connector.sql-url != nil %} - - {% else %} - - {% endif %} - {% else %} - - {% endif %} - - -
    Maven 依赖SQL 客户端 JAR
    - - <dependency>
    -   <groupId>org.apache.flink</groupId>
    -   <artifactId>{{ include.connector.maven | liquify }}</artifactId>
    -   <version>{{site.version}}</version>
    - </dependency> -
    -
    Built-in下载目前无 SQL JAR 可用只在稳定版本可用
    -{% else %} - - - - - - - - - - {% for version in include.connector.versions %} - - - - {% if include.connector.built-in %} - - {% elsif include.connector.no-sql-jar %} - {% elsif site.is_stable %} - {% if version.sql-url != nil %} - - {% else %} - - {% endif %} - {% else %} - - {% endif %} - - {% endfor %} - -
    {{ include.connector.name }} 版本Maven 依赖SQL 客户端 JAR
    {{ version.version | liquify }} - - <dependency>
    -   <groupId>org.apache.flink</groupId>
    -   <artifactId>{{ include.connector.maven | liquify }}</artifactId>
    -   <version>{{site.version}}</version>
    - </dependency> -
    -
    内置下载目前无 SQL JAR 可用只在稳定版本可用
    -{% endif %} diff --git a/docs/_includes/warning.html b/docs/_includes/warning.html deleted file mode 100644 index 7ac591c85b7e7..0000000000000 --- a/docs/_includes/warning.html +++ /dev/null @@ -1,23 +0,0 @@ - - -
    - Warning: - {{ include.content }} -
    diff --git a/docs/_layouts/404_base.html b/docs/_layouts/404_base.html deleted file mode 100644 index 9c6d5f878ab7f..0000000000000 --- a/docs/_layouts/404_base.html +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: base ---- - -{% capture index_link %}{% if page.language == "en" %}index.md{% else if page.language == "zh" %}index.zh.md{% endif %}{% endcapture %} - - - - -

    {{ page.title }}

    - -{{ content }} diff --git a/docs/_layouts/base.html b/docs/_layouts/base.html deleted file mode 100644 index c21cd8d9dbefd..0000000000000 --- a/docs/_layouts/base.html +++ /dev/null @@ -1,121 +0,0 @@ - - - - - - - - - Apache Flink {{ site.version_title }} Documentation: {{ page.title }} - - - - - - - - - - - {% if page.mathjax %} - - - {% endif %} - - - - - - {% if site.show_outdated_warning %} -
    - {% if page.language == "en" %} - This documentation is for an out-of-date version of Apache Flink. We recommend you use the latest stable version. - {% else if page.language == "zh" %} - 本文档是 Apache Flink 的旧版本。建议访问 最新的稳定版本。 - {% endif %} -
    - {% endif %} - - -
    - {% comment %} - This is the base for all content. The content from the layouts found in - the _layouts directory goes here. - {% endcomment %} -
    -
    - {% include sidenav.html %} -
    -
    - {%- if page.mathjax -%} - {%- include latex_commands.html -%} - {%- endif %} - - {{ content }} -
    -
    -
    - - - - - - - - - - - - - - - - {% comment %} - - {% endcomment %} - - diff --git a/docs/_layouts/plain.html b/docs/_layouts/plain.html deleted file mode 100644 index bcde9f2dda6fc..0000000000000 --- a/docs/_layouts/plain.html +++ /dev/null @@ -1,78 +0,0 @@ ---- -layout: base ---- - - -{%- assign active_pages = site.array -%} -{%- assign active = page -%} - -{%- for i in (1..10) -%} - {%- assign active_pages = active_pages | push: active -%} - {%- if active.nav-parent_id -%} - {%- assign next = site.pages_by_language[page.language] | where: "nav-id" , active.nav-parent_id -%} - {%- if next.size > 0 -%} - {%- assign active = next[0] -%} - {%- else -%} - {%- break -%} - {%- endif -%} - {%- else -%} - {%- break -%} - {%- endif -%} -{%- endfor -%} - -{% assign active_pages = active_pages | reverse %} - - - -

    {{ page.title }}{% if page.is_beta %} Beta{% endif %}

    -{% if site.show_outdated_warning %} - -{%- endif %} - -{{ content }} - - - - diff --git a/docs/_layouts/redirect.html b/docs/_layouts/redirect.html deleted file mode 100644 index e28cb0d307d3f..0000000000000 --- a/docs/_layouts/redirect.html +++ /dev/null @@ -1,39 +0,0 @@ ---- -layout: base ---- - - -{% if page.language == "en" %} - - - -

    Page '{{ page.title }}' Has Moved

    - - The page {{ page.title }} has been moved. Redirecting to {% link {{ page.redirect }}.md %} in 1 second. - -{% else if page.language == "zh" %} - - - -

    '{{ page.title }}' 页面已被移动

    - - {{ page.title }} 页面已经被移动了。将在 1 秒后重定向到 {% link {{ page.redirect }}.zh.md %} 。 - -{% endif %} diff --git a/docs/_plugins/build_time.rb b/docs/_plugins/build_time.rb deleted file mode 100644 index 61aa5e83bf682..0000000000000 --- a/docs/_plugins/build_time.rb +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Jekyll - class BuildTimeTag < Liquid::Tag - - def initialize(tag_name, input, tokens) - super - end - - def render(context) - Time.now.strftime("%D, %r %Z") - end - end -end - -Liquid::Template.register_tag('build_time', Jekyll::BuildTimeTag) \ No newline at end of file diff --git a/docs/_plugins/gh_link.rb b/docs/_plugins/gh_link.rb deleted file mode 100644 index bdaa2d44ac3b2..0000000000000 --- a/docs/_plugins/gh_link.rb +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# --------------------------------------------------------- -# Expands a github link shortcut into a proper markdown link -# --------------------------------------------------------- - -module Jekyll - class GitHubLinkTag < Liquid::Tag - - def initialize(tag_name, input, tokens) - super - @input = input - end - - def render(context) - input = @input.sub(/".*"/, "").split - name = @input.match(/".*"/).to_s.gsub(/"/, "")#@input.split.drop(2).join(" ") - config = context.registers[:site].config - - path = input[0] - file = path.split('/').last - - page_gh_tag = context["page"]["gh_link_tag"] - # tag precendence: - # 1. input[1], - # 2. 'gh_link_tag' of page frontmatter - # 3. "master" (default) - gh_tag = input[1].nil? ? (page_gh_tag.nil? ? "master" : page_gh_tag) : input[1] - name = name.to_s == '' ? file : name - #refname = input[2].nil? ? 
file : input[2] - - "[#{name}](#{config["github_url"]}/blob/#{gh_tag}/#{path})" - end - end -end - -Liquid::Template.register_tag('gh_link', Jekyll::GitHubLinkTag) diff --git a/docs/_plugins/include_without_header.rb b/docs/_plugins/include_without_header.rb deleted file mode 100644 index 8a7792e3f5557..0000000000000 --- a/docs/_plugins/include_without_header.rb +++ /dev/null @@ -1,40 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Jekyll - module Tags - - class IncludeWithoutHeaderTag < Liquid::Tag - - def initialize(tag_name, text, tokens) - super - @file = text.strip - end - - def render(context) - source = File.expand_path(context.registers[:site].config['source']) - path = File.join(source, @file) - content = File.read(path, :encoding => 'UTF-8') - content = content.split(//, 2)[1] - partial = Liquid::Template.parse(content) - partial.render!(context) - end - end - end -end - -Liquid::Template.register_tag("include_without_header", Jekyll::Tags::IncludeWithoutHeaderTag) diff --git a/docs/_plugins/info.rb b/docs/_plugins/info.rb deleted file mode 100644 index ef3c210b92eaf..0000000000000 --- a/docs/_plugins/info.rb +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Jekyll - class InfoTag < Liquid::Tag - - def initialize(tag_name, text, tokens) - super - @text = text - end - - def render(context) - if @text.to_s == '' - @text = "Info" - end - - @text = @text.strip! || @text if !@text.nil? - "#{@text}" - end - end -end - -Liquid::Template.register_tag('info', Jekyll::InfoTag) diff --git a/docs/_plugins/liquify.rb b/docs/_plugins/liquify.rb deleted file mode 100644 index 57528f6873e05..0000000000000 --- a/docs/_plugins/liquify.rb +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# *A Jekyll filter that can parse Liquid in Liquid variables -# -# Usage: -# e.g. Welcome to {{ page.title | liquify }}! - -module Jekyll - module LiquifyFilter - def liquify(input) - Liquid::Template.parse(input).render(@context) - end - end -end - -Liquid::Template.register_filter(Jekyll::LiquifyFilter) diff --git a/docs/_plugins/panel.rb b/docs/_plugins/panel.rb deleted file mode 100644 index 1dfef6346ac0c..0000000000000 --- a/docs/_plugins/panel.rb +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Jekyll - class PanelTag < Liquid::Tag - - def initialize(tag_name, text, tokens) - super - @text = text - end - - def render(context) - @text = @text.strip! || @text if !@text.nil? - "
    #{@text}
    " - end - end -end - -Liquid::Template.register_tag('panel', Jekyll::PanelTag) diff --git a/docs/_plugins/removeDuplicateLicenseHeaders.rb b/docs/_plugins/removeDuplicateLicenseHeaders.rb deleted file mode 100644 index 2ac653fb40bae..0000000000000 --- a/docs/_plugins/removeDuplicateLicenseHeaders.rb +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# --------------------------------------------------------- -# Ensures that the documentation contains the Apache License -# headers once, not repeatedly for each include. -# --------------------------------------------------------- - -module Jekyll - - module LicenseRemover - - AL2 = "\n" - - def writeFile(dest, content) - path = self.destination(dest) - FileUtils.mkdir_p(File.dirname(path)) - File.open(path, 'w') do |f| - # remove all Apache Licenses - withoutLicense = content.gsub(//,'') - # put single Apache License on top - singleLicense = AL2+withoutLicense - # write file out - f.write(singleLicense) - end - end - - end - - class Post - include LicenseRemover - def write(dest) - self.writeFile(dest, self.output) - end - end - - class Page - include LicenseRemover - def write(dest) - self.writeFile(dest, self.output) - end - end - -end diff --git a/docs/_plugins/top.rb b/docs/_plugins/top.rb deleted file mode 100644 index b79781c8c95df..0000000000000 --- a/docs/_plugins/top.rb +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -module Jekyll - class TopTag < Liquid::Tag - - def initialize(tag_name, text, tokens) - super - end - - def render(context) - " Back to top" - end - end -end - -Liquid::Template.register_tag('top', Jekyll::TopTag) diff --git a/docs/_plugins/warn.rb b/docs/_plugins/warn.rb deleted file mode 100644 index c8bd3af19ce8f..0000000000000 --- a/docs/_plugins/warn.rb +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# --------------------------------------------------------- -# Expands a github link shortcut into a proper markdown link -# --------------------------------------------------------- - -module Jekyll - class WarnTag < Liquid::Tag - - def initialize(tag_name, text, tokens) - super - @text = text - end - - def render(context) - if @text.to_s == '' - @text = "Warning" - end - - @text = @text.strip! || @text if !@text.nil? - "#{@text}" - end - end -end - -Liquid::Template.register_tag('warn', Jekyll::WarnTag) diff --git a/docs/annotations.xml b/docs/annotations.xml deleted file mode 100644 index a857770aa6047..0000000000000 --- a/docs/annotations.xml +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/assets/_custom.scss b/docs/assets/_custom.scss new file mode 100644 index 0000000000000..33ccbb4060a8b --- /dev/null +++ b/docs/assets/_custom.scss @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +@import "github"; + +.link { + padding-bottom: 5px; +} + +.appetizer { + color: #FBB142; +} + +.maindish { + color: #7E4F89; +} + +.dessert { + color: #E6526F; +} + +.book-menu nav { + background: #f8f8f8; +} + +.book-page { + padding: 2rem 2rem; +} + +.book-search input { + background: white; +} + +.markdown a { + text-decoration: none; + color: #05b; +} + +.markdown a:visited { + text-decoration: none; + color: #05b; +} + +.markdown { + line-height: 1.43; + + h1, + h2, + h3, + h4, + h5, + h6 { + font-weight: 500; + padding-top: 0; + margin-top: 1em; + } +} + +body { + letter-spacing: normal; + -webkit-font-smoothing: auto; +} + +aside nav ul { + li { + margin: 0.5em 0; + } +} + +.book-search { + border: 2px solid #ebebeb; +} + +@media screen and (max-width: 768px) { + .toc { + display: none; + } +} + +aside.book-menu nav { + a:hover { + font-weight: bold; + opacity: 1.0; + } + + a.active { + font-weight: bold; + color: var(--body-font-color); + } +} + +aside.book-menu > li { + padding: 10px 5px 5px 5px; +} + +aside.book-toc { + h3 { + margin-top: 0; + padding-top: 0; + font-size: 1.2em; + } +} + +html { + line-height: 1.43; +} + +h1, h2, h3, h4, h5, h6 { + line-height: 1.1; +} + +h1, h2, h3 { + margin-top: 20px; + margin-bottom: 10px; +} + +h2, h3, h4 { + padding-top: 1em; +} + +h1 { + font-size: 36px; +} + +h2 { + font-size: 30px; + border-bottom: 1px solid #e5e5e5; +} + +h3 { + font-size: 24px; +} + +h4 { + font-size: 18px; +} + +.markdown code { + background: white; + padding: 0; + border-radius: 0; +} + +pre.chroma code { + line-height: 1.43; +} + +.book-languages { + border: 2px solid black; +} + +.menu-break { + opacity: 0.1; +} + +#book-search-results { + padding: 2px; + background-color: white; +} + +.label { + display: inline; + padding: .2em .6em .3em; + font-size: 75%; + font-weight: 700; + line-height: 1; + color: #fff; + text-align: center; + white-space: nowrap; + vertical-align: baseline; + border-radius: .25em; + background-color: #337ab7; +} + +.expand-toc { + position: fixed; + top: 2em; + right: 5em; + display: none; +} + +.container { + max-width: 90rem; +} + +#book-search-input:focus { + outline: none; +} + +.rest-api h5 { + margin-top: .5em; + margin-bottom: .5em; + font-size: 1em; +} + +.rest-api tbody { + display: table; + width: 100%; + background: white; +} + +.rest-api td { + background: white; +} + +.rest-api .book-expand label { + padding: 0rem 0rem; + background: white; +} + +.rest-api .book-expand { + background: white; +} + +.rest-api .book-expand .book-expand-head { + background: white; +} + +.configuration td { + background: white; +} + +.markdown table tr:nth-child(2n) { + background: white; +} \ No newline at end of file diff --git a/docs/assets/_fonts.scss b/docs/assets/_fonts.scss new file mode 100644 index 0000000000000..dc57189cf04b6 --- /dev/null +++ b/docs/assets/_fonts.scss @@ -0,0 +1,25 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +body { + font-family: "Helvetica Neue",Helvetica,Arial,sans-serif; + font-size: 14px; +} + +code { + font-family: "Menlo", "Lucida Console", monospace; +} \ No newline at end of file diff --git a/docs/assets/github.css b/docs/assets/github.css new file mode 100644 index 0000000000000..25600e34e7103 --- /dev/null +++ b/docs/assets/github.css @@ -0,0 +1,87 @@ +/** + * Syntax highlighting generated via + * hugo gen chromastyles --style=github > chroma.css + */ + +/* Background */ .chroma { background-color: #ffffff } +/* Other */ .chroma .x { } +/* Error */ .chroma .err { color: #a61717; background-color: #e3d2d2 } +/* LineTableTD */ .chroma .lntd { vertical-align: top; padding: 0; margin: 0; border: 0; } +/* LineTable */ .chroma .lntable { border-spacing: 0; padding: 0; margin: 0; border: 0; width: auto; overflow: auto; display: block; } +/* LineHighlight */ .chroma .hl { display: block; width: 100%;background-color: #ffffcc } +/* LineNumbersTable */ .chroma .lnt { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f } +/* LineNumbers */ .chroma .ln { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f } +/* Keyword */ .chroma .k { color: #000000; font-weight: bold } +/* KeywordConstant */ .chroma .kc { color: #000000; font-weight: bold } +/* KeywordDeclaration */ .chroma .kd { color: #000000; font-weight: bold } +/* KeywordNamespace */ .chroma .kn { color: #000000; font-weight: bold } +/* KeywordPseudo */ .chroma .kp { color: #000000; font-weight: bold } +/* KeywordReserved */ .chroma .kr { color: #000000; font-weight: bold } +/* KeywordType */ .chroma .kt { color: #445588; font-weight: bold } +/* Name */ .chroma .n { } +/* NameAttribute */ .chroma .na { color: #008080 } +/* NameBuiltin */ .chroma .nb { color: #0086b3 } +/* NameBuiltinPseudo */ .chroma .bp { color: #999999 } +/* NameClass */ .chroma .nc { color: #445588; font-weight: bold } +/* NameConstant */ .chroma .no { color: #008080 } +/* NameDecorator */ .chroma .nd { color: #3c5d5d; font-weight: bold } +/* NameEntity */ .chroma .ni { color: #800080 } +/* NameException */ .chroma .ne { color: #990000; font-weight: bold } +/* NameFunction */ .chroma .nf { color: #990000; font-weight: bold } +/* NameFunctionMagic */ .chroma .fm { } +/* NameLabel */ .chroma .nl { color: #990000; font-weight: bold } +/* NameNamespace */ .chroma .nn { color: #555555 } +/* NameOther */ .chroma .nx { } +/* NameProperty */ .chroma .py { } +/* NameTag */ .chroma .nt { color: #000080 } +/* NameVariable */ .chroma .nv { color: #008080 } +/* NameVariableClass */ .chroma .vc { color: #008080 } +/* NameVariableGlobal */ .chroma .vg { color: #008080 } +/* NameVariableInstance */ .chroma .vi { color: #008080 } +/* NameVariableMagic */ .chroma .vm { } +/* Literal */ .chroma .l { } +/* LiteralDate */ .chroma .ld { } +/* LiteralString */ .chroma .s { color: #dd1144 } +/* LiteralStringAffix */ .chroma .sa { color: #dd1144 } +/* LiteralStringBacktick */ .chroma .sb { color: #dd1144 } +/* LiteralStringChar */ .chroma .sc { color: #dd1144 } +/* LiteralStringDelimiter */ .chroma .dl { color: #dd1144 } +/* LiteralStringDoc */ .chroma .sd 
{ color: #dd1144 } +/* LiteralStringDouble */ .chroma .s2 { color: #dd1144 } +/* LiteralStringEscape */ .chroma .se { color: #dd1144 } +/* LiteralStringHeredoc */ .chroma .sh { color: #dd1144 } +/* LiteralStringInterpol */ .chroma .si { color: #dd1144 } +/* LiteralStringOther */ .chroma .sx { color: #dd1144 } +/* LiteralStringRegex */ .chroma .sr { color: #009926 } +/* LiteralStringSingle */ .chroma .s1 { color: #dd1144 } +/* LiteralStringSymbol */ .chroma .ss { color: #990073 } +/* LiteralNumber */ .chroma .m { color: #009999 } +/* LiteralNumberBin */ .chroma .mb { color: #009999 } +/* LiteralNumberFloat */ .chroma .mf { color: #009999 } +/* LiteralNumberHex */ .chroma .mh { color: #009999 } +/* LiteralNumberInteger */ .chroma .mi { color: #009999 } +/* LiteralNumberIntegerLong */ .chroma .il { color: #009999 } +/* LiteralNumberOct */ .chroma .mo { color: #009999 } +/* Operator */ .chroma .o { color: #000000; font-weight: bold } +/* OperatorWord */ .chroma .ow { color: #000000; font-weight: bold } +/* Punctuation */ .chroma .p { } +/* Comment */ .chroma .c { color: #999988; font-style: italic } +/* CommentHashbang */ .chroma .ch { color: #999988; font-style: italic } +/* CommentMultiline */ .chroma .cm { color: #999988; font-style: italic } +/* CommentSingle */ .chroma .c1 { color: #999988; font-style: italic } +/* CommentSpecial */ .chroma .cs { color: #999999; font-weight: bold; font-style: italic } +/* CommentPreproc */ .chroma .cp { color: #999999; font-weight: bold; font-style: italic } +/* CommentPreprocFile */ .chroma .cpf { color: #999999; font-weight: bold; font-style: italic } +/* Generic */ .chroma .g { } +/* GenericDeleted */ .chroma .gd { color: #000000; background-color: #ffdddd } +/* GenericEmph */ .chroma .ge { color: #000000; font-style: italic } +/* GenericError */ .chroma .gr { color: #aa0000 } +/* GenericHeading */ .chroma .gh { color: #999999 } +/* GenericInserted */ .chroma .gi { color: #000000; background-color: #ddffdd } +/* GenericOutput */ .chroma .go { color: #888888 } +/* GenericPrompt */ .chroma .gp { color: #555555 } +/* GenericStrong */ .chroma .gs { font-weight: bold } +/* GenericSubheading */ .chroma .gu { color: #aaaaaa } +/* GenericTraceback */ .chroma .gt { color: #aa0000 } +/* GenericUnderline */ .chroma .gl { text-decoration: underline } +/* TextWhitespace */ .chroma .w { color: #bbbbbb } diff --git a/docs/assets/search-data.js b/docs/assets/search-data.js new file mode 100644 index 0000000000000..620fc380cf2b7 --- /dev/null +++ b/docs/assets/search-data.js @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +'use strict'; + +(function () { + const indexCfg = {{ with i18n "bookSearchConfig" }} + {{ . 
}}; + {{ else }} + {}; + {{ end }} + + indexCfg.doc = { + id: 'id', + field: ['title', 'content'], + store: ['title', 'href', 'section'], + }; + + const index = FlexSearch.create('balance', indexCfg); + window.bookSearchIndex = index; + + {{- $pages := where .Site.Pages "Kind" "in" (slice "page" "section") -}} + {{- $pages = where $pages "Params.booksearchexclude" "!=" true -}} + {{- $pages = where $pages "Content" "not in" (slice nil "") -}} + + {{ range $index, $page := $pages }} + index.add({ + 'id': {{ $index }}, + 'href': '{{ $page.RelPermalink }}', + 'title': {{ (partial "docs/simple-title" $page) | jsonify }}, + 'section': {{ (partial "docs/simple-title" $page.Parent) | jsonify }}, + 'content': {{ $page.Plain | jsonify }} + }); + {{- end -}} +})(); diff --git a/docs/build_docs.sh b/docs/build_docs.sh index b2ceff4e66016..36eba0cc2de51 100755 --- a/docs/build_docs.sh +++ b/docs/build_docs.sh @@ -17,94 +17,12 @@ # limitations under the License. ################################################################################ -RUBY=${RUBY:-ruby} -GEM=${GEM:-gem} -CACHE_DIR=${CACHE_DIR:-".rubydeps"} - -set -e -cd "$(dirname ${BASH_SOURCE[0]})" - -DIR="`pwd`" - -# We need at least bundler to proceed -if [ "`command -v bundle`" == "" ]; then - RUBYGEM_BINDIR="" - - # Adjust the PATH to discover locally installed ruby gem binaries - export PATH="$(${RUBY} -e 'puts Gem.user_dir')/bin:$PATH" - - if [ "`command -v bundle`" == "" ]; then - echo "WARN: Could not find bundle." - echo "Attempting to install locally. If this doesn't work, please install with 'gem install bundler'." - - # install bundler locally - ${GEM} install --user-install --no-format-executable bundler - fi +if ! command -v hugo &> /dev/null +then + echo "Hugo must be installed to run the docs locally" + echo "Please see docs/README.md for more details" + exit 1 fi +git submodule update --init --recursive -# Install Ruby dependencies locally -bundle install --path ${CACHE_DIR} - -DOCS_SRC=${DIR} -DOCS_DST=${DOCS_SRC}/content - -# default jekyll command is to just build site -JEKYLL_CMD="build" - -JEKYLL_CONFIG="" - -# config options that only apply to the barebone "build" without any arguments. -JEKYLL_BUILD_CONFIG=${JEKYLL_BUILD_CONFIG:-} - -DOC_LANGUAGES="en zh" - -# if -p flag is provided, serve site on localhost -# -i is like -p, but incremental (only rebuilds the modified file) -# -e builds only english documentation -# -z builds only chinese documentation -while getopts "piez" opt; do - case $opt in - p) - JEKYLL_CMD="serve --baseurl= --watch" - ;; - i) - [[ `${RUBY} -v` =~ 'ruby 1' ]] && echo "Error: building the docs with the incremental option requires at least ruby 2.0" && exit 1 - JEKYLL_CMD="serve --baseurl= --watch --incremental" - ;; - e) - JEKYLL_CONFIG="--config _config.yml,_config_dev_en.yml" - ;; - z) - JEKYLL_CONFIG="--config _config.yml,_config_dev_zh.yml" - ;; - *) echo "usage: $0 [-e|-z] [-i|-p]" >&2 - exit 1 ;; - esac -done - -# use 'bundle exec' to insert the local Ruby dependencies - -if [ "${JEKYLL_CMD}" = "build" ] && [ -z "${JEKYLL_CONFIG}" ]; then - # run parallel builds for all languages if not serving or creating a single language only - - # run processes and store pids - echo "Spawning parallel builds for languages: ${DOC_LANGUAGES}..." - pids="" - for lang in ${DOC_LANGUAGES}; do - bundle exec jekyll ${JEKYLL_CMD} ${JEKYLL_BUILD_CONFIG} --config _config.yml,_config_dev_${lang}.yml --source "${DOCS_SRC}" --destination "${DOCS_DST}_${lang}" & - pid=$! 
- pids="${pids} ${pid}" - done - - # wait for all pids (since jekyll returns 0 even in case of failures, we do not parse exit codes) - wait ${pids} - rm -rf "${DOCS_DST}" - mkdir -p "${DOCS_DST}" - for lang in ${DOC_LANGUAGES}; do - cp -aln "${DOCS_DST}_${lang}/." "${DOCS_DST}" - rm -rf "${DOCS_DST}_${lang}" - done - exit 0 -else - bundle exec jekyll ${JEKYLL_CMD} ${JEKYLL_CONFIG} --source "${DOCS_SRC}" --destination "${DOCS_DST}" -fi +hugo -b "" serve diff --git a/docs/check_links.sh b/docs/check_links.sh deleted file mode 100755 index dbe3766df96ef..0000000000000 --- a/docs/check_links.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash -################################################################################ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -DOCS_CHECK_DIR="`dirname \"$0\"`" # relative -DOCS_CHECK_DIR="`( cd \"$DOCS_CHECK_DIR\" && pwd -P)`" # absolutized and normalized -if [ -z "$DOCS_CHECK_DIR" ] ; then - # error; for some reason, the path is not accessible - # to the script (e.g. permissions re-evaled after suid) - exit 1 # fail -fi - -echo "Check docs directory: $DOCS_CHECK_DIR" - -target=${1:-"http://localhost:4000"} - -# Crawl the docs, ignoring robots.txt, storing nothing locally -wget --spider -r -nd -nv -e robots=off -p -o $DOCS_CHECK_DIR/spider.log "$target" - -# Abort for anything other than 0 and 4 ("Network failure") -status=$? - -if [ $status -ne 0 ] && [ $status -ne 4 ]; then - exit $status -fi - -# Fail the build if any broken links are found -no_broken_links_str_count=$(grep 'Found no broken links' $DOCS_CHECK_DIR/spider.log | wc -l) -if [ $no_broken_links_str_count -ne 1 ]; then - grep -B 1 "Remote file does not exist -- broken link!!!" $DOCS_CHECK_DIR/spider.log - echo "---------------------------------------------------------------------------" - echo "Check the spider.log file for errors!" - exit 1 -fi - -echo 'All links in docs are valid!' -exit 0 diff --git a/docs/concepts/flink-architecture.md b/docs/concepts/flink-architecture.md deleted file mode 100644 index eb6bf00d9cfb1..0000000000000 --- a/docs/concepts/flink-architecture.md +++ /dev/null @@ -1,252 +0,0 @@ ---- -title: Flink Architecture -nav-id: flink-architecture -nav-pos: 4 -nav-title: Flink Architecture -nav-parent_id: concepts ---- - - -Flink is a distributed system and requires effective allocation and management -of compute resources in order to execute streaming applications. 
It integrates -with all common cluster resource managers such as [Hadoop -YARN](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html), -[Apache Mesos](https://mesos.apache.org/) and -[Kubernetes](https://kubernetes.io/), but can also be set up to run as a -standalone cluster or even as a library. - -This section contains an overview of Flink’s architecture and describes how its -main components interact to execute applications and recover from failures. - -* This will be replaced by the TOC -{:toc} - -## Anatomy of a Flink Cluster - -The Flink runtime consists of two types of processes: a _JobManager_ and one or more _TaskManagers_. - -The processes involved in executing a Flink dataflow - -The *Client* is not part of the runtime and program execution, but is used to -prepare and send a dataflow to the JobManager. After that, the client can -disconnect (_detached mode_), or stay connected to receive progress reports -(_attached mode_). The client runs either as part of the Java/Scala program -that triggers the execution, or in the command line process `./bin/flink run -...`. - -The JobManager and TaskManagers can be started in various ways: directly on -the machines as a [standalone cluster]({% link -deployment/resource-providers/standalone/index.md %}), in containers, or managed by resource -frameworks like [YARN]({% link deployment/resource-providers/yarn.md -%}) or [Mesos]({% link deployment/resource-providers/mesos.md %}). -TaskManagers connect to JobManagers, announcing themselves as available, and -are assigned work. - -### JobManager - -The _JobManager_ has a number of responsibilities related to coordinating the distributed execution of Flink Applications: -it decides when to schedule the next task (or set of tasks), reacts to finished -tasks or execution failures, coordinates checkpoints, and coordinates recovery on -failures, among others. This process consists of three different components: - - * **ResourceManager** - - The _ResourceManager_ is responsible for resource de-/allocation and - provisioning in a Flink cluster — it manages **task slots**, which are the - unit of resource scheduling in a Flink cluster (see [TaskManagers](#taskmanagers)). - Flink implements multiple ResourceManagers for different environments and - resource providers such as YARN, Mesos, Kubernetes and standalone - deployments. In a standalone setup, the ResourceManager can only distribute - the slots of available TaskManagers and cannot start new TaskManagers on - its own. - - * **Dispatcher** - - The _Dispatcher_ provides a REST interface to submit Flink applications for - execution and starts a new JobMaster for each submitted job. It - also runs the Flink WebUI to provide information about job executions. - - * **JobMaster** - - A _JobMaster_ is responsible for managing the execution of a single - [JobGraph]({% link concepts/glossary.md %}#logical-graph). - Multiple jobs can run simultaneously in a Flink cluster, each having its - own JobMaster. - -There is always at least one JobManager. A high-availability setup might have -multiple JobManagers, one of which is always the *leader*, and the others are -*standby* (see [High Availability (HA)]({% link deployment/ha/index.md %})). - -### TaskManagers - -The *TaskManagers* (also called *workers*) execute the tasks of a dataflow, and buffer and exchange the data -streams. - -There must always be at least one TaskManager. The smallest unit of resource scheduling in a TaskManager is a task _slot_. 
The number of task slots in a -TaskManager indicates the number of concurrent processing tasks. Note that -multiple operators may execute in a task slot (see [Tasks and Operator -Chains](#tasks-and-operator-chains)). - -{% top %} - -## Tasks and Operator Chains - -For distributed execution, Flink *chains* operator subtasks together into -*tasks*. Each task is executed by one thread. Chaining operators together into -tasks is a useful optimization: it reduces the overhead of thread-to-thread -handover and buffering, and increases overall throughput while decreasing -latency. The chaining behavior can be configured; see the [chaining docs]({% -link dev/stream/operators/index.md %}#task-chaining-and-resource-groups) for details. - -The sample dataflow in the figure below is executed with five subtasks, and -hence with five parallel threads. - -Operator chaining into Tasks - -{% top %} - -## Task Slots and Resources - -Each worker (TaskManager) is a *JVM process*, and may execute one or more -subtasks in separate threads. To control how many tasks a TaskManager accepts, it -has so called **task slots** (at least one). - -Each *task slot* represents a fixed subset of resources of the TaskManager. A -TaskManager with three slots, for example, will dedicate 1/3 of its managed -memory to each slot. Slotting the resources means that a subtask will not -compete with subtasks from other jobs for managed memory, but instead has a -certain amount of reserved managed memory. Note that no CPU isolation happens -here; currently slots only separate the managed memory of tasks. - -By adjusting the number of task slots, users can define how subtasks are -isolated from each other. Having one slot per TaskManager means that each task -group runs in a separate JVM (which can be started in a separate container, for -example). Having multiple slots means more subtasks share the same JVM. Tasks -in the same JVM share TCP connections (via multiplexing) and heartbeat -messages. They may also share data sets and data structures, thus reducing the -per-task overhead. - -A TaskManager with Task Slots and Tasks - -By default, Flink allows subtasks to share slots even if they are subtasks of -different tasks, so long as they are from the same job. The result is that one -slot may hold an entire pipeline of the job. Allowing this *slot sharing* has -two main benefits: - - - A Flink cluster needs exactly as many task slots as the highest parallelism - used in the job. No need to calculate how many tasks (with varying - parallelism) a program contains in total. - - - It is easier to get better resource utilization. Without slot sharing, the - non-intensive *source/map()* subtasks would block as many resources as the - resource intensive *window* subtasks. With slot sharing, increasing the - base parallelism in our example from two to six yields full utilization of - the slotted resources, while making sure that the heavy subtasks are fairly - distributed among the TaskManagers. - -TaskManagers with shared Task Slots - -## Flink Application Execution - -A _Flink Application_ is any user program that spawns one or multiple Flink -jobs from its ``main()`` method. The execution of these jobs can happen in a -local JVM (``LocalEnvironment``) or on a remote setup of clusters with multiple -machines (``RemoteEnvironment``). For each program, the -[``ExecutionEnvironment``]({{ site.javadocs_baseurl }}/api/java/) provides methods to -control the job execution (e.g. 
setting the parallelism) and to interact with -the outside world (see [Anatomy of a Flink Program]({% -link dev/datastream_api.md %}#anatomy-of-a-flink-program)). - -The jobs of a Flink Application can either be submitted to a long-running -[Flink Session Cluster]({% -link concepts/glossary.md %}#flink-session-cluster), a dedicated [Flink Job -Cluster]({% link concepts/glossary.md %}#flink-job-cluster), or a -[Flink Application Cluster]({% -link concepts/glossary.md %}#flink-application-cluster). The difference between -these options is mainly related to the cluster’s lifecycle and to resource -isolation guarantees. - -### Flink Session Cluster - -* **Cluster Lifecycle**: in a Flink Session Cluster, the client connects to a - pre-existing, long-running cluster that can accept multiple job submissions. - Even after all jobs are finished, the cluster (and the JobManager) will - keep running until the session is manually stopped. The lifetime of a Flink - Session Cluster is therefore not bound to the lifetime of any Flink Job. - -* **Resource Isolation**: TaskManager slots are allocated by the - ResourceManager on job submission and released once the job is finished. - Because all jobs are sharing the same cluster, there is some competition for - cluster resources — like network bandwidth in the submit-job phase. One - limitation of this shared setup is that if one TaskManager crashes, then all - jobs that have tasks running on this TaskManager will fail; in a similar way, if - some fatal error occurs on the JobManager, it will affect all jobs running - in the cluster. - -* **Other considerations**: having a pre-existing cluster saves a considerable - amount of time applying for resources and starting TaskManagers. This is - important in scenarios where the execution time of jobs is very short and a - high startup time would negatively impact the end-to-end user experience — as - is the case with interactive analysis of short queries, where it is desirable - that jobs can quickly perform computations using existing resources. - -
    Note: Formerly, a Flink Session Cluster was also known as a Flink Cluster in session mode.
    - -### Flink Job Cluster - -* **Cluster Lifecycle**: in a Flink Job Cluster, the available cluster manager - (like YARN or Kubernetes) is used to spin up a cluster for each submitted job - and this cluster is available to that job only. Here, the client first - requests resources from the cluster manager to start the JobManager and - submits the job to the Dispatcher running inside this process. TaskManagers - are then lazily allocated based on the resource requirements of the job. Once - the job is finished, the Flink Job Cluster is torn down. - -* **Resource Isolation**: a fatal error in the JobManager only affects the one job running in that Flink Job Cluster. - -* **Other considerations**: because the ResourceManager has to apply and wait - for external resource management components to start the TaskManager - processes and allocate resources, Flink Job Clusters are more suited to large - jobs that are long-running, have high-stability requirements and are not - sensitive to longer startup times. - -
    Note: Formerly, a Flink Job Cluster was also known as a Flink Cluster in job (or per-job) mode.
    - -### Flink Application Cluster - -* **Cluster Lifecycle**: a Flink Application Cluster is a dedicated Flink -  cluster that only executes jobs from one Flink Application and where the -  ``main()`` method runs on the cluster rather than the client. The job -  submission is a one-step process: you don’t need to start a Flink cluster -  first and then submit a job to the existing cluster session; instead, you -  package your application logic and dependencies into an executable job JAR and -  the cluster entrypoint (``ApplicationClusterEntryPoint``) -  is responsible for calling the ``main()`` method to extract the JobGraph. -  This allows you to deploy a Flink Application like any other application on -  Kubernetes, for example. The lifetime of a Flink Application Cluster is -  therefore bound to the lifetime of the Flink Application. - -* **Resource Isolation**: in a Flink Application Cluster, the ResourceManager -  and Dispatcher are scoped to a single Flink Application, which provides a -  better separation of concerns than the Flink Session Cluster. -
    Note: A Flink Job Cluster can be seen as a “run-on-client” alternative to Flink Application Clusters.
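To make the three deployment modes above concrete, the following is a minimal sketch of a Flink Application in Java; the class name, sample data, and job name are invented for this illustration. The same program can run in a local JVM, be submitted to a Flink Session Cluster, or be packaged as the entry point of a Flink Application Cluster, since the deployment mode is chosen when the job is packaged and submitted, not in the code.

```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * A minimal Flink Application: its main() method assembles a dataflow and
 * submits it as a job via execute().
 */
public class WordLengthJob {

    public static void main(String[] args) throws Exception {
        // Resolves to whatever environment the program runs in: a local JVM,
        // a session cluster the client talks to, or the cluster entrypoint
        // of an application cluster.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.fromElements("flink", "session", "application")
           .map(s -> s.length())
           .print();

        // Everything above only builds the JobGraph; execute() submits it.
        env.execute("word-length");
    }
}
```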
    - -{% top %} diff --git a/docs/concepts/flink-architecture.zh.md b/docs/concepts/flink-architecture.zh.md deleted file mode 100644 index c1bd898d498d9..0000000000000 --- a/docs/concepts/flink-architecture.zh.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -title: Flink 架构 -nav-id: flink-architecture -nav-pos: 4 -nav-title: Flink 架构 -nav-parent_id: concepts ---- - - -Flink 是一个分布式系统,需要有效分配和管理计算资源才能执行流应用程序。它集成了所有常见的集群资源管理器,例如[Hadoop YARN](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html)、[Apache Mesos](https://mesos.apache.org/)和[Kubernetes](https://kubernetes.io/),但也可以设置作为独立集群甚至库运行。 - -本节概述了 Flink 架构,并且描述了其主要组件如何交互以执行应用程序和从故障中恢复。 - -* This will be replaced by the TOC -{:toc} - -## Flink 集群剖析 - -Flink 运行时由两种类型的进程组成:一个 _JobManager_ 和一个或者多个 _TaskManager_。 - -The processes involved in executing a Flink dataflow - -*Client* 不是运行时和程序执行的一部分,而是用于准备数据流并将其发送给 JobManager。之后,客户端可以断开连接(_分离模式_),或保持连接来接收进程报告(_附加模式_)。客户端可以作为触发执行 Java/Scala 程序的一部分运行,也可以在命令行进程`./bin/flink run ...`中运行。 - -可以通过多种方式启动 JobManager 和 TaskManager:直接在机器上作为[standalone 集群]({% link deployment/resource-providers/standalone/index.zh.md %})启动、在容器中启动、或者通过[YARN]({% link deployment/resource-providers/yarn.zh.md %})或[Mesos]({% link deployment/resource-providers/mesos.zh.md %})等资源框架管理并启动。TaskManager 连接到 JobManagers,宣布自己可用,并被分配工作。 - -### JobManager - -_JobManager_ 具有许多与协调 Flink 应用程序的分布式执行有关的职责:它决定何时调度下一个 task(或一组 task)、对完成的 task 或执行失败做出反应、协调 checkpoint、并且协调从失败中恢复等等。这个进程由三个不同的组件组成: - - * **ResourceManager** - - _ResourceManager_ 负责 Flink 集群中的资源提供、回收、分配 - 它管理 **task slots**,这是 Flink 集群中资源调度的单位(请参考[TaskManagers](#taskmanagers))。Flink 为不同的环境和资源提供者(例如 YARN、Mesos、Kubernetes 和 standalone 部署)实现了对应的 ResourceManager。在 standalone 设置中,ResourceManager 只能分配可用 TaskManager 的 slots,而不能自行启动新的 TaskManager。 - - * **Dispatcher** - - _Dispatcher_ 提供了一个 REST 接口,用来提交 Flink 应用程序执行,并为每个提交的作业启动一个新的 JobMaster。它还运行 Flink WebUI 用来提供作业执行信息。 - - * **JobMaster** - - _JobMaster_ 负责管理单个[JobGraph]({% link concepts/glossary.zh.md %}#logical-graph)的执行。Flink 集群中可以同时运行多个作业,每个作业都有自己的 JobMaster。 - -始终至少有一个 JobManager。高可用(HA)设置中可能有多个 JobManager,其中一个始终是 *leader*,其他的则是 *standby*(请参考 [高可用(HA)]({% link deployment/ha/index.zh.md %}))。 - -### TaskManagers - -*TaskManager*(也称为 *worker*)执行作业流的 task,并且缓存和交换数据流。 - -必须始终至少有一个 TaskManager。在 TaskManager 中资源调度的最小单位是 task _slot_。TaskManager 中 task slot 的数量表示并发处理 task 的数量。请注意一个 task slot 中可以执行多个算子(请参考[Tasks 和算子链](#tasks-and-operator-chains))。 - -{% top %} - -## Tasks 和算子链 - -对于分布式执行,Flink 将算子的 subtasks *链接*成 *tasks*。每个 task 由一个线程执行。将算子链接成 task 是个有用的优化:它减少线程间切换、缓冲的开销,并且减少延迟的同时增加整体吞吐量。链行为是可以配置的;请参考[链文档]({% link dev/stream/operators/index.zh.md %}#task-chaining-and-resource-groups)以获取详细信息。 - -下图中样例数据流用 5 个 subtask 执行,因此有 5 个并行线程。 - -Operator chaining into Tasks - -{% top %} - -## Task Slots 和资源 - -每个 worker(TaskManager)都是一个 *JVM 进程*,可以在单独的线程中执行一个或多个 subtask。为了控制一个 TaskManager 中接受多少个 task,就有了所谓的 **task slots**(至少一个)。 - -每个 *task slot* 代表 TaskManager 中资源的固定子集。例如,具有 3 个 slot 的 TaskManager,会将其托管内存 1/3 用于每个 slot。分配资源意味着 subtask 不会与其他作业的 subtask 竞争托管内存,而是具有一定数量的保留托管内存。注意此处没有 CPU 隔离;当前 slot 仅分离 task 的托管内存。 - -通过调整 task slot 的数量,用户可以定义 subtask 如何互相隔离。每个 TaskManager 有一个 slot,这意味着每个 task 组都在单独的 JVM 中运行(例如,可以在单独的容器中启动)。具有多个 slot 意味着更多 subtask 共享同一 JVM。同一 JVM 中的 task 共享 TCP 连接(通过多路复用)和心跳信息。它们还可以共享数据集和数据结构,从而减少了每个 task 的开销。 - -A TaskManager with Task Slots and Tasks - -默认情况下,Flink 允许 subtask 共享 slot,即便它们是不同的 task 的 subtask,只要是来自于同一作业即可。结果就是一个 slot 可以持有整个作业管道。允许 *slot 共享*有两个主要优点: - - - Flink 集群所需的 task slot 和作业中使用的最大并行度恰好一样。无需计算程序总共包含多少个 
task(具有不同并行度)。 - - - 容易获得更好的资源利用。如果没有 slot 共享,非密集 subtask(*source/map()*)将阻塞和密集型 subtask(*window*) 一样多的资源。通过 slot 共享,我们示例中的基本并行度从 2 增加到 6,可以充分利用分配的资源,同时确保繁重的 subtask 在 TaskManager 之间公平分配。 - -TaskManagers with shared Task Slots - -## Flink 应用程序执行 - -_Flink 应用程序_ 是从其 ``main()`` 方法产生的一个或多个 Flink 作业的任何用户程序。这些作业的执行可以在本地 JVM(`LocalEnvironment``)中进行,或具有多台机器的集群的远程设置(``RemoteEnvironment``)中进行。对于每个程序,[``ExecutionEnvironment``]({{ site.javadocs_baseurl }}/api/java/) 提供了一些方法来控制作业执行(例如设置并行度)并与外界交互(请参考 [Flink 程序剖析]({% link dev/datastream_api.zh.md %}#anatomy-of-a-flink-program) )。 - -Flink 应用程序的作业可以被提交到长期运行的 [Flink Session 集群]({% link concepts/glossary.zh.md %}#flink-session-cluster)、专用的 [Flink Job 集群]({% link concepts/glossary.zh.md %}#flink-job-cluster) 或 [Flink Application 集群]({% link concepts/glossary.zh.md %}#flink-application-cluster)。这些选项之间的差异主要与集群的生命周期和资源隔离保证有关。 - -### Flink Session 集群 - -* **集群生命周期**:在 Flink Session 集群中,客户端连接到一个预先存在的、长期运行的集群,该集群可以接受多个作业提交。即使所有作业完成后,集群(和 JobManager)仍将继续运行直到手动停止 session 为止。因此,Flink Session 集群的寿命不受任何 Flink 作业寿命的约束。 - -* **资源隔离**:TaskManager slot 由 ResourceManager 在提交作业时分配,并在作业完成时释放。由于所有作业都共享同一集群,因此在集群资源方面存在一些竞争 — 例如提交工作阶段的网络带宽。此共享设置的局限性在于,如果 TaskManager 崩溃,则在此 TaskManager 上运行 task 的所有作业都将失败;类似的,如果 JobManager 上发生一些致命错误,它将影响集群中正在运行的所有作业。 - -* **其他注意事项**:拥有一个预先存在的集群可以节省大量时间申请资源和启动 TaskManager。有种场景很重要,作业执行时间短并且启动时间长会对端到端的用户体验产生负面的影响 — 就像对简短查询的交互式分析一样,希望作业可以使用现有资源快速执行计算。 - -
    注意: 以前,Flink Session 集群也被称为 session 模式下的 Flink 集群。
    - -### Flink Job 集群 - -* **集群生命周期**:在 Flink Job 集群中,可用的集群管理器(例如 YARN 或 Kubernetes)用于为每个提交的作业启动一个集群,并且该集群仅可用于该作业。在这里,客户端首先从集群管理器请求资源启动 JobManager,然后将作业提交给在这个进程中运行的 Dispatcher。然后根据作业的资源请求惰性的分配 TaskManager。一旦作业完成,Flink Job 集群将被拆除。 - -* **资源隔离**:JobManager 中的致命错误仅影响在 Flink Job 集群中运行的一个作业。 - -* **其他注意事项**:由于 ResourceManager 必须应用并等待外部资源管理组件来启动 TaskManager 进程和分配资源,因此 Flink Job 集群更适合长期运行、具有高稳定性要求且对较长的启动时间不敏感的大型作业。 - -
    注意: 以前,Flink Job 集群也被称为 job (or per-job) 模式下的 Flink 集群。
    - -### Flink Application 集群 - -* **集群生命周期**:Flink Application 集群是专用的 Flink 集群,仅从 Flink 应用程序执行作业,并且 ``main()``方法在集群上而不是客户端上运行。提交作业是一个单步骤过程:无需先启动 Flink 集群,然后将作业提交到现有的 session 集群;相反,将应用程序逻辑和依赖打包成一个可执行的作业 JAR 中,并且集群入口(``ApplicationClusterEntryPoint``)负责调用 ``main()``方法来提取 JobGraph。例如,这允许你像在 Kubernetes 上部署任何其他应用程序一样部署 Flink 应用程序。因此,Flink Application 集群的寿命与 Flink 应用程序的寿命有关。 - -* **资源隔离**:在 Flink Application 集群中,ResourceManager 和 Dispatcher 作用于单个的 Flink 应用程序,相比于 Flink Session 集群,它提供了更好的隔离。 - -
    注意: Flink Job 集群可以看做是 Flink Application 集群”客户端运行“的替代方案。
    - -{% top %} diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md deleted file mode 100644 index c45cf1fc052de..0000000000000 --- a/docs/concepts/glossary.md +++ /dev/null @@ -1,190 +0,0 @@ ---- -title: Glossary -nav-pos: 10 -nav-title: Glossary -nav-parent_id: concepts ---- - - -#### Flink Application Cluster - -A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that -only executes [Flink Jobs](#flink-job) from one [Flink -Application](#flink-application). The lifetime of the [Flink -Cluster](#flink-cluster) is bound to the lifetime of the Flink Application. - -#### Flink Job Cluster - -A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only -executes a single [Flink Job](#flink-job). The lifetime of the -[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job. - -#### Flink Cluster - -A distributed system consisting of (typically) one [JobManager](#flink-jobmanager) and one or more -[Flink TaskManager](#flink-taskmanager) processes. - -#### Event - -An event is a statement about a change of the state of the domain modelled by the -application. Events can be input and/or output of a stream or batch processing application. -Events are special types of [records](#Record). - -#### ExecutionGraph - -see [Physical Graph](#physical-graph) - -#### Function - -Functions are implemented by the user and encapsulate the -application logic of a Flink program. Most Functions are wrapped by a corresponding -[Operator](#operator). - -#### Instance - -The term *instance* is used to describe a specific instance of a specific type (usually -[Operator](#operator) or [Function](#function)) during runtime. As Apache Flink is mostly written in -Java, this corresponds to the definition of *Instance* or *Object* in Java. In the context of Apache -Flink, the term *parallel instance* is also frequently used to emphasize that multiple instances of -the same [Operator](#operator) or [Function](#function) type are running in parallel. - -#### Flink Application - -A Flink application is a Java Application that submits one or multiple [Flink -Jobs](#flink-job) from the `main()` method (or by some other means). Submitting -jobs is usually done by calling `execute()` on an execution environment. - -The jobs of an application can either be submitted to a long running [Flink -Session Cluster](#flink-session-cluster), to a dedicated [Flink Application -Cluster](#flink-application-cluster), or to a [Flink Job -Cluster](#flink-job-cluster). - -#### Flink Job - -A Flink Job is the runtime representation of a [logical graph](#logical-graph) -(also often called dataflow graph) that is created and submitted by calling -`execute()` in a [Flink Application](#flink-application). - -#### JobGraph - -see [Logical Graph](#logical-graph) - -#### Flink JobManager - -The JobManager is the orchestrator of a [Flink Cluster](#flink-cluster). It contains three distinct -components: Flink Resource Manager, Flink Dispatcher and one [Flink JobMaster](#flink-jobmaster) -per running [Flink Job](#flink-job). - -#### Flink JobMaster - -JobMasters are one of the components running in the [JobManager](#flink-jobmanager). A JobMaster is -responsible for supervising the execution of the [Tasks](#task) of a single job. - -#### Logical Graph - -A logical graph is a directed graph where the nodes are [Operators](#operator) -and the edges define input/output-relationships of the operators and correspond -to data streams or data sets. 
A logical graph is created by submitting jobs -from a [Flink Application](#flink-application). - -Logical graphs are also often referred to as *dataflow graphs*. - -#### Managed State - -Managed State describes application state which has been registered with the framework. For -Managed State, Apache Flink will take care about persistence and rescaling among other things. - -#### Operator - -Node of a [Logical Graph](#logical-graph). An Operator performs a certain operation, which is -usually executed by a [Function](#function). Sources and Sinks are special Operators for data -ingestion and data egress. - -#### Operator Chain - -An Operator Chain consists of two or more consecutive [Operators](#operator) without any -repartitioning in between. Operators within the same Operator Chain forward records to each other -directly without going through serialization or Flink's network stack. - -#### Partition - -A partition is an independent subset of the overall data stream or data set. A data stream or -data set is divided into partitions by assigning each [record](#Record) to one or more partitions. -Partitions of data streams or data sets are consumed by [Tasks](#task) during runtime. A -transformation which changes the way a data stream or data set is partitioned is often called -repartitioning. - -#### Physical Graph - -A physical graph is the result of translating a [Logical Graph](#logical-graph) for execution in a -distributed runtime. The nodes are [Tasks](#task) and the edges indicate input/output-relationships -or [partitions](#partition) of data streams or data sets. - -#### Record - -Records are the constituent elements of a data set or data stream. [Operators](#operator) and -[Functions](#Function) receive records as input and emit records as output. - -#### (Runtime) Execution Mode - -DataStream API programs can be executed in one of two execution modes: `BATCH` -or `STREAMING`. See [Execution Mode]({% link dev/datastream_execution_mode.md -%}) for more details. - -#### Flink Session Cluster - -A long-running [Flink Cluster](#flink-cluster) which accepts multiple [Flink Jobs](#flink-job) for -execution. The lifetime of this Flink Cluster is not bound to the lifetime of any Flink Job. -Formerly, a Flink Session Cluster was also known as a Flink Cluster in *session mode*. Compare to -[Flink Application Cluster](#flink-application-cluster). - -#### State Backend - -For stream processing programs, the State Backend of a [Flink Job](#flink-job) determines how its -[state](#managed-state) is stored on each TaskManager (Java Heap of TaskManager or (embedded) -RocksDB) as well as where it is written upon a checkpoint (Java Heap of -[JobManager](#flink-jobmanager) or Filesystem). - -#### Sub-Task - -A Sub-Task is a [Task](#task) responsible for processing a [partition](#partition) of -the data stream. The term "Sub-Task" emphasizes that there are multiple parallel Tasks for the same -[Operator](#operator) or [Operator Chain](#operator-chain). - -#### Task - -Node of a [Physical Graph](#physical-graph). A task is the basic unit of work, which is executed by -Flink's runtime. Tasks encapsulate exactly one parallel instance of an -[Operator](#operator) or [Operator Chain](#operator-chain). - -#### Flink TaskManager - -TaskManagers are the worker processes of a [Flink Cluster](#flink-cluster). [Tasks](#task) are -scheduled to TaskManagers for execution. They communicate with each other to exchange data between -subsequent Tasks. 
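As a hedged illustration of the *Managed State* and *Sub-Task* entries above, the sketch below registers a `ValueState` with the framework inside a rich function; the class name and state name are invented for the example. Because the state is registered through a descriptor, Flink takes care of checkpointing it and of redistributing it across parallel instances when the job is rescaled.

```java
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

/** Emits a running count per key using managed keyed state. */
public class CountPerKey extends RichFlatMapFunction<String, Long> {

    private transient ValueState<Long> count;

    @Override
    public void open(Configuration parameters) {
        // Registering the state with the framework makes it "managed":
        // Flink persists it in checkpoints and rescales it with the key groups.
        count = getRuntimeContext().getState(
                new ValueStateDescriptor<>("count", Long.class));
    }

    @Override
    public void flatMap(String value, Collector<Long> out) throws Exception {
        Long current = count.value();
        long updated = (current == null ? 0L : current) + 1;
        count.update(updated);
        out.collect(updated);
    }
}
```

Such a function would typically be applied after a key partitioning, e.g. `stream.keyBy(v -> v).flatMap(new CountPerKey())`, so that each parallel sub-task only sees the state of its own keys.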
- -#### Transformation - -A Transformation is applied on one or more data streams or data sets and results in one or more -output data streams or data sets. A transformation might change a data stream or data set on a -per-record basis, but might also only change its partitioning or perform an aggregation. While -[Operators](#operator) and [Functions](#function) are the "physical" parts of Flink's API, -Transformations are only an API concept. Specifically, most transformations are -implemented by certain [Operators](#operator). diff --git a/docs/concepts/glossary.zh.md b/docs/concepts/glossary.zh.md deleted file mode 100644 index dc78acaeb0ec8..0000000000000 --- a/docs/concepts/glossary.zh.md +++ /dev/null @@ -1,144 +0,0 @@ ---- -title: 词汇表 -nav-pos: 10 -nav-title: 词汇表 -nav-parent_id: concepts ---- - - -#### Flink Application Cluster - -A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that -only executes [Flink Jobs](#flink-job) from one [Flink -Application](#flink-application). The lifetime of the [Flink -Cluster](#flink-cluster) is bound to the lifetime of the Flink Application. - -#### Flink Job Cluster - -A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only -executes a single [Flink Job](#flink-job). The lifetime of the -[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job. - -#### Flink Cluster - -一般情况下,Flink 集群是由一个 [Flink JobManager](#flink-jobmanager) 和一个或多个 [Flink TaskManager](#flink-taskmanager) 进程组成的分布式系统。 - -#### Event - -Event 是对应用程序建模的域的状态更改的声明。它可以同时为流或批处理应用程序的 input 和 output,也可以单独是 input 或者 output 中的一种。Event 是特殊类型的 [Record](#record)。 - -#### ExecutionGraph - -见 [Physical Graph](#physical-graph)。 - -#### Function - -Function 是由用户实现的,并封装了 Flink 程序的应用程序逻辑。大多数 Function 都由相应的 [Operator](#operator) 封装。 - -#### Instance - -Instance 常用于描述运行时的特定类型(通常是 [Operator](#operator) 或者 [Function](#function))的一个具体实例。由于 Apache Flink 主要是用 Java 编写的,所以,这与 Java 中的 *Instance* 或 *Object* 的定义相对应。在 Apache Flink 的上下文中,*parallel instance* 也常用于强调同一 [Operator](#operator) 或者 [Function](#function) 的多个 instance 以并行的方式运行。 - -#### Flink Application - -A Flink application is a Java Application that submits one or multiple [Flink -Jobs](#flink-job) from the `main()` method (or by some other means). Submitting -jobs is usually done by calling `execute()` on an execution environment. - -The jobs of an application can either be submitted to a long running [Flink -Session Cluster](#flink-session-cluster), to a dedicated [Flink Application -Cluster](#flink-application-cluster), or to a [Flink Job -Cluster](#flink-job-cluster). - -#### Flink Job - -A Flink Job is the runtime representation of a [logical graph](#logical-graph) -(also often called dataflow graph) that is created and submitted by calling -`execute()` in a [Flink Application](#flink-application). - -#### JobGraph - -见 [Logical Graph](#logical-graph)。 - -#### Flink JobManager - -Flink JobManager 是 [Flink Cluster](#flink-cluster) 的主节点。它包含三个不同的组件:Flink Resource Manager、Flink Dispatcher、运行每个 [Flink Job](#flink-job) 的 [Flink JobMaster](#flink-jobmaster)。 - - -#### Flink JobMaster - -JobMaster 是在 [Flink JobManager](#flink-jobmanager) 运行中的组件之一。JobManager 负责监督单个作业 [Task](#task) 的执行。以前,整个 [Flink JobManager](#flink-jobmanager) 都叫做 JobManager。 - -#### Logical Graph - -A logical graph is a directed graph where the nodes are [Operators](#operator) -and the edges define input/output-relationships of the operators and correspond -to data streams or data sets. 
A logical graph is created by submitting jobs -from a [Flink Application](#flink-application). - -Logical graphs are also often referred to as *dataflow graphs*. - -#### Managed State - -Managed State 描述了已在框架中注册的应用程序的托管状态。对于托管状态,Apache Flink 会负责持久化和重伸缩等事宜。 - -#### Operator - -[Logical Graph](#logical-graph) 的节点。算子执行某种操作,该操作通常由 [Function](#function) 执行。Source 和 Sink 是数据输入和数据输出的特殊算子。 - -#### Operator Chain - -算子链由两个或多个连续的 [Operator](#operator) 组成,两者之间没有任何的重新分区。同一算子链内的算子可以彼此直接传递 record,而无需通过序列化或 Flink 的网络栈。 - -#### Partition - -分区是整个数据流或数据集的独立子集。通过将每个 [Record](#record) 分配给一个或多个分区,来把数据流或数据集划分为多个分区。在运行期间,[Task](#task) 会消费数据流或数据集的分区。改变数据流或数据集分区方式的转换通常称为重分区。 - -#### Physical Graph - -Physical graph 是一个在分布式运行时,把 [Logical Graph](#logical-graph) 转换为可执行的结果。节点是 [Task](#task),边表示数据流或数据集的输入/输出关系或 [partition](#partition)。 - -#### Record - -Record 是数据集或数据流的组成元素。[Operator](#operator) 和 [Function](#Function)接收 record 作为输入,并将 record 作为输出发出。 - -#### Flink Session Cluster - -长时间运行的 [Flink Cluster](#flink-cluster),它可以接受多个 [Flink Job](#flink-job) 的执行。此 [Flink Cluster](#flink-cluster) 的生命周期不受任何 [Flink Job](#flink-job) 生命周期的约束限制。以前,Flink Session Cluster 也称为 *session mode* 的 [Flink Cluster](#flink-cluster),和 [Flink Application Cluster](#flink-application-cluster) 相对应。 - -#### State Backend - -对于流处理程序,[Flink Job](#flink-job) 的 State Backend 决定了其 [state](#managed-state) 是如何存储在每个 TaskManager 上的( TaskManager 的 Java 堆栈或嵌入式 RocksDB),以及它在 checkpoint 时的写入位置( [Flink JobManager](#flink-jobmanager) 的 Java 堆或者 Filesystem)。 - -#### Sub-Task - -Sub-Task 是负责处理数据流 [Partition](#partition) 的 [Task](#task)。"Sub-Task"强调的是同一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 具有多个并行的 Task 。 - -#### Task - -Task 是 [Physical Graph](#physical-graph) 的节点。它是基本的工作单元,由 Flink 的 runtime 来执行。Task 正好封装了一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 的 *parallel instance*。 - -#### Flink TaskManager - -TaskManager 是 [Flink Cluster](#flink-cluster) 的工作进程。[Task](#task) 被调度到 TaskManager 上执行。TaskManager 相互通信,只为在后续的 Task 之间交换数据。 - -#### Transformation - -Transformation 应用于一个或多个数据流或数据集,并产生一个或多个输出数据流或数据集。Transformation 可能会在每个记录的基础上更改数据流或数据集,但也可以只更改其分区或执行聚合。虽然 [Operator](#operator) 和 [Function](#function) 是 Flink API 的“物理”部分,但 Transformation 只是一个 API 概念。具体来说,大多数(但不是全部)Transformation 是由某些 [Operator](#operator) 实现的。 diff --git a/docs/concepts/index.md b/docs/concepts/index.md deleted file mode 100644 index c17113d8fb0c1..0000000000000 --- a/docs/concepts/index.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -title: Concepts -nav-id: concepts -nav-pos: 3 -nav-title: ' Concepts' -nav-parent_id: root -nav-show_overview: true -permalink: /concepts/index.html ---- - - -The [Hands-on Training]({% link learn-flink/index.md %}) explains the basic concepts -of stateful and timely stream processing that underlie Flink's APIs, and provides examples of how -these mechanisms are used in applications. Stateful stream processing is introduced in the context -of [Data Pipelines & ETL]({% link learn-flink/etl.md %}#stateful-transformations) -and is further developed in the section on [Fault Tolerance]({% link learn-flink/fault_tolerance.md %}). Timely stream processing is introduced in the section on -[Streaming Analytics]({% link learn-flink/streaming_analytics.md %}). - -This _Concepts in Depth_ section provides a deeper understanding of how Flink's architecture and runtime -implement these concepts. - -## Flink's APIs - -Flink offers different levels of abstraction for developing streaming/batch applications. 
- -Programming levels of abstraction - - - The lowest level abstraction simply offers **stateful and timely stream processing**. It is - embedded into the [DataStream API]({% link dev/datastream_api.md %}) via the [Process - Function]({% link dev/stream/operators/process_function.md %}). It allows - users to freely process events from one or more streams, and provides consistent, fault tolerant - *state*. In addition, users can register event time and processing time callbacks, allowing - programs to realize sophisticated computations. - - - In practice, many applications do not need the low-level - abstractions described above, and can instead program against the **Core APIs**: the - [DataStream API]({% link dev/datastream_api.md %}) - (bounded/unbounded streams) and the [DataSet API]({% link - dev/batch/index.md %}) (bounded data sets). These fluent APIs offer the - common building blocks for data processing, like various forms of - user-specified transformations, joins, aggregations, windows, state, etc. - Data types processed in these APIs are represented as classes in the - respective programming languages. - - The low level *Process Function* integrates with the *DataStream API*, - making it possible to use the lower-level abstraction on an as-needed basis. - The *DataSet API* offers additional primitives on bounded data sets, - like loops/iterations. - - - The **Table API** is a declarative DSL centered around *tables*, which may - be dynamically changing tables (when representing streams). The [Table - API]({% link dev/table/index.md %}) follows the - (extended) relational model: Tables have a schema attached (similar to - tables in relational databases) and the API offers comparable operations, - such as select, project, join, group-by, aggregate, etc. Table API - programs declaratively define *what logical operation should be done* - rather than specifying exactly *how the code for the operation looks*. - Though the Table API is extensible by various types of user-defined - functions, it is less expressive than the *Core APIs*, and more concise to - use (less code to write). In addition, Table API programs also go through - an optimizer that applies optimization rules before execution. - - One can seamlessly convert between tables and *DataStream*/*DataSet*, - allowing programs to mix the *Table API* with the *DataStream* and - *DataSet* APIs. - - - The highest level abstraction offered by Flink is **SQL**. This abstraction - is similar to the *Table API* both in semantics and expressiveness, but - represents programs as SQL query expressions. The [SQL]( - {% link dev/table/index.md %}#sql) abstraction closely interacts with the - Table API, and SQL queries can be executed over tables defined in the - *Table API*. 
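As a rough illustration of the two highest abstraction levels, the sketch below expresses the same aggregation once with the Table API and once as a SQL query. The table name, columns, and values are invented for the example, and the exact `TableEnvironment` setup may differ between Flink versions.

```java
import static org.apache.flink.table.api.Expressions.$;
import static org.apache.flink.table.api.Expressions.row;

import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;

public class TableVsSql {

    public static void main(String[] args) throws Exception {
        TableEnvironment tEnv =
                TableEnvironment.create(EnvironmentSettings.newInstance().build());

        // A small inline table; a real program would declare a connector instead.
        Table orders = tEnv.fromValues(
                DataTypes.ROW(
                        DataTypes.FIELD("name", DataTypes.STRING()),
                        DataTypes.FIELD("amount", DataTypes.INT())),
                row("alice", 10), row("bob", 20), row("alice", 5));
        tEnv.createTemporaryView("Orders", orders);

        // Table API: the logical operation is declared through method calls.
        Table viaTableApi = orders
                .groupBy($("name"))
                .select($("name"), $("amount").sum().as("total"));

        // SQL: the same logic expressed as a query over the registered view.
        Table viaSql = tEnv.sqlQuery(
                "SELECT name, SUM(amount) AS total FROM Orders GROUP BY name");

        viaTableApi.execute().print();
        viaSql.execute().print();
    }
}
```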
diff --git a/docs/concepts/index.zh.md b/docs/concepts/index.zh.md deleted file mode 100644 index a6efcf294e49d..0000000000000 --- a/docs/concepts/index.zh.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: 概念透析 -nav-id: concepts -nav-pos: 3 -nav-title: ' 概念透析' -nav-parent_id: root -nav-show_overview: true -permalink: /concepts/index.html ---- - - -[实践练习]({% link learn-flink/index.zh.md %})章节介绍了作为 Flink API 根基的有状态实时流处理的基本概念,并且举例说明了如何在 Flink 应用中使用这些机制。其中 [Data Pipelines & ETL]({% link learn-flink/etl.zh.md %}#stateful-transformations) 小节介绍了有状态流处理的概念,并且在 [Fault Tolerance]({% link learn-flink/fault_tolerance.zh.md %}) 小节中进行了深入介绍。[Streaming Analytics]({% link learn-flink/streaming_analytics.zh.md %}) 小节介绍了实时流处理的概念。 - -本章将深入分析 Flink 分布式运行时架构如何实现这些概念。 - -## Flink 中的 API - -Flink 为流式/批式处理应用程序的开发提供了不同级别的抽象。 - -Programming levels of abstraction - - - Flink API 最底层的抽象为**有状态实时流处理**。其抽象实现是 [Process Function]({% link dev/stream/operators/process_function.zh.md %}),并且 **Process Function** 被 Flink 框架集成到了 [DataStream API]({% link dev/datastream_api.zh.md %}) 中来为我们使用。它允许用户在应用程序中自由地处理来自单流或多流的事件(数据),并提供具有全局一致性和容错保障的*状态*。此外,用户可以在此层抽象中注册事件时间(event time)和处理时间(processing time)回调方法,从而允许程序可以实现复杂计算。 - - - Flink API 第二层抽象是 **Core APIs**。实际上,许多应用程序不需要使用到上述最底层抽象的 API,而是可以使用 **Core APIs** 进行编程:其中包含 [DataStream API]({% link dev/datastream_api.zh.md %})(应用于有界/无界数据流场景)和 [DataSet API]({% link dev/batch/index.zh.md %})(应用于有界数据集场景)两部分。Core APIs 提供的流式 API(Fluent API)为数据处理提供了通用的模块组件,例如各种形式的用户自定义转换(transformations)、联接(joins)、聚合(aggregations)、窗口(windows)和状态(state)操作等。此层 API 中处理的数据类型在每种编程语言中都有其对应的类。 - - *Process Function* 这类底层抽象和 *DataStream API* 的相互集成使得用户可以选择使用更底层的抽象 API 来实现自己的需求。*DataSet API* 还额外提供了一些原语,比如循环/迭代(loop/iteration)操作。 - - - Flink API 第三层抽象是 **Table API**。**Table API** 是以表(Table)为中心的声明式编程(DSL)API,例如在流式数据场景下,它可以表示一张正在动态改变的表。[Table API]({% link dev/table/index.zh.md %}) 遵循(扩展)关系模型:即表拥有 schema(类似于关系型数据库中的 schema),并且 Table API 也提供了类似于关系模型中的操作,比如 select、project、join、group-by 和 aggregate 等。Table API 程序是以声明的方式定义*应执行的逻辑操作*,而不是确切地指定程序*应该执行的代码*。尽管 Table API 使用起来很简洁并且可以由各种类型的用户自定义函数扩展功能,但还是比 Core API 的表达能力差。此外,Table API 程序在执行之前还会使用优化器中的优化规则对用户编写的表达式进行优化。 - - 表和 *DataStream*/*DataSet* 可以进行无缝切换,Flink 允许用户在编写应用程序时将 *Table API* 与 *DataStream*/*DataSet* API 混合使用。 - - - Flink API 最顶层抽象是 **SQL**。这层抽象在语义和程序表达式上都类似于 *Table API*,但是其程序实现都是 SQL 查询表达式。[SQL]({% link dev/table/index.zh.md %}#sql) 抽象与 Table API 抽象之间的关联是非常紧密的,并且 SQL 查询语句可以在 *Table API* 中定义的表上执行。 diff --git a/docs/concepts/stateful-stream-processing.md b/docs/concepts/stateful-stream-processing.md deleted file mode 100644 index ba5bbb976d32f..0000000000000 --- a/docs/concepts/stateful-stream-processing.md +++ /dev/null @@ -1,370 +0,0 @@ ---- -title: Stateful Stream Processing -nav-id: stateful-stream-processing -nav-pos: 2 -nav-title: Stateful Stream Processing -nav-parent_id: concepts ---- - - -* This will be replaced by the TOC -{:toc} - -## What is State? - -While many operations in a dataflow simply look at one individual *event at a -time* (for example an event parser), some operations remember information -across multiple events (for example window operators). These operations are -called **stateful**. - -Some examples of stateful operations: - - - When an application searches for certain event patterns, the state will - store the sequence of events encountered so far. - - When aggregating events per minute/hour/day, the state holds the pending - aggregates. 
- - When training a machine learning model over a stream of data points, the - state holds the current version of the model parameters. - - When historic data needs to be managed, the state allows efficient access - to events that occurred in the past. - -Flink needs to be aware of the state in order to make it fault tolerant using -[checkpoints]({% link dev/stream/state/checkpointing.md %}) -and [savepoints]({%link ops/state/savepoints.md %}). - -Knowledge about the state also allows for rescaling Flink applications, meaning -that Flink takes care of redistributing state across parallel instances. - -[Queryable state]({% link dev/stream/state/queryable_state.md -%}) allows you to access state from outside of Flink during runtime. - -When working with state, it might also be useful to read about [Flink's state -backends]({% link ops/state/state_backends.md %}). Flink -provides different state backends that specify how and where state is stored. - -{% top %} - -## Keyed State - -Keyed state is maintained in what can be thought of as an embedded key/value -store. The state is partitioned and distributed strictly together with the -streams that are read by the stateful operators. Hence, access to the key/value -state is only possible on *keyed streams*, i.e. after a keyed/partitioned data -exchange, and is restricted to the values associated with the current event's -key. Aligning the keys of streams and state makes sure that all state updates -are local operations, guaranteeing consistency without transaction overhead. -This alignment also allows Flink to redistribute the state and adjust the -stream partitioning transparently. - -State and Partitioning - -Keyed State is further organized into so-called *Key Groups*. Key Groups are -the atomic unit by which Flink can redistribute Keyed State; there are exactly -as many Key Groups as the defined maximum parallelism. During execution each -parallel instance of a keyed operator works with the keys for one or more Key -Groups. - -## State Persistence - -Flink implements fault tolerance using a combination of **stream replay** and -**checkpointing**. A checkpoint marks a specific point in each of the -input streams along with the corresponding state for each of the operators. A -streaming dataflow can be resumed from a checkpoint while maintaining -consistency *(exactly-once processing semantics)* by restoring the state of the -operators and replaying the records from the point of the checkpoint. - -The checkpoint interval is a means of trading off the overhead of fault -tolerance during execution with the recovery time (the number of records that -need to be replayed). - -The fault tolerance mechanism continuously draws snapshots of the distributed -streaming data flow. For streaming applications with small state, these -snapshots are very light-weight and can be drawn frequently without much impact -on performance. The state of the streaming applications is stored at a -configurable place, usually in a distributed file system. - -In case of a program failure (due to machine-, network-, or software failure), -Flink stops the distributed streaming dataflow. The system then restarts the -operators and resets them to the latest successful checkpoint. The input -streams are reset to the point of the state snapshot. Any records that are -processed as part of the restarted parallel dataflow are guaranteed to not have -affected the previously checkpointed state. - -{% info Note %} By default, checkpointing is disabled. 
See [Checkpointing]({% -link dev/stream/state/checkpointing.md %}) for details on how to enable and -configure checkpointing. - -{% info Note %} For this mechanism to realize its full guarantees, the data -stream source (such as message queue or broker) needs to be able to rewind the -stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has -this ability and Flink's connector to Kafka exploits this. See [Fault -Tolerance Guarantees of Data Sources and Sinks]({% link -dev/connectors/guarantees.md %}) for more information about the guarantees -provided by Flink's connectors. - -{% info Note %} Because Flink's checkpoints are realized through distributed -snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often -we also use the term *snapshot* to mean either *checkpoint* or *savepoint*. - -### Checkpointing - -The central part of Flink's fault tolerance mechanism is drawing consistent -snapshots of the distributed data stream and operator state. These snapshots -act as consistent checkpoints to which the system can fall back in case of a -failure. Flink's mechanism for drawing these snapshots is described in -"[Lightweight Asynchronous Snapshots for Distributed -Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard -[Chandy-Lamport -algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf) -for distributed snapshots and is specifically tailored to Flink's execution -model. - -Keep in mind that everything to do with checkpointing can be done -asynchronously. The checkpoint barriers don't travel in lock step and -operations can asynchronously snapshot their state. - -Since Flink 1.11, checkpoints can be taken with or without alignment. In this -section, we describe aligned checkpoints first. - -#### Barriers - -A core element in Flink's distributed snapshotting are the *stream barriers*. -These barriers are injected into the data stream and flow with the records as -part of the data stream. Barriers never overtake records, they flow strictly in -line. A barrier separates the records in the data stream into the set of -records that goes into the current snapshot, and the records that go into the -next snapshot. Each barrier carries the ID of the snapshot whose records it -pushed in front of it. Barriers do not interrupt the flow of the stream and are -hence very lightweight. Multiple barriers from different snapshots can be in -the stream at the same time, which means that various snapshots may happen -concurrently. - -
-*Figure: Checkpoint barriers in data streams*
    - -Stream barriers are injected into the parallel data flow at the stream sources. -The point where the barriers for snapshot *n* are injected (let's call it -Sn) is the position in the source stream up to which the -snapshot covers the data. For example, in Apache Kafka, this position would be -the last record's offset in the partition. This position Sn -is reported to the *checkpoint coordinator* (Flink's JobManager). - -The barriers then flow downstream. When an intermediate operator has received a -barrier for snapshot *n* from all of its input streams, it emits a barrier for -snapshot *n* into all of its outgoing streams. Once a sink operator (the end of -a streaming DAG) has received the barrier *n* from all of its input streams, it -acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks -have acknowledged a snapshot, it is considered completed. - -Once snapshot *n* has been completed, the job will never again ask the source -for records from before Sn, since at that point these records -(and their descendant records) will have passed through the entire data flow -topology. - -
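Apache Kafka is the canonical example of such a rewindable source: with checkpointing enabled, the consumer's partition offsets (the positions Sn above) become part of every snapshot. A hedged sketch is shown below; the topic, bootstrap servers and group id are placeholder values, and the `flink-connector-kafka` dependency is assumed to be on the classpath.

```java
import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

public class KafkaSourceSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(10_000); // the Kafka offsets are stored with every checkpoint

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092"); // placeholder
        props.setProperty("group.id", "example-group");           // placeholder

        DataStream<String> lines = env.addSource(
                new FlinkKafkaConsumer<>("example-topic", new SimpleStringSchema(), props));

        lines.print();
        env.execute("Kafka source with checkpointed offsets");
    }
}
```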
-*Figure: Aligning data streams at operators with multiple inputs*
    - -Operators that receive more than one input stream need to *align* the input -streams on the snapshot barriers. The figure above illustrates this: - - - As soon as the operator receives snapshot barrier *n* from an incoming - stream, it cannot process any further records from that stream until it has - received the barrier *n* from the other inputs as well. Otherwise, it would - mix records that belong to snapshot *n* and with records that belong to - snapshot *n+1*. - - Once the last stream has received barrier *n*, the operator emits all - pending outgoing records, and then emits snapshot *n* barriers itself. - - It snapshots the state and resumes processing records from all input streams, - processing records from the input buffers before processing the records - from the streams. - - Finally, the operator writes the state asynchronously to the state backend. - -Note that the alignment is needed for all operators with multiple inputs and for -operators after a shuffle when they consume output streams of multiple upstream -subtasks. - -#### Snapshotting Operator State - -When operators contain any form of *state*, this state must be part of the -snapshots as well. - -Operators snapshot their state at the point in time when they have received all -snapshot barriers from their input streams, and before emitting the barriers to -their output streams. At that point, all updates to the state from records -before the barriers have been made, and no updates that depend on records -from after the barriers have been applied. Because the state of a snapshot may -be large, it is stored in a configurable *[state backend]({% -link ops/state/state_backends.md %})*. By default, this is the JobManager's -memory, but for production use a distributed reliable storage should be -configured (such as HDFS). After the state has been stored, the operator -acknowledges the checkpoint, emits the snapshot barrier into the output -streams, and proceeds. - -The resulting snapshot now contains: - - - For each parallel stream data source, the offset/position in the stream - when the snapshot was started - - For each operator, a pointer to the state that was stored as part of the - snapshot - -
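As noted above, where these snapshots are written is determined by the configured state backend. The sketch below switches a job from the default (JobManager memory) to a durable file system; the HDFS URI is purely illustrative, and the same choice can alternatively be made in the cluster configuration instead of in code.

```java
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class StateBackendSetup {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Keep working state on the TaskManager heap, but write snapshots to a
        // distributed, reliable file system instead of the JobManager's memory.
        env.setStateBackend(new FsStateBackend("hdfs://namenode:8020/flink/checkpoints"));
    }
}
```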
-*Figure: Illustration of the Checkpointing Mechanism*
    - -#### Recovery - -Recovery under this mechanism is straightforward: Upon a failure, Flink selects -the latest completed checkpoint *k*. The system then re-deploys the entire -distributed dataflow, and gives each operator the state that was snapshotted as -part of checkpoint *k*. The sources are set to start reading the stream from -position Sk. For example in Apache Kafka, that means telling -the consumer to start fetching from offset Sk. - -If state was snapshotted incrementally, the operators start with the state of -the latest full snapshot and then apply a series of incremental snapshot -updates to that state. - -See [Restart Strategies]({% link dev/task_failure_recovery.md -%}#restart-strategies) for more information. - -### Unaligned Checkpointing - -Starting with Flink 1.11, checkpointing can also be performed unaligned. -The basic idea is that checkpoints can overtake all in-flight data as long as -the in-flight data becomes part of the operator state. - -Note that this approach is actually closer to the [Chandy-Lamport algorithm -](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf), but -Flink still inserts the barrier in the sources to avoid overloading the -checkpoint coordinator. - -
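Unaligned checkpoints are switched on through the checkpoint configuration. A minimal sketch, assuming checkpointing is already enabled with exactly-once semantics:

```java
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class UnalignedCheckpointsSetup {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.enableCheckpointing(10_000); // exactly-once is the default mode
        // Allow barriers to overtake in-flight records; the overtaken records are
        // persisted as part of the operator state in the checkpoint.
        env.getCheckpointConfig().enableUnalignedCheckpoints();
    }
}
```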
-*Figure: Unaligned checkpointing*
-
-The figure depicts how an operator handles unaligned checkpoint barriers:
-
-- The operator reacts to the first barrier that is stored in its input buffers.
-- It immediately forwards the barrier to the downstream operator by adding it
-  to the end of the output buffers.
-- The operator marks all overtaken records to be stored asynchronously and
-  creates a snapshot of its own state.
-
-Consequently, the operator only briefly stops the processing of input to mark
-the buffers, forwards the barrier, and creates the snapshot of the other state.
-
-Unaligned checkpointing ensures that barriers arrive at the sink as fast as
-possible. It is especially suited for applications with at least one slow-moving
-data path, where alignment times can reach hours. However, since it adds
-additional I/O pressure, it does not help when the I/O to the state backends is
-the bottleneck. See the more in-depth discussion in
-[ops]({% link ops/state/checkpoints.md %}#unaligned-checkpoints)
-for other limitations.
-
-Note that savepoints will always be aligned.
-
-#### Unaligned Recovery
-
-In unaligned checkpointing, operators first recover the in-flight data before
-processing any data from upstream operators. Aside from that, recovery performs
-the same steps as during [recovery of aligned checkpoints](#recovery).
-
-### State Backends
-
-The exact data structures in which the key/value indexes are stored depend on
-the chosen [state backend]({% link ops/state/state_backends.md %}). One state
-backend stores data in an in-memory hash map, another state backend uses
-[RocksDB](http://rocksdb.org) as the key/value store. In addition to defining
-the data structure that holds the state, the state backends also implement the
-logic to take a point-in-time snapshot of the key/value state and store that
-snapshot as part of a checkpoint. State backends can be configured without
-changing your application logic.
-
-*Figure: checkpoints and snapshots*
-
-{% top %}
-
-### Savepoints
-
-All programs that use checkpointing can resume execution from a **savepoint**.
-Savepoints allow both updating your programs and your Flink cluster without
-losing any state.
-
-[Savepoints]({% link ops/state/savepoints.md %}) are
-**manually triggered checkpoints**, which take a snapshot of the program and
-write it out to a state backend. They rely on the regular checkpointing
-mechanism for this.
-
-Savepoints are similar to checkpoints except that they are
-**triggered by the user** and **don't automatically expire** when newer
-checkpoints are completed.
-
-{% top %}
-
-### Exactly Once vs. At Least Once
-
-The alignment step may add latency to the streaming program. Usually, this
-extra latency is on the order of a few milliseconds, but we have seen cases
-where the latency of some outliers increased noticeably. For applications that
-require consistently super low latencies (few milliseconds) for all records,
-Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint
-snapshots are still drawn as soon as an operator has seen the checkpoint
-barrier from each input.
-
-When the alignment is skipped, an operator keeps processing all inputs, even
-after some checkpoint barriers for checkpoint *n* arrived. That way, the
-operator also processes elements that belong to checkpoint *n+1* before the
-state snapshot for checkpoint *n* was taken.
On a restore, these records will -occur as duplicates, because they are both included in the state snapshot of -checkpoint *n*, and will be replayed as part of the data after checkpoint *n*. - -{% info Note %} Alignment happens only for operators with multiple predecessors -(joins) as well as operators with multiple senders (after a stream -repartitioning/shuffle). Because of that, dataflows with only embarrassingly -parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually -give *exactly once* guarantees even in *at least once* mode. - -{% top %} - -## State and Fault Tolerance in Batch Programs - -Flink executes [batch programs]({% link dev/batch/index.md %}) as a special case of -streaming programs, where the streams are bounded (finite number of elements). -A *DataSet* is treated internally as a stream of data. The concepts above thus -apply to batch programs in the same way as well as they apply to streaming -programs, with minor exceptions: - - - [Fault tolerance for batch programs]({% link dev/task_failure_recovery.md %}) - does not use checkpointing. Recovery happens by fully replaying the - streams. That is possible, because inputs are bounded. This pushes the - cost more towards the recovery, but makes the regular processing cheaper, - because it avoids checkpoints. - - - Stateful operations in the DataSet API use simplified in-memory/out-of-core - data structures, rather than key/value indexes. - - - The DataSet API introduces special synchronized (superstep-based) - iterations, which are only possible on bounded streams. For details, check - out the [iteration docs]({% link dev/batch/iterations.md %}). - -{% top %} diff --git a/docs/concepts/stateful-stream-processing.zh.md b/docs/concepts/stateful-stream-processing.zh.md deleted file mode 100644 index 2df6628365209..0000000000000 --- a/docs/concepts/stateful-stream-processing.zh.md +++ /dev/null @@ -1,324 +0,0 @@ ---- -title: 有状态流处理 -nav-id: stateful-stream-processing -nav-pos: 2 -nav-title: 有状态流处理 -nav-parent_id: concepts ---- - - -* This will be replaced by the TOC -{:toc} - -## What is State? - -While many operations in a dataflow simply look at one individual *event at a -time* (for example an event parser), some operations remember information -across multiple events (for example window operators). These operations are -called **stateful**. - -Some examples of stateful operations: - - - When an application searches for certain event patterns, the state will - store the sequence of events encountered so far. - - When aggregating events per minute/hour/day, the state holds the pending - aggregates. - - When training a machine learning model over a stream of data points, the - state holds the current version of the model parameters. - - When historic data needs to be managed, the state allows efficient access - to events that occurred in the past. - -Flink needs to be aware of the state in order to make it fault tolerant using -[checkpoints]({% link dev/stream/state/checkpointing.zh.md %}) -and [savepoints]({%link ops/state/savepoints.zh.md %}). - -Knowledge about the state also allows for rescaling Flink applications, meaning -that Flink takes care of redistributing state across parallel instances. - -[Queryable state]({% link dev/stream/state/queryable_state.zh.md -%}) allows you to access state from outside of Flink during runtime. - -When working with state, it might also be useful to read about [Flink's state -backends]({% link ops/state/state_backends.zh.md %}). 
Flink -provides different state backends that specify how and where state is stored. - -{% top %} - -## Keyed State - -Keyed state is maintained in what can be thought of as an embedded key/value -store. The state is partitioned and distributed strictly together with the -streams that are read by the stateful operators. Hence, access to the key/value -state is only possible on *keyed streams*, i.e. after a keyed/partitioned data -exchange, and is restricted to the values associated with the current event's -key. Aligning the keys of streams and state makes sure that all state updates -are local operations, guaranteeing consistency without transaction overhead. -This alignment also allows Flink to redistribute the state and adjust the -stream partitioning transparently. - -State and Partitioning - -Keyed State is further organized into so-called *Key Groups*. Key Groups are -the atomic unit by which Flink can redistribute Keyed State; there are exactly -as many Key Groups as the defined maximum parallelism. During execution each -parallel instance of a keyed operator works with the keys for one or more Key -Groups. - -## State Persistence - -Flink implements fault tolerance using a combination of **stream replay** and -**checkpointing**. A checkpoint marks a specific point in each of the -input streams along with the corresponding state for each of the operators. A -streaming dataflow can be resumed from a checkpoint while maintaining -consistency *(exactly-once processing semantics)* by restoring the state of the -operators and replaying the records from the point of the checkpoint. - -The checkpoint interval is a means of trading off the overhead of fault -tolerance during execution with the recovery time (the number of records that -need to be replayed). - -The fault tolerance mechanism continuously draws snapshots of the distributed -streaming data flow. For streaming applications with small state, these -snapshots are very light-weight and can be drawn frequently without much impact -on performance. The state of the streaming applications is stored at a -configurable place, usually in a distributed file system. - -In case of a program failure (due to machine-, network-, or software failure), -Flink stops the distributed streaming dataflow. The system then restarts the -operators and resets them to the latest successful checkpoint. The input -streams are reset to the point of the state snapshot. Any records that are -processed as part of the restarted parallel dataflow are guaranteed to not have -affected the previously checkpointed state. - -{% info Note %} By default, checkpointing is disabled. See [Checkpointing]({% -link dev/stream/state/checkpointing.zh.md %}) for details on how to enable and -configure checkpointing. - -{% info Note %} For this mechanism to realize its full guarantees, the data -stream source (such as message queue or broker) needs to be able to rewind the -stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has -this ability and Flink's connector to Kafka exploits this. See [Fault -Tolerance Guarantees of Data Sources and Sinks]({% link -dev/connectors/guarantees.zh.md %}) for more information about the guarantees -provided by Flink's connectors. - -{% info Note %} Because Flink's checkpoints are realized through distributed -snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often -we also use the term *snapshot* to mean either *checkpoint* or *savepoint*. 
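To make the keyed state described above concrete, the following sketch keeps one running count per key in a `ValueState`. Because the state is accessed on a keyed stream, each parallel instance only ever sees the values of its own key groups. The event type (`Tuple2<String, Long>`) and the state name are illustrative choices for this example.

```java
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// Counts how many events were seen per key; the count survives failures
// because it is part of Flink's checkpointed keyed state.
public class CountPerKey extends RichFlatMapFunction<Tuple2<String, Long>, Tuple2<String, Long>> {

    private transient ValueState<Long> count;

    @Override
    public void open(Configuration parameters) {
        count = getRuntimeContext().getState(
                new ValueStateDescriptor<>("count", Long.class));
    }

    @Override
    public void flatMap(Tuple2<String, Long> event, Collector<Tuple2<String, Long>> out) throws Exception {
        Long current = count.value();                    // null for the first event of a key
        long updated = (current == null ? 0L : current) + 1;
        count.update(updated);
        out.collect(Tuple2.of(event.f0, updated));
    }
}
```

It would be applied after a keyed data exchange, for example `events.keyBy(e -> e.f0).flatMap(new CountPerKey())`.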
- -### Checkpointing - -The central part of Flink's fault tolerance mechanism is drawing consistent -snapshots of the distributed data stream and operator state. These snapshots -act as consistent checkpoints to which the system can fall back in case of a -failure. Flink's mechanism for drawing these snapshots is described in -"[Lightweight Asynchronous Snapshots for Distributed -Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard -[Chandy-Lamport -algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf) -for distributed snapshots and is specifically tailored to Flink's execution -model. - -Keep in mind that everything to do with checkpointing can be done -asynchronously. The checkpoint barriers don't travel in lock step and -operations can asynchronously snapshot their state. - - -#### Barriers - -A core element in Flink's distributed snapshotting are the *stream barriers*. -These barriers are injected into the data stream and flow with the records as -part of the data stream. Barriers never overtake records, they flow strictly in -line. A barrier separates the records in the data stream into the set of -records that goes into the current snapshot, and the records that go into the -next snapshot. Each barrier carries the ID of the snapshot whose records it -pushed in front of it. Barriers do not interrupt the flow of the stream and are -hence very lightweight. Multiple barriers from different snapshots can be in -the stream at the same time, which means that various snapshots may happen -concurrently. - -
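Since several snapshots can be in flight at the same time, the checkpoint configuration lets you bound that concurrency and, optionally, retain completed checkpoints when a job is cancelled. A small, illustrative sketch:

```java
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CheckpointConcurrencySetup {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(10_000);

        CheckpointConfig config = env.getCheckpointConfig();
        // Allow at most one checkpoint to be in progress at any point in time.
        config.setMaxConcurrentCheckpoints(1);
        // Keep the latest completed checkpoint if the job is cancelled, so it
        // can later be used to restore the job.
        config.enableExternalizedCheckpoints(
                CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    }
}
```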
-*Figure: Checkpoint barriers in data streams*
    - -Stream barriers are injected into the parallel data flow at the stream sources. -The point where the barriers for snapshot *n* are injected (let's call it -Sn) is the position in the source stream up to which the -snapshot covers the data. For example, in Apache Kafka, this position would be -the last record's offset in the partition. This position Sn -is reported to the *checkpoint coordinator* (Flink's JobManager). - -The barriers then flow downstream. When an intermediate operator has received a -barrier for snapshot *n* from all of its input streams, it emits a barrier for -snapshot *n* into all of its outgoing streams. Once a sink operator (the end of -a streaming DAG) has received the barrier *n* from all of its input streams, it -acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks -have acknowledged a snapshot, it is considered completed. - -Once snapshot *n* has been completed, the job will never again ask the source -for records from before Sn, since at that point these records -(and their descendant records) will have passed through the entire data flow -topology. - -
-*Figure: Aligning data streams at operators with multiple inputs*
    - -Operators that receive more than one input stream need to *align* the input -streams on the snapshot barriers. The figure above illustrates this: - - - As soon as the operator receives snapshot barrier *n* from an incoming - stream, it cannot process any further records from that stream until it has - received the barrier *n* from the other inputs as well. Otherwise, it would - mix records that belong to snapshot *n* and with records that belong to - snapshot *n+1*. - - Streams that report barrier *n* are temporarily set aside. Records that are - received from these streams are not processed, but put into an input - buffer. - - Once the last stream has received barrier *n*, the operator emits all - pending outgoing records, and then emits snapshot *n* barriers itself. - - After that, it resumes processing records from all input streams, - processing records from the input buffers before processing the records - from the streams. - -#### Snapshotting Operator State - -When operators contain any form of *state*, this state must be part of the -snapshots as well. - -Operators snapshot their state at the point in time when they have received all -snapshot barriers from their input streams, and before emitting the barriers to -their output streams. At that point, all updates to the state from records -before the barriers will have been made, and no updates that depend on records -from after the barriers have been applied. Because the state of a snapshot may -be large, it is stored in a configurable *[state backend]({% -link ops/state/state_backends.zh.md %})*. By default, this is the JobManager's -memory, but for production use a distributed reliable storage should be -configured (such as HDFS). After the state has been stored, the operator -acknowledges the checkpoint, emits the snapshot barrier into the output -streams, and proceeds. - -The resulting snapshot now contains: - - - For each parallel stream data source, the offset/position in the stream - when the snapshot was started - - For each operator, a pointer to the state that was stored as part of the - snapshot - -
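For jobs whose keyed state grows large, a common choice is the RocksDB state backend with incremental snapshots, so that each checkpoint only uploads the changes since the previous one. This sketch assumes the `flink-statebackend-rocksdb` dependency is available; the checkpoint URI is a placeholder.

```java
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class RocksDBBackendSetup {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Keep working state in embedded RocksDB instances on the TaskManagers and
        // write incremental snapshots of it to a durable file system.
        env.setStateBackend(new RocksDBStateBackend("hdfs://namenode:8020/flink/checkpoints", true));
    }
}
```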
-*Figure: Illustration of the Checkpointing Mechanism*
    - -#### Recovery - -Recovery under this mechanism is straightforward: Upon a failure, Flink selects -the latest completed checkpoint *k*. The system then re-deploys the entire -distributed dataflow, and gives each operator the state that was snapshotted as -part of checkpoint *k*. The sources are set to start reading the stream from -position Sk. For example in Apache Kafka, that means telling -the consumer to start fetching from offset Sk. - -If state was snapshotted incrementally, the operators start with the state of -the latest full snapshot and then apply a series of incremental snapshot -updates to that state. - -See [Restart Strategies]({% link dev/task_failure_recovery.zh.md -%}#restart-strategies) for more information. - -### State Backends - -The exact data structures in which the key/values indexes are stored depends on -the chosen [state backend]({% link -ops/state/state_backends.zh.md %}). One state backend stores data in an in-memory -hash map, another state backend uses [RocksDB](http://rocksdb.org) as the -key/value store. In addition to defining the data structure that holds the -state, the state backends also implement the logic to take a point-in-time -snapshot of the key/value state and store that snapshot as part of a -checkpoint. State backends can be configured without changing your application -logic. - -checkpoints and snapshots - -{% top %} - -### Savepoints - -All programs that use checkpointing can resume execution from a **savepoint**. -Savepoints allow both updating your programs and your Flink cluster without -losing any state. - -[Savepoints]({% link ops/state/savepoints.zh.md %}) are -**manually triggered checkpoints**, which take a snapshot of the program and -write it out to a state backend. They rely on the regular checkpointing -mechanism for this. - -Savepoints are similar to checkpoints except that they are -**triggered by the user** and **don't automatically expire** when newer -checkpoints are completed. - -{% top %} - -### Exactly Once vs. At Least Once - -The alignment step may add latency to the streaming program. Usually, this -extra latency is on the order of a few milliseconds, but we have seen cases -where the latency of some outliers increased noticeably. For applications that -require consistently super low latencies (few milliseconds) for all records, -Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint -snapshots are still drawn as soon as an operator has seen the checkpoint -barrier from each input. - -When the alignment is skipped, an operator keeps processing all inputs, even -after some checkpoint barriers for checkpoint *n* arrived. That way, the -operator also processes elements that belong to checkpoint *n+1* before the -state snapshot for checkpoint *n* was taken. On a restore, these records will -occur as duplicates, because they are both included in the state snapshot of -checkpoint *n*, and will be replayed as part of the data after checkpoint *n*. - -{% info Note %} Alignment happens only for operators with multiple predecessors -(joins) as well as operators with multiple senders (after a stream -repartitioning/shuffle). Because of that, dataflows with only embarrassingly -parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually -give *exactly once* guarantees even in *at least once* mode. 
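The switch mentioned above is the checkpointing mode: choosing at-least-once skips the alignment (and its latency) at the price of possible duplicates after a restore. A minimal sketch; the interval is an arbitrary example value.

```java
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class AtLeastOnceSetup {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Skip barrier alignment: lower latency, but records may be processed
        // more than once after a failure.
        env.enableCheckpointing(10_000, CheckpointingMode.AT_LEAST_ONCE);
    }
}
```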
- -{% top %} - -## State and Fault Tolerance in Batch Programs - -Flink executes [batch programs]({% link dev/batch/index.zh.md %}) as a special case of -streaming programs, where the streams are bounded (finite number of elements). -A *DataSet* is treated internally as a stream of data. The concepts above thus -apply to batch programs in the same way as well as they apply to streaming -programs, with minor exceptions: - - - [Fault tolerance for batch programs]({% link dev/task_failure_recovery.zh.md %}) - does not use checkpointing. Recovery happens by fully replaying the - streams. That is possible, because inputs are bounded. This pushes the - cost more towards the recovery, but makes the regular processing cheaper, - because it avoids checkpoints. - - - Stateful operations in the DataSet API use simplified in-memory/out-of-core - data structures, rather than key/value indexes. - - - The DataSet API introduces special synchronized (superstep-based) - iterations, which are only possible on bounded streams. For details, check - out the [iteration docs]({% link dev/batch/iterations.zh.md %}). - -{% top %} diff --git a/docs/concepts/timely-stream-processing.md b/docs/concepts/timely-stream-processing.md deleted file mode 100644 index f8a5b8dfa5435..0000000000000 --- a/docs/concepts/timely-stream-processing.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -title: Timely Stream Processing -nav-id: timely-stream-processing -nav-pos: 3 -nav-title: Timely Stream Processing -nav-parent_id: concepts ---- - - -* This will be replaced by the TOC -{:toc} - -## Introduction - -Timely stream processing is an extension of [stateful stream processing]({% link -concepts/stateful-stream-processing.md %}) in which time plays some role in the -computation. Among other things, this is the case when you do time series -analysis, when doing aggregations based on certain time periods (typically -called windows), or when you do event processing where the time when an event -occurred is important. - -In the following sections we will highlight some of the topics that you should -consider when working with timely Flink Applications. - -{% top %} - -## Notions of Time: Event Time and Processing Time - -When referring to time in a streaming program (for example to define windows), -one can refer to different notions of *time*: - -- **Processing time:** Processing time refers to the system time of the machine - that is executing the respective operation. - - When a streaming program runs on processing time, all time-based operations - (like time windows) will use the system clock of the machines that run the - respective operator. An hourly processing time window will include all - records that arrived at a specific operator between the times when the system - clock indicated the full hour. For example, if an application begins running - at 9:15am, the first hourly processing time window will include events - processed between 9:15am and 10:00am, the next window will include events - processed between 10:00am and 11:00am, and so on. - - Processing time is the simplest notion of time and requires no coordination - between streams and machines. It provides the best performance and the - lowest latency. 
However, in distributed and asynchronous environments - processing time does not provide determinism, because it is susceptible to - the speed at which records arrive in the system (for example from the message - queue), to the speed at which the records flow between operators inside the - system, and to outages (scheduled, or otherwise). - -- **Event time:** Event time is the time that each individual event occurred on - its producing device. This time is typically embedded within the records - before they enter Flink, and that *event timestamp* can be extracted from - each record. In event time, the progress of time depends on the data, not on - any wall clocks. Event time programs must specify how to generate *Event Time - Watermarks*, which is the mechanism that signals progress in event time. This - watermarking mechanism is described in a later section, - [below](#event-time-and-watermarks). - - In a perfect world, event time processing would yield completely consistent - and deterministic results, regardless of when events arrive, or their - ordering. However, unless the events are known to arrive in-order (by - timestamp), event time processing incurs some latency while waiting for - out-of-order events. As it is only possible to wait for a finite period of - time, this places a limit on how deterministic event time applications can - be. - - Assuming all of the data has arrived, event time operations will behave as - expected, and produce correct and consistent results even when working with - out-of-order or late events, or when reprocessing historic data. For example, - an hourly event time window will contain all records that carry an event - timestamp that falls into that hour, regardless of the order in which they - arrive, or when they are processed. (See the section on [late - events](#late-elements) for more information.) - - Note that sometimes when event time programs are processing live data in - real-time, they will use some *processing time* operations in order to - guarantee that they are progressing in a timely fashion. - -Event Time and Processing Time - -{% top %} - -## Event Time and Watermarks - -*Note: Flink implements many techniques from the Dataflow Model. For a good -introduction to event time and watermarks, have a look at the articles below.* - - - [Streaming - 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by - Tyler Akidau - - The [Dataflow Model - paper](https://research.google.com/pubs/archive/43864.pdf) - - -A stream processor that supports *event time* needs a way to measure the -progress of event time. For example, a window operator that builds hourly -windows needs to be notified when event time has passed beyond the end of an -hour, so that the operator can close the window in progress. - -*Event time* can progress independently of *processing time* (measured by wall -clocks). For example, in one program the current *event time* of an operator -may trail slightly behind the *processing time* (accounting for a delay in -receiving the events), while both proceed at the same speed. On the other -hand, another streaming program might progress through weeks of event time with -only a few seconds of processing, by fast-forwarding through some historic data -already buffered in a Kafka topic (or another message queue). - ------- - -The mechanism in Flink to measure progress in event time is **watermarks**. -Watermarks flow as part of the data stream and carry a timestamp *t*. 
A -*Watermark(t)* declares that event time has reached time *t* in that stream, -meaning that there should be no more elements from the stream with a timestamp -*t' <= t* (i.e. events with timestamps older or equal to the watermark). - -The figure below shows a stream of events with (logical) timestamps, and -watermarks flowing inline. In this example the events are in order (with -respect to their timestamps), meaning that the watermarks are simply periodic -markers in the stream. - -A data stream with events (in order) and watermarks - -Watermarks are crucial for *out-of-order* streams, as illustrated below, where -the events are not ordered by their timestamps. In general a watermark is a -declaration that by that point in the stream, all events up to a certain -timestamp should have arrived. Once a watermark reaches an operator, the -operator can advance its internal *event time clock* to the value of the -watermark. - -A data stream with events (out of order) and watermarks - -Note that event time is inherited by a freshly created stream element (or -elements) from either the event that produced them or from watermark that -triggered creation of those elements. - -### Watermarks in Parallel Streams - -Watermarks are generated at, or directly after, source functions. Each parallel -subtask of a source function usually generates its watermarks independently. -These watermarks define the event time at that particular parallel source. - -As the watermarks flow through the streaming program, they advance the event -time at the operators where they arrive. Whenever an operator advances its -event time, it generates a new watermark downstream for its successor -operators. - -Some operators consume multiple input streams; a union, for example, or -operators following a *keyBy(...)* or *partition(...)* function. Such an -operator's current event time is the minimum of its input streams' event times. -As its input streams update their event times, so does the operator. - -The figure below shows an example of events and watermarks flowing through -parallel streams, and operators tracking event time. - -Parallel data streams and operators with events and watermarks - -## Lateness - -It is possible that certain elements will violate the watermark condition, -meaning that even after the *Watermark(t)* has occurred, more elements with -timestamp *t' <= t* will occur. In fact, in many real world setups, certain -elements can be arbitrarily delayed, making it impossible to specify a time by -which all elements of a certain event timestamp will have occurred. -Furthermore, even if the lateness can be bounded, delaying the watermarks by -too much is often not desirable, because it causes too much delay in the -evaluation of event time windows. - -For this reason, streaming programs may explicitly expect some *late* elements. -Late elements are elements that arrive after the system's event time clock (as -signaled by the watermarks) has already passed the time of the late element's -timestamp. See [Allowed Lateness]({% link -dev/stream/operators/windows.md %}#allowed-lateness) for more information on -how to work with late elements in event time windows. - -## Windowing - -Aggregating events (e.g., counts, sums) works differently on streams than in -batch processing. For example, it is impossible to count all elements in a -stream, because streams are in general infinite (unbounded). 
Instead, -aggregates on streams (counts, sums, etc), are scoped by **windows**, such as -*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*. - -Windows can be *time driven* (example: every 30 seconds) or *data driven* -(example: every 100 elements). One typically distinguishes different types of -windows, such as *tumbling windows* (no overlap), *sliding windows* (with -overlap), and *session windows* (punctuated by a gap of inactivity). - -Time- and Count Windows - -Please check out this [blog -post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for -additional examples of windows or take a look a [window documentation]({% link -dev/stream/operators/windows.md %}) of the DataStream API. - -{% top %} diff --git a/docs/concepts/timely-stream-processing.zh.md b/docs/concepts/timely-stream-processing.zh.md deleted file mode 100644 index 54fcf1bea32fe..0000000000000 --- a/docs/concepts/timely-stream-processing.zh.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -title: 及时流处理 -nav-id: timely-stream-processing -nav-pos: 3 -nav-title: 及时流处理 -nav-parent_id: concepts ---- - - -* This will be replaced by the TOC -{:toc} - -## Introduction - -Timely steam processing is an extension of [stateful stream processing]({% link -concepts/stateful-stream-processing.zh.md %}) in which time plays some role in the -computation. Among other things, this is the case when you do time series -analysis, when doing aggregations based on certain time periods (typically -called windows), or when you do event processing where the time when an event -occurred is important. - -In the following sections we will highlight some of the topics that you should -consider when working with timely Flink Applications. - -{% top %} - -## Notions of Time: Event Time and Processing Time - -When referring to time in a streaming program (for example to define windows), -one can refer to different notions of *time*: - -- **Processing time:** Processing time refers to the system time of the machine - that is executing the respective operation. - - When a streaming program runs on processing time, all time-based operations - (like time windows) will use the system clock of the machines that run the - respective operator. An hourly processing time window will include all - records that arrived at a specific operator between the times when the system - clock indicated the full hour. For example, if an application begins running - at 9:15am, the first hourly processing time window will include events - processed between 9:15am and 10:00am, the next window will include events - processed between 10:00am and 11:00am, and so on. - - Processing time is the simplest notion of time and requires no coordination - between streams and machines. It provides the best performance and the - lowest latency. However, in distributed and asynchronous environments - processing time does not provide determinism, because it is susceptible to - the speed at which records arrive in the system (for example from the message - queue), to the speed at which the records flow between operators inside the - system, and to outages (scheduled, or otherwise). - -- **Event time:** Event time is the time that each individual event occurred on - its producing device. This time is typically embedded within the records - before they enter Flink, and that *event timestamp* can be extracted from - each record. In event time, the progress of time depends on the data, not on - any wall clocks. 
Event time programs must specify how to generate *Event Time - Watermarks*, which is the mechanism that signals progress in event time. This - watermarking mechanism is described in a later section, - [below](#event-time-and-watermarks). - - In a perfect world, event time processing would yield completely consistent - and deterministic results, regardless of when events arrive, or their - ordering. However, unless the events are known to arrive in-order (by - timestamp), event time processing incurs some latency while waiting for - out-of-order events. As it is only possible to wait for a finite period of - time, this places a limit on how deterministic event time applications can - be. - - Assuming all of the data has arrived, event time operations will behave as - expected, and produce correct and consistent results even when working with - out-of-order or late events, or when reprocessing historic data. For example, - an hourly event time window will contain all records that carry an event - timestamp that falls into that hour, regardless of the order in which they - arrive, or when they are processed. (See the section on [late - events](#late-elements) for more information.) - - Note that sometimes when event time programs are processing live data in - real-time, they will use some *processing time* operations in order to - guarantee that they are progressing in a timely fashion. - -Event Time and Processing Time - -{% top %} - -## Event Time and Watermarks - -*Note: Flink implements many techniques from the Dataflow Model. For a good -introduction to event time and watermarks, have a look at the articles below.* - - - [Streaming - 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by - Tyler Akidau - - The [Dataflow Model - paper](https://research.google.com/pubs/archive/43864.pdf) - - -A stream processor that supports *event time* needs a way to measure the -progress of event time. For example, a window operator that builds hourly -windows needs to be notified when event time has passed beyond the end of an -hour, so that the operator can close the window in progress. - -*Event time* can progress independently of *processing time* (measured by wall -clocks). For example, in one program the current *event time* of an operator -may trail slightly behind the *processing time* (accounting for a delay in -receiving the events), while both proceed at the same speed. On the other -hand, another streaming program might progress through weeks of event time with -only a few seconds of processing, by fast-forwarding through some historic data -already buffered in a Kafka topic (or another message queue). - ------- - -The mechanism in Flink to measure progress in event time is **watermarks**. -Watermarks flow as part of the data stream and carry a timestamp *t*. A -*Watermark(t)* declares that event time has reached time *t* in that stream, -meaning that there should be no more elements from the stream with a timestamp -*t' <= t* (i.e. events with timestamps older or equal to the watermark). - -The figure below shows a stream of events with (logical) timestamps, and -watermarks flowing inline. In this example the events are in order (with -respect to their timestamps), meaning that the watermarks are simply periodic -markers in the stream. - -A data stream with events (in order) and watermarks - -Watermarks are crucial for *out-of-order* streams, as illustrated below, where -the events are not ordered by their timestamps. 
In general a watermark is a -declaration that by that point in the stream, all events up to a certain -timestamp should have arrived. Once a watermark reaches an operator, the -operator can advance its internal *event time clock* to the value of the -watermark. - -A data stream with events (out of order) and watermarks - -Note that event time is inherited by a freshly created stream element (or -elements) from either the event that produced them or from watermark that -triggered creation of those elements. - -### Watermarks in Parallel Streams - -Watermarks are generated at, or directly after, source functions. Each parallel -subtask of a source function usually generates its watermarks independently. -These watermarks define the event time at that particular parallel source. - -As the watermarks flow through the streaming program, they advance the event -time at the operators where they arrive. Whenever an operator advances its -event time, it generates a new watermark downstream for its successor -operators. - -Some operators consume multiple input streams; a union, for example, or -operators following a *keyBy(...)* or *partition(...)* function. Such an -operator's current event time is the minimum of its input streams' event times. -As its input streams update their event times, so does the operator. - -The figure below shows an example of events and watermarks flowing through -parallel streams, and operators tracking event time. - -Parallel data streams and operators with events and watermarks - -## Lateness - -It is possible that certain elements will violate the watermark condition, -meaning that even after the *Watermark(t)* has occurred, more elements with -timestamp *t' <= t* will occur. In fact, in many real world setups, certain -elements can be arbitrarily delayed, making it impossible to specify a time by -which all elements of a certain event timestamp will have occurred. -Furthermore, even if the lateness can be bounded, delaying the watermarks by -too much is often not desirable, because it causes too much delay in the -evaluation of event time windows. - -For this reason, streaming programs may explicitly expect some *late* elements. -Late elements are elements that arrive after the system's event time clock (as -signaled by the watermarks) has already passed the time of the late element's -timestamp. See [Allowed Lateness]({% link -dev/stream/operators/windows.zh.md %}#allowed-lateness) for more information on -how to work with late elements in event time windows. - -## Windowing - -Aggregating events (e.g., counts, sums) works differently on streams than in -batch processing. For example, it is impossible to count all elements in a -stream, because streams are in general infinite (unbounded). Instead, -aggregates on streams (counts, sums, etc), are scoped by **windows**, such as -*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*. - -Windows can be *time driven* (example: every 30 seconds) or *data driven* -(example: every 100 elements). One typically distinguishes different types of -windows, such as *tumbling windows* (no overlap), *sliding windows* (with -overlap), and *session windows* (punctuated by a gap of inactivity). - -Time- and Count Windows - -Please check out this [blog -post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for -additional examples of windows or take a look a [window documentation]({% link -dev/stream/operators/windows.zh.md %}) of the DataStream API. 
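The pieces discussed on this page (event timestamps, watermarks and windows) come together in a few lines of DataStream API code. The sketch below is illustrative only: the tuple layout (key, event timestamp in milliseconds), the five-second out-of-orderness bound and the five-minute tumbling window are assumptions made for the example.

```java
import java.time.Duration;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class EventTimeWindowSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // (key, event timestamp in ms) - a stand-in for a real source.
        DataStream<Tuple2<String, Long>> events = env.fromElements(
                Tuple2.of("sensor-1", 1_000L),
                Tuple2.of("sensor-2", 2_000L),
                Tuple2.of("sensor-1", 310_000L));

        events
            // Extract timestamps and emit watermarks that tolerate 5 s of out-of-orderness.
            .assignTimestampsAndWatermarks(
                WatermarkStrategy
                    .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
                    .withTimestampAssigner((event, previousTimestamp) -> event.f1))
            .keyBy(event -> event.f0)
            // Tumbling event-time windows of 5 minutes per key.
            .window(TumblingEventTimeWindows.of(Time.minutes(5)))
            // Keep the record with the latest timestamp in each window (any reduce would do).
            .reduce((a, b) -> a.f1 >= b.f1 ? a : b)
            .print();

        env.execute("Event-time windowing sketch");
    }
}
```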
- -{% top %} diff --git a/docs/config.toml b/docs/config.toml new file mode 100644 index 0000000000000..aeeb39a74fbd5 --- /dev/null +++ b/docs/config.toml @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +baseURL = '//ci.apache.org/projects/flink/flink-docs-master' +languageCode = "en-us" +title = "Apache Flink" +enableGitInfo = false +theme = "book" +pygmentsUseClasses = true + +[params] + # Flag whether this is a stable version or not. + # Used for the quickstart page. + IsStable = false + + # Flag to indicate whether an outdated warning should be shown. + ShowOutDatedWarning = false + + # This are the version referenced in the docs. Please only use these variables + # to reference a specific Flink version, because this is the only place where + # we change the version for the complete docs when forking of a release branch + # etc. + # The full version string as referenced in Maven (e.g. 1.2.1) + Version = "1.14-SNAPSHOT" + + # For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot + # release this should be the same as the regular version + VersionTitle = "1.14-SNAPSHOT" + + # The branch for this version of Apache Flink + Branch = "master" + + # The github repository for Apache Flink + Repo = "//github.com/apache/flink" + + GithubRepo = "https://github.com/apache/flink.git" + + # Flink training exercises + TrainingExercises = "//github.com/apache/flink-training" + + # This suffix is appended to the Scala-dependent Maven artifact names + ScalaVersion = "_2.11" + + ProjectHomepage = "//flink.apache.org" + + JavaDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/java/" + + ScalaDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/scala/index.html#org.apache.flink.api.scala.package" + + PyDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/python/" + + # External links at the bottom + # of the menu + MenuLinks = [ + ["Project Homepage", "//flink.apache.org"], + ["JavaDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/java/"], + ["ScalaDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/scala/index.html#org.apache.flink.api.scala.package"], + ["PyDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/python/"] + ] + + PreviousDocs = [ + ["1.12", "http://ci.apache.org/projects/flink/flink-docs-release-1.12"], + ["1.11", "http://ci.apache.org/projects/flink/flink-docs-release-1.11"], + ["1.10", "http://ci.apache.org/projects/flink/flink-docs-release-1.10"], + ["1.9", "http://ci.apache.org/projects/flink/flink-docs-release-1.9"], + ["1.8", "http://ci.apache.org/projects/flink/flink-docs-release-1.8"], + ["1.7", "http://ci.apache.org/projects/flink/flink-docs-release-1.7"], + ["1.6", "http://ci.apache.org/projects/flink/flink-docs-release-1.6"], + 
["1.5", "http://ci.apache.org/projects/flink/flink-docs-release-1.5"], + ["1.4", "http://ci.apache.org/projects/flink/flink-docs-release-1.4"], + ["1.3", "http://ci.apache.org/projects/flink/flink-docs-release-1.3"], + ["1.2", "http://ci.apache.org/projects/flink/flink-docs-release-1.2"], + ["1.1", "http://ci.apache.org/projects/flink/flink-docs-release-1.1"], + ["1.0", "http://ci.apache.org/projects/flink/flink-docs-release-1.0"] + ] + +[markup] +[markup.goldmark.renderer] + unsafe = true + +[languages] +[languages.en] + languageName = 'English' + contentDir = 'content' + weight = 1 + +[languages.zh] + languageName = '中文版' + contentDir = 'content.zh' + weight = 2 diff --git a/docs/connectors/index.md b/docs/connectors/index.md deleted file mode 100644 index 5a744166b3ccc..0000000000000 --- a/docs/connectors/index.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "Connectors" -nav-id: connectors-root -nav-title: ' Connectors' -nav-parent_id: root -nav-pos: 7 ---- - - -* toc -{:toc} diff --git a/docs/connectors/index.zh.md b/docs/connectors/index.zh.md deleted file mode 100644 index 5a744166b3ccc..0000000000000 --- a/docs/connectors/index.zh.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "Connectors" -nav-id: connectors-root -nav-title: ' Connectors' -nav-parent_id: root -nav-pos: 7 ---- - - -* toc -{:toc} diff --git a/docs/content.zh/_index.md b/docs/content.zh/_index.md new file mode 100644 index 0000000000000..c1e782d1fdcaf --- /dev/null +++ b/docs/content.zh/_index.md @@ -0,0 +1,88 @@ +--- +title: Apache Flink Documentation +type: docs +bookToc: false +aliases: + - /zh/examples/index.html + - /zh/getting-started/examples/index.html +--- + + +# Apache Flink Documentation + +{{< center >}} +**Apache Flink** is a framework and distributed processing engine for stateful computations over *unbounded* and *bounded* data streams. Flink has been designed to run in *all common cluster environments* perform computations at *in-memory* speed and at *any scale*. +{{< /center >}} + +{{< columns >}} + +### Try Flink + +If you’re interested in playing around with Flink, try one of our tutorials: + +* [Fraud Detection with the DataStream API]({{< ref "docs/try-flink/datastream" >}}) +* [Real Time Reporting with the Table API]({{< ref "docs/try-flink/table_api" >}}) +* [Intro to PyFlink]({{< ref "docs/dev/python/overview" >}}) +* [Flink Operations Playground]({{< ref "docs/try-flink/flink-operations-playground" >}}) + +### Learn Flink + +* To dive in deeper, the [Hands-on Training]({{< ref "docs/learn-flink/overview" >}}) includes a set of lessons and exercises that provide a step-by-step introduction to Flink. + +* The [Concepts]({{< ref "docs/concepts/overview" >}}) section explains what you need to know about Flink before exploring the reference documentation. + +### Get Help with Flink + +If you get stuck, check out our [community support resources](https://flink.apache.org/community.html). In particular, Apache Flink’s user mailing list is consistently ranked as one of the most active of any Apache project, and is a great way to get help quickly. + +<---> + +### Explore Flink + +The reference documentation covers all the details. 
Some starting points: + +{{< columns >}} +* [DataStream API]({{< ref "docs/dev/datastream/overview" >}}) +* [Table API & SQL]({{< ref "docs/dev/table/overview" >}}) +* [Stateful Functions](https://ci.apache.org/projects/flink/flink-statefun-docs-stable/) + +<---> + +* [Configuration]({{< ref "docs/deployment/config" >}}) +* [Rest API]({{< ref "docs/ops/rest_api" >}}) +* [CLI]({{< ref "docs/deployment/cli" >}}) +{{< /columns >}} + +### Deploy Flink + +Before putting your Flink job into production, read the [Production Readiness Checklist]({{< ref "docs/ops/production_ready" >}}). +For an overview of possible deployment targets, see [Clusters and Deployments]({{< ref "docs/deployment/overview" >}}). + +### Upgrade Flink + +Release notes cover important changes between Flink versions. Please read them carefully if you plan to upgrade your Flink setup. + + +See the release notes for [Flink 1.12]({{< ref "/release-notes/flink-1.12.md" >}}), [Flink 1.11]({{< ref "/release-notes/flink-1.11.md" >}}), [Flink 1.10]({{< ref "/release-notes/flink-1.10.md" >}}), [Flink 1.9]({{< ref "/release-notes/flink-1.9.md" >}}), [Flink 1.8]({{< ref "/release-notes/flink-1.8.md" >}}), or [Flink 1.7]({{< ref "/release-notes/flink-1.7.md" >}}). + +{{< /columns >}} \ No newline at end of file diff --git a/docs/content.zh/docs/concepts/_index.md b/docs/content.zh/docs/concepts/_index.md new file mode 100644 index 0000000000000..040815aa98e63 --- /dev/null +++ b/docs/content.zh/docs/concepts/_index.md @@ -0,0 +1,25 @@ +--- +title: 概念透析 +icon: +bold: true +bookCollapseSection: true +weight: 3 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/concepts/flink-architecture.md b/docs/content.zh/docs/concepts/flink-architecture.md new file mode 100644 index 0000000000000..e4c7b0f5174a4 --- /dev/null +++ b/docs/content.zh/docs/concepts/flink-architecture.md @@ -0,0 +1,139 @@ +--- +title: Flink 架构 +weight: 4 +type: docs +nav-title: Flink 架构 +--- + + +# Flink 架构 + +Flink 是一个分布式系统,需要有效分配和管理计算资源才能执行流应用程序。它集成了所有常见的集群资源管理器,例如[Hadoop YARN](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html)、[Apache Mesos](https://mesos.apache.org/)和[Kubernetes](https://kubernetes.io/),但也可以设置作为独立集群甚至库运行。 + +本节概述了 Flink 架构,并且描述了其主要组件如何交互以执行应用程序和从故障中恢复。 + +## Flink 集群剖析 + +Flink 运行时由两种类型的进程组成:一个 _JobManager_ 和一个或者多个 _TaskManager_。 + +{{< img src="/fig/processes.svg" alt="The processes involved in executing a Flink dataflow" class="offset" width="70%" >}} + +*Client* 不是运行时和程序执行的一部分,而是用于准备数据流并将其发送给 JobManager。之后,客户端可以断开连接(_分离模式_),或保持连接来接收进程报告(_附加模式_)。客户端可以作为触发执行 Java/Scala 程序的一部分运行,也可以在命令行进程`./bin/flink run ...`中运行。 + +可以通过多种方式启动 JobManager 和 TaskManager:直接在机器上作为[standalone 集群]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})启动、在容器中启动、或者通过[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}})或[Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}})等资源框架管理并启动。TaskManager 连接到 JobManagers,宣布自己可用,并被分配工作。 + +### JobManager + +_JobManager_ 具有许多与协调 Flink 应用程序的分布式执行有关的职责:它决定何时调度下一个 task(或一组 task)、对完成的 task 或执行失败做出反应、协调 checkpoint、并且协调从失败中恢复等等。这个进程由三个不同的组件组成: + + * **ResourceManager** + + _ResourceManager_ 负责 Flink 集群中的资源提供、回收、分配 - 它管理 **task slots**,这是 Flink 集群中资源调度的单位(请参考[TaskManagers](#taskmanagers))。Flink 为不同的环境和资源提供者(例如 YARN、Mesos、Kubernetes 和 standalone 部署)实现了对应的 ResourceManager。在 standalone 设置中,ResourceManager 只能分配可用 TaskManager 的 slots,而不能自行启动新的 TaskManager。 + + * **Dispatcher** + + _Dispatcher_ 提供了一个 REST 接口,用来提交 Flink 应用程序执行,并为每个提交的作业启动一个新的 JobMaster。它还运行 Flink WebUI 
用来提供作业执行信息。 + + * **JobMaster** + + _JobMaster_ 负责管理单个[JobGraph]({{< ref "docs/concepts/glossary" >}}#logical-graph)的执行。Flink 集群中可以同时运行多个作业,每个作业都有自己的 JobMaster。 + +始终至少有一个 JobManager。高可用(HA)设置中可能有多个 JobManager,其中一个始终是 *leader*,其他的则是 *standby*(请参考 [高可用(HA)]({{< ref "docs/deployment/ha/overview" >}}))。 + +### TaskManagers + +*TaskManager*(也称为 *worker*)执行作业流的 task,并且缓存和交换数据流。 + +必须始终至少有一个 TaskManager。在 TaskManager 中资源调度的最小单位是 task _slot_。TaskManager 中 task slot 的数量表示并发处理 task 的数量。请注意一个 task slot 中可以执行多个算子(请参考[Tasks 和算子链](#tasks-and-operator-chains))。 + +{{< top >}} + +## Tasks 和算子链 + +对于分布式执行,Flink 将算子的 subtasks *链接*成 *tasks*。每个 task 由一个线程执行。将算子链接成 task 是个有用的优化:它减少线程间切换、缓冲的开销,并且减少延迟的同时增加整体吞吐量。链行为是可以配置的;请参考[链文档]({{< ref "docs/dev/datastream/operators/overview" >}}#task-chaining-and-resource-groups)以获取详细信息。 + +下图中样例数据流用 5 个 subtask 执行,因此有 5 个并行线程。 + +{{< img src="/fig/tasks_chains.svg" alt="Operator chaining into Tasks" class="offset" width="80%" >}} + +{{< top >}} + +## Task Slots 和资源 + +每个 worker(TaskManager)都是一个 *JVM 进程*,可以在单独的线程中执行一个或多个 subtask。为了控制一个 TaskManager 中接受多少个 task,就有了所谓的 **task slots**(至少一个)。 + +每个 *task slot* 代表 TaskManager 中资源的固定子集。例如,具有 3 个 slot 的 TaskManager,会将其托管内存 1/3 用于每个 slot。分配资源意味着 subtask 不会与其他作业的 subtask 竞争托管内存,而是具有一定数量的保留托管内存。注意此处没有 CPU 隔离;当前 slot 仅分离 task 的托管内存。 + +通过调整 task slot 的数量,用户可以定义 subtask 如何互相隔离。每个 TaskManager 有一个 slot,这意味着每个 task 组都在单独的 JVM 中运行(例如,可以在单独的容器中启动)。具有多个 slot 意味着更多 subtask 共享同一 JVM。同一 JVM 中的 task 共享 TCP 连接(通过多路复用)和心跳信息。它们还可以共享数据集和数据结构,从而减少了每个 task 的开销。 + +{{< img src="/fig/tasks_slots.svg" alt="A TaskManager with Task Slots and Tasks" class="offset" width="80%" >}} + +默认情况下,Flink 允许 subtask 共享 slot,即便它们是不同的 task 的 subtask,只要是来自于同一作业即可。结果就是一个 slot 可以持有整个作业管道。允许 *slot 共享*有两个主要优点: + + - Flink 集群所需的 task slot 和作业中使用的最大并行度恰好一样。无需计算程序总共包含多少个 task(具有不同并行度)。 + + - 容易获得更好的资源利用。如果没有 slot 共享,非密集 subtask(*source/map()*)将阻塞和密集型 subtask(*window*) 一样多的资源。通过 slot 共享,我们示例中的基本并行度从 2 增加到 6,可以充分利用分配的资源,同时确保繁重的 subtask 在 TaskManager 之间公平分配。 + +{{< img src="/fig/slot_sharing.svg" alt="TaskManagers with shared Task Slots" class="offset" width="80%" >}} + +## Flink 应用程序执行 + +_Flink 应用程序_ 是从其 ``main()`` 方法产生的一个或多个 Flink 作业的任何用户程序。这些作业的执行可以在本地 JVM(`LocalEnvironment``)中进行,或具有多台机器的集群的远程设置(``RemoteEnvironment``)中进行。对于每个程序,[``ExecutionEnvironment``]({{ site.javadocs_baseurl }}/api/java/) 提供了一些方法来控制作业执行(例如设置并行度)并与外界交互(请参考 [Flink 程序剖析]({{< ref "docs/dev/datastream/overview" >}}#anatomy-of-a-flink-program) )。 + +Flink 应用程序的作业可以被提交到长期运行的 [Flink Session 集群]({{< ref "docs/concepts/glossary" >}}#flink-session-cluster)、专用的 [Flink Job 集群]({{< ref "docs/concepts/glossary" >}}#flink-job-cluster) 或 [Flink Application 集群]({{< ref "docs/concepts/glossary" >}}#flink-application-cluster)。这些选项之间的差异主要与集群的生命周期和资源隔离保证有关。 + +### Flink Session 集群 + +* **集群生命周期**:在 Flink Session 集群中,客户端连接到一个预先存在的、长期运行的集群,该集群可以接受多个作业提交。即使所有作业完成后,集群(和 JobManager)仍将继续运行直到手动停止 session 为止。因此,Flink Session 集群的寿命不受任何 Flink 作业寿命的约束。 + +* **资源隔离**:TaskManager slot 由 ResourceManager 在提交作业时分配,并在作业完成时释放。由于所有作业都共享同一集群,因此在集群资源方面存在一些竞争 — 例如提交工作阶段的网络带宽。此共享设置的局限性在于,如果 TaskManager 崩溃,则在此 TaskManager 上运行 task 的所有作业都将失败;类似的,如果 JobManager 上发生一些致命错误,它将影响集群中正在运行的所有作业。 + +* **其他注意事项**:拥有一个预先存在的集群可以节省大量时间申请资源和启动 TaskManager。有种场景很重要,作业执行时间短并且启动时间长会对端到端的用户体验产生负面的影响 — 就像对简短查询的交互式分析一样,希望作业可以使用现有资源快速执行计算。 + +{{< hint info >}} +以前,Flink Session 集群也被称为 session 模式下的 Flink 集群。 +{{< /hint >}} + +### Flink Job 集群 + +* **集群生命周期**:在 Flink Job 集群中,可用的集群管理器(例如 YARN)用于为每个提交的作业启动一个集群,并且该集群仅可用于该作业。在这里,客户端首先从集群管理器请求资源启动 JobManager,然后将作业提交给在这个进程中运行的 
Dispatcher。然后根据作业的资源请求惰性的分配 TaskManager。一旦作业完成,Flink Job 集群将被拆除。 + +* **资源隔离**:JobManager 中的致命错误仅影响在 Flink Job 集群中运行的一个作业。 + +* **其他注意事项**:由于 ResourceManager 必须应用并等待外部资源管理组件来启动 TaskManager 进程和分配资源,因此 Flink Job 集群更适合长期运行、具有高稳定性要求且对较长的启动时间不敏感的大型作业。 + +{{< hint info >}} +以前,Flink Job 集群也被称为 job (or per-job) 模式下的 Flink 集群。 +{{< /hint >}} +{{< hint info >}} +Kubernetes 不支持 Flink Job 集群。 请参考 [Standalone Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}#per-job-cluster-mode) 和 [Native Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}#per-job-cluster-mode)。 +{{< /hint >}} + +### Flink Application 集群 + +* **集群生命周期**:Flink Application 集群是专用的 Flink 集群,仅从 Flink 应用程序执行作业,并且 ``main()``方法在集群上而不是客户端上运行。提交作业是一个单步骤过程:无需先启动 Flink 集群,然后将作业提交到现有的 session 集群;相反,将应用程序逻辑和依赖打包成一个可执行的作业 JAR 中,并且集群入口(``ApplicationClusterEntryPoint``)负责调用 ``main()``方法来提取 JobGraph。例如,这允许你像在 Kubernetes 上部署任何其他应用程序一样部署 Flink 应用程序。因此,Flink Application 集群的寿命与 Flink 应用程序的寿命有关。 + +* **资源隔离**:在 Flink Application 集群中,ResourceManager 和 Dispatcher 作用于单个的 Flink 应用程序,相比于 Flink Session 集群,它提供了更好的隔离。 + +{{< hint info >}} +Flink Job 集群可以看做是 Flink Application 集群”客户端运行“的替代方案。 +{{< /hint >}} + +{{< top >}} diff --git a/docs/content.zh/docs/concepts/glossary.md b/docs/content.zh/docs/concepts/glossary.md new file mode 100644 index 0000000000000..c1ea57929b9c4 --- /dev/null +++ b/docs/content.zh/docs/concepts/glossary.md @@ -0,0 +1,145 @@ +--- +title: 词汇表 +weight: 11 +type: docs +bookToc: false +--- + + +# 词汇表 + +#### Flink Application Cluster + +A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that +only executes [Flink Jobs](#flink-job) from one [Flink Application](#flink-application). +The lifetime of the [Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Application. + +#### Flink Job Cluster + +A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only +executes a single [Flink Job](#flink-job). The lifetime of the +[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job. + +#### Flink Cluster + +一般情况下,Flink 集群是由一个 [Flink JobManager](#flink-jobmanager) 和一个或多个 [Flink TaskManager](#flink-taskmanager) 进程组成的分布式系统。 + +#### Event + +Event 是对应用程序建模的域的状态更改的声明。它可以同时为流或批处理应用程序的 input 和 output,也可以单独是 input 或者 output 中的一种。Event 是特殊类型的 [Record](#record)。 + +#### ExecutionGraph + +见 [Physical Graph](#physical-graph)。 + +#### Function + +Function 是由用户实现的,并封装了 Flink 程序的应用程序逻辑。大多数 Function 都由相应的 [Operator](#operator) 封装。 + +#### Instance + +Instance 常用于描述运行时的特定类型(通常是 [Operator](#operator) 或者 [Function](#function))的一个具体实例。由于 Apache Flink 主要是用 Java 编写的,所以,这与 Java 中的 *Instance* 或 *Object* 的定义相对应。在 Apache Flink 的上下文中,*parallel instance* 也常用于强调同一 [Operator](#operator) 或者 [Function](#function) 的多个 instance 以并行的方式运行。 + +#### Flink Application + +A Flink application is a Java Application that submits one or multiple [Flink +Jobs](#flink-job) from the `main()` method (or by some other means). Submitting +jobs is usually done by calling `execute()` on an execution environment. + +The jobs of an application can either be submitted to a long running [Flink +Session Cluster](#flink-session-cluster), to a dedicated [Flink Application +Cluster](#flink-application-cluster), or to a [Flink Job +Cluster](#flink-job-cluster). 
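+
+For illustration, a minimal Flink Application might look like the following sketch (the class
+name and job name are placeholders, not Flink API):
+
+```java
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+public class MyFlinkApplication {
+    public static void main(String[] args) throws Exception {
+        // the execution environment is the entry point of a Flink Application
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+        env.fromElements("a", "b", "c").print();
+
+        // execute() builds the dataflow defined above and submits it as a Flink Job
+        env.execute("example-application");
+    }
+}
+```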
+ +#### Flink Job + +A Flink Job is the runtime representation of a [logical graph](#logical-graph) +(also often called dataflow graph) that is created and submitted by calling +`execute()` in a [Flink Application](#flink-application). + +#### JobGraph + +见 [Logical Graph](#logical-graph)。 + +#### Flink JobManager + +Flink JobManager 是 [Flink Cluster](#flink-cluster) 的主节点。它包含三个不同的组件:Flink Resource Manager、Flink Dispatcher、运行每个 [Flink Job](#flink-job) 的 [Flink JobMaster](#flink-jobmaster)。 + + +#### Flink JobMaster + +JobMaster 是在 [Flink JobManager](#flink-jobmanager) 运行中的组件之一。JobManager 负责监督单个作业 [Task](#task) 的执行。以前,整个 [Flink JobManager](#flink-jobmanager) 都叫做 JobManager。 + +#### Logical Graph + +A logical graph is a directed graph where the nodes are [Operators](#operator) +and the edges define input/output-relationships of the operators and correspond +to data streams or data sets. A logical graph is created by submitting jobs +from a [Flink Application](#flink-application). + +Logical graphs are also often referred to as *dataflow graphs*. + +#### Managed State + +Managed State 描述了已在框架中注册的应用程序的托管状态。对于托管状态,Apache Flink 会负责持久化和重伸缩等事宜。 + +#### Operator + +[Logical Graph](#logical-graph) 的节点。算子执行某种操作,该操作通常由 [Function](#function) 执行。Source 和 Sink 是数据输入和数据输出的特殊算子。 + +#### Operator Chain + +算子链由两个或多个连续的 [Operator](#operator) 组成,两者之间没有任何的重新分区。同一算子链内的算子可以彼此直接传递 record,而无需通过序列化或 Flink 的网络栈。 + +#### Partition + +分区是整个数据流或数据集的独立子集。通过将每个 [Record](#record) 分配给一个或多个分区,来把数据流或数据集划分为多个分区。在运行期间,[Task](#task) 会消费数据流或数据集的分区。改变数据流或数据集分区方式的转换通常称为重分区。 + +#### Physical Graph + +Physical graph 是一个在分布式运行时,把 [Logical Graph](#logical-graph) 转换为可执行的结果。节点是 [Task](#task),边表示数据流或数据集的输入/输出关系或 [partition](#partition)。 + +#### Record + +Record 是数据集或数据流的组成元素。[Operator](#operator) 和 [Function](#Function)接收 record 作为输入,并将 record 作为输出发出。 + +#### Flink Session Cluster + +长时间运行的 [Flink Cluster](#flink-cluster),它可以接受多个 [Flink Job](#flink-job) 的执行。此 [Flink Cluster](#flink-cluster) 的生命周期不受任何 [Flink Job](#flink-job) 生命周期的约束限制。以前,Flink Session Cluster 也称为 *session mode* 的 [Flink Cluster](#flink-cluster),和 [Flink Application Cluster](#flink-application-cluster) 相对应。 + +#### State Backend + +对于流处理程序,[Flink Job](#flink-job) 的 State Backend 决定了其 [state](#managed-state) 是如何存储在每个 TaskManager 上的( TaskManager 的 Java 堆栈或嵌入式 RocksDB),以及它在 checkpoint 时的写入位置( [Flink JobManager](#flink-jobmanager) 的 Java 堆或者 Filesystem)。 + +#### Sub-Task + +Sub-Task 是负责处理数据流 [Partition](#partition) 的 [Task](#task)。"Sub-Task"强调的是同一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 具有多个并行的 Task 。 + +#### Task + +Task 是 [Physical Graph](#physical-graph) 的节点。它是基本的工作单元,由 Flink 的 runtime 来执行。Task 正好封装了一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 的 *parallel instance*。 + +#### Flink TaskManager + +TaskManager 是 [Flink Cluster](#flink-cluster) 的工作进程。[Task](#task) 被调度到 TaskManager 上执行。TaskManager 相互通信,只为在后续的 Task 之间交换数据。 + +#### Transformation + +Transformation 应用于一个或多个数据流或数据集,并产生一个或多个输出数据流或数据集。Transformation 可能会在每个记录的基础上更改数据流或数据集,但也可以只更改其分区或执行聚合。虽然 [Operator](#operator) 和 [Function](#function) 是 Flink API 的“物理”部分,但 Transformation 只是一个 API 概念。具体来说,大多数(但不是全部)Transformation 是由某些 [Operator](#operator) 实现的。 diff --git a/docs/content.zh/docs/concepts/overview.md b/docs/content.zh/docs/concepts/overview.md new file mode 100644 index 0000000000000..12e512911f70b --- /dev/null +++ b/docs/content.zh/docs/concepts/overview.md @@ -0,0 +1,50 @@ +--- +title: 概览 +weight: 1 +type: docs +aliases: + - /zh/concepts/ + - /zh/concepts/concepts.html +--- + + +# 
概念透析 + +[实践练习]({{< ref "docs/learn-flink/overview" >}})章节介绍了作为 Flink API 根基的有状态实时流处理的基本概念,并且举例说明了如何在 Flink 应用中使用这些机制。其中 [Data Pipelines & ETL]({{< ref "docs/learn-flink/etl" >}}#stateful-transformations) 小节介绍了有状态流处理的概念,并且在 [Fault Tolerance]({{< ref "docs/learn-flink/fault_tolerance" >}}) 小节中进行了深入介绍。[Streaming Analytics]({{< ref "docs/learn-flink/streaming_analytics" >}}) 小节介绍了实时流处理的概念。 + +本章将深入分析 Flink 分布式运行时架构如何实现这些概念。 + +## Flink 中的 API + +Flink 为流式/批式处理应用程序的开发提供了不同级别的抽象。 + +{{< img src="/fig/levels_of_abstraction.svg" alt="Programming levels of abstraction" class="offset" width="80%" >}} + + - Flink API 最底层的抽象为**有状态实时流处理**。其抽象实现是 [Process Function]({{< ref "docs/dev/datastream/operators/process_function" >}}),并且 **Process Function** 被 Flink 框架集成到了 [DataStream API]({{< ref "docs/dev/datastream/overview" >}}) 中来为我们使用。它允许用户在应用程序中自由地处理来自单流或多流的事件(数据),并提供具有全局一致性和容错保障的*状态*。此外,用户可以在此层抽象中注册事件时间(event time)和处理时间(processing time)回调方法,从而允许程序可以实现复杂计算。 + + - Flink API 第二层抽象是 **Core APIs**。实际上,许多应用程序不需要使用到上述最底层抽象的 API,而是可以使用 **Core APIs** 进行编程:其中包含 [DataStream API]({{< ref "docs/dev/datastream/overview" >}})(应用于有界/无界数据流场景)和 [DataSet API]({{< ref "docs/dev/dataset/overview" >}})(应用于有界数据集场景)两部分。Core APIs 提供的流式 API(Fluent API)为数据处理提供了通用的模块组件,例如各种形式的用户自定义转换(transformations)、联接(joins)、聚合(aggregations)、窗口(windows)和状态(state)操作等。此层 API 中处理的数据类型在每种编程语言中都有其对应的类。 + + *Process Function* 这类底层抽象和 *DataStream API* 的相互集成使得用户可以选择使用更底层的抽象 API 来实现自己的需求。*DataSet API* 还额外提供了一些原语,比如循环/迭代(loop/iteration)操作。 + + - Flink API 第三层抽象是 **Table API**。**Table API** 是以表(Table)为中心的声明式编程(DSL)API,例如在流式数据场景下,它可以表示一张正在动态改变的表。[Table API]({{< ref "docs/dev/table/overview" >}}) 遵循(扩展)关系模型:即表拥有 schema(类似于关系型数据库中的 schema),并且 Table API 也提供了类似于关系模型中的操作,比如 select、project、join、group-by 和 aggregate 等。Table API 程序是以声明的方式定义*应执行的逻辑操作*,而不是确切地指定程序*应该执行的代码*。尽管 Table API 使用起来很简洁并且可以由各种类型的用户自定义函数扩展功能,但还是比 Core API 的表达能力差。此外,Table API 程序在执行之前还会使用优化器中的优化规则对用户编写的表达式进行优化。 + + 表和 *DataStream*/*DataSet* 可以进行无缝切换,Flink 允许用户在编写应用程序时将 *Table API* 与 *DataStream*/*DataSet* API 混合使用。 + + - Flink API 最顶层抽象是 **SQL**。这层抽象在语义和程序表达式上都类似于 *Table API*,但是其程序实现都是 SQL 查询表达式。[SQL]({{< ref "docs/dev/table/overview" >}}#sql) 抽象与 Table API 抽象之间的关联是非常紧密的,并且 SQL 查询语句可以在 *Table API* 中定义的表上执行。 diff --git a/docs/content.zh/docs/concepts/stateful-stream-processing.md b/docs/content.zh/docs/concepts/stateful-stream-processing.md new file mode 100644 index 0000000000000..c78949508e896 --- /dev/null +++ b/docs/content.zh/docs/concepts/stateful-stream-processing.md @@ -0,0 +1,365 @@ +--- +title: 有状态流处理 +weight: 2 +type: docs +--- + + +# 有状态流处理 + +## What is State? + +While many operations in a dataflow simply look at one individual *event at a +time* (for example an event parser), some operations remember information +across multiple events (for example window operators). These operations are +called **stateful**. + +Some examples of stateful operations: + + - When an application searches for certain event patterns, the state will + store the sequence of events encountered so far. + - When aggregating events per minute/hour/day, the state holds the pending + aggregates. + - When training a machine learning model over a stream of data points, the + state holds the current version of the model parameters. + - When historic data needs to be managed, the state allows efficient access + to events that occurred in the past. 
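+
+As a small sketch of what such a stateful operation can look like in code (the class name and
+state name below are illustrative, not part of any Flink API), the following keyed function
+keeps a running count per key in Flink-managed state:
+
+```java
+import org.apache.flink.api.common.state.ValueState;
+import org.apache.flink.api.common.state.ValueStateDescriptor;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
+import org.apache.flink.util.Collector;
+
+// Emits the number of events seen so far for each key; the count itself is the state.
+public class CountPerKey extends KeyedProcessFunction<String, String, Long> {
+
+    private transient ValueState<Long> count;
+
+    @Override
+    public void open(Configuration parameters) {
+        // register the state with Flink so it is checkpointed and restored automatically
+        count = getRuntimeContext().getState(new ValueStateDescriptor<>("count", Long.class));
+    }
+
+    @Override
+    public void processElement(String event, Context ctx, Collector<Long> out) throws Exception {
+        Long current = count.value();              // null for the first event of a key
+        long updated = (current == null) ? 1L : current + 1L;
+        count.update(updated);
+        out.collect(updated);
+    }
+}
+```
+
+Such a function would be applied after a `keyBy(...)`, for example
+`stream.keyBy(value -> value).process(new CountPerKey())`.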
+ +Flink needs to be aware of the state in order to make it fault tolerant using +[checkpoints]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) +and [savepoints]({{< ref "docs/ops/state/savepoints" >}}). + +Knowledge about the state also allows for rescaling Flink applications, meaning +that Flink takes care of redistributing state across parallel instances. + +[Queryable state]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}) allows you to access state from outside of Flink during runtime. + +When working with state, it might also be useful to read about [Flink's state +backends]({{< ref "docs/ops/state/state_backends" >}}). Flink +provides different state backends that specify how and where state is stored. + +{{< top >}} + +## Keyed State + +Keyed state is maintained in what can be thought of as an embedded key/value +store. The state is partitioned and distributed strictly together with the +streams that are read by the stateful operators. Hence, access to the key/value +state is only possible on *keyed streams*, i.e. after a keyed/partitioned data +exchange, and is restricted to the values associated with the current event's +key. Aligning the keys of streams and state makes sure that all state updates +are local operations, guaranteeing consistency without transaction overhead. +This alignment also allows Flink to redistribute the state and adjust the +stream partitioning transparently. + +{{< img src="/fig/state_partitioning.svg" alt="State and Partitioning" class="offset" width="50%" >}} + +Keyed State is further organized into so-called *Key Groups*. Key Groups are +the atomic unit by which Flink can redistribute Keyed State; there are exactly +as many Key Groups as the defined maximum parallelism. During execution each +parallel instance of a keyed operator works with the keys for one or more Key +Groups. + +## State Persistence + +Flink implements fault tolerance using a combination of **stream replay** and +**checkpointing**. A checkpoint marks a specific point in each of the +input streams along with the corresponding state for each of the operators. A +streaming dataflow can be resumed from a checkpoint while maintaining +consistency *(exactly-once processing semantics)* by restoring the state of the +operators and replaying the records from the point of the checkpoint. + +The checkpoint interval is a means of trading off the overhead of fault +tolerance during execution with the recovery time (the number of records that +need to be replayed). + +The fault tolerance mechanism continuously draws snapshots of the distributed +streaming data flow. For streaming applications with small state, these +snapshots are very light-weight and can be drawn frequently without much impact +on performance. The state of the streaming applications is stored at a +configurable place, usually in a distributed file system. + +In case of a program failure (due to machine-, network-, or software failure), +Flink stops the distributed streaming dataflow. The system then restarts the +operators and resets them to the latest successful checkpoint. The input +streams are reset to the point of the state snapshot. Any records that are +processed as part of the restarted parallel dataflow are guaranteed to not have +affected the previously checkpointed state. + +{{< hint warning >}} +By default, checkpointing is disabled. See [Checkpointing]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) for details on how to enable and configure checkpointing. 
+{{< /hint >}}
+
+{{< hint info >}}
+For this mechanism to realize its full guarantees, the data
+stream source (such as a message queue or broker) needs to be able to rewind the
+stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has
+this ability, and Flink's Kafka connector exploits it. See [Fault
+Tolerance Guarantees of Data Sources and Sinks]({{< ref "docs/connectors/datastream/guarantees" >}}) for more information about the guarantees
+provided by Flink's connectors.
+{{< /hint >}}
+
+{{< hint info >}}
+Because Flink's checkpoints are realized through distributed
+snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often
+we also use the term *snapshot* to mean either *checkpoint* or *savepoint*.
+{{< /hint >}}
+
+### Checkpointing
+
+The central part of Flink's fault tolerance mechanism is drawing consistent
+snapshots of the distributed data stream and operator state. These snapshots
+act as consistent checkpoints to which the system can fall back in case of a
+failure. Flink's mechanism for drawing these snapshots is described in
+"[Lightweight Asynchronous Snapshots for Distributed
+Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard
+[Chandy-Lamport algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf)
+for distributed snapshots and is specifically tailored to Flink's execution
+model.
+
+Keep in mind that everything to do with checkpointing can be done
+asynchronously. The checkpoint barriers don't travel in lock step, and
+operators can snapshot their state asynchronously.
+
+Since Flink 1.11, checkpoints can be taken with or without alignment. In this
+section, we describe aligned checkpoints first.
+
+#### Barriers
+
+The core elements of Flink's distributed snapshotting are the *stream barriers*.
+These barriers are injected into the data stream and flow with the records as
+part of the data stream. Barriers never overtake records; they flow strictly in
+line. A barrier separates the records in the data stream into the set of
+records that goes into the current snapshot and the set of records that goes into the
+next snapshot. Each barrier carries the ID of the snapshot whose records it
+pushes in front of it. Barriers do not interrupt the flow of the stream and are
+hence very lightweight. Multiple barriers from different snapshots can be in
+the stream at the same time, which means that various snapshots may happen
+concurrently.
+
    + {{< img src="/fig/stream_barriers.svg" alt="Checkpoint barriers in data streams" width="60%" >}} +
    + +Stream barriers are injected into the parallel data flow at the stream sources. +The point where the barriers for snapshot *n* are injected (let's call it +Sn) is the position in the source stream up to which the +snapshot covers the data. For example, in Apache Kafka, this position would be +the last record's offset in the partition. This position Sn +is reported to the *checkpoint coordinator* (Flink's JobManager). + +The barriers then flow downstream. When an intermediate operator has received a +barrier for snapshot *n* from all of its input streams, it emits a barrier for +snapshot *n* into all of its outgoing streams. Once a sink operator (the end of +a streaming DAG) has received the barrier *n* from all of its input streams, it +acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks +have acknowledged a snapshot, it is considered completed. + +Once snapshot *n* has been completed, the job will never again ask the source +for records from before Sn, since at that point these records +(and their descendant records) will have passed through the entire data flow +topology. + +
    + {{< img src="/fig/stream_aligning.svg" alt="Aligning data streams at operators with multiple inputs" width="60%" >}} +
    + +Operators that receive more than one input stream need to *align* the input +streams on the snapshot barriers. The figure above illustrates this: + + - As soon as the operator receives snapshot barrier *n* from an incoming + stream, it cannot process any further records from that stream until it has + received the barrier *n* from the other inputs as well. Otherwise, it would + mix records that belong to snapshot *n* and with records that belong to + snapshot *n+1*. + - Once the last stream has received barrier *n*, the operator emits all + pending outgoing records, and then emits snapshot *n* barriers itself. + - It snapshots the state and resumes processing records from all input streams, + processing records from the input buffers before processing the records + from the streams. + - Finally, the operator writes the state asynchronously to the state backend. + +Note that the alignment is needed for all operators with multiple inputs and for +operators after a shuffle when they consume output streams of multiple upstream +subtasks. + +#### Snapshotting Operator State + +When operators contain any form of *state*, this state must be part of the +snapshots as well. + +Operators snapshot their state at the point in time when they have received all +snapshot barriers from their input streams, and before emitting the barriers to +their output streams. At that point, all updates to the state from records +before the barriers have been made, and no updates that depend on records +from after the barriers have been applied. Because the state of a snapshot may +be large, it is stored in a configurable *[state backend]({{< ref "docs/ops/state/state_backends" >}})*. By default, this is the JobManager's +memory, but for production use a distributed reliable storage should be +configured (such as HDFS). After the state has been stored, the operator +acknowledges the checkpoint, emits the snapshot barrier into the output +streams, and proceeds. + +The resulting snapshot now contains: + + - For each parallel stream data source, the offset/position in the stream + when the snapshot was started + - For each operator, a pointer to the state that was stored as part of the + snapshot + +
    + {{< img src="/fig/checkpointing.svg" alt="Illustration of the Checkpointing Mechanism" width="75%" >}} +
    + +#### Recovery + +Recovery under this mechanism is straightforward: Upon a failure, Flink selects +the latest completed checkpoint *k*. The system then re-deploys the entire +distributed dataflow, and gives each operator the state that was snapshotted as +part of checkpoint *k*. The sources are set to start reading the stream from +position Sk. For example in Apache Kafka, that means telling +the consumer to start fetching from offset Sk. + +If state was snapshotted incrementally, the operators start with the state of +the latest full snapshot and then apply a series of incremental snapshot +updates to that state. + +See [Restart Strategies]({{< ref "docs/dev/execution/task_failure_recovery" >}}#restart-strategies) for more information. + +### Unaligned Checkpointing + +Checkpointing can also be performed unaligned. +The basic idea is that checkpoints can overtake all in-flight data as long as +the in-flight data becomes part of the operator state. + +Note that this approach is actually closer to the [Chandy-Lamport algorithm +](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf), but +Flink still inserts the barrier in the sources to avoid overloading the +checkpoint coordinator. + +{{< img src="/fig/stream_unaligning.svg" alt="Unaligned checkpointing" >}} + +The figure depicts how an operator handles unaligned checkpoint barriers: + +- The operator reacts on the first barrier that is stored in its input buffers. +- It immediately forwards the barrier to the downstream operator by adding it + to the end of the output buffers. +- The operator marks all overtaken records to be stored asynchronously and + creates a snapshot of its own state. + +Consequently, the operator only briefly stops the processing of input to mark +the buffers, forwards the barrier, and creates the snapshot of the other state. + +Unaligned checkpointing ensures that barriers are arriving at the sink as fast +as possible. It's especially suited for applications with at least one slow +moving data path, where alignment times can reach hours. However, since it's +adding additional I/O pressure, it doesn't help when the I/O to the state +backends is the bottleneck. See the more in-depth discussion in +[ops]({{< ref "docs/ops/state/checkpoints" >}}#unaligned-checkpoints) +for other limitations. + +Note that savepoints will always be aligned. + +#### Unaligned Recovery + +Operators first recover the in-flight data before starting processing any data +from upstream operators in unaligned checkpointing. Aside from that, it +performs the same steps as during [recovery of aligned checkpoints](#recovery). + +### State Backends + +The exact data structures in which the key/values indexes are stored depends on +the chosen [state backend]({{< ref "docs/ops/state/state_backends" >}}). One state backend stores data in an in-memory +hash map, another state backend uses [RocksDB](http://rocksdb.org) as the +key/value store. In addition to defining the data structure that holds the +state, the state backends also implement the logic to take a point-in-time +snapshot of the key/value state and store that snapshot as part of a +checkpoint. State backends can be configured without changing your application +logic. + +{{< img src="/fig/checkpoints.svg" alt="checkpoints and snapshots" class="offset" width="60%" >}} + +{{< top >}} + +### Savepoints + +All programs that use checkpointing can resume execution from a **savepoint**. 
+Savepoints allow both updating your programs and your Flink cluster without +losing any state. + +[Savepoints]({{< ref "docs/ops/state/savepoints" >}}) are +**manually triggered checkpoints**, which take a snapshot of the program and +write it out to a state backend. They rely on the regular checkpointing +mechanism for this. + +Savepoints are similar to checkpoints except that they are +**triggered by the user** and **don't automatically expire** when newer +checkpoints are completed. + +{{< top >}} + +### Exactly Once vs. At Least Once + +The alignment step may add latency to the streaming program. Usually, this +extra latency is on the order of a few milliseconds, but we have seen cases +where the latency of some outliers increased noticeably. For applications that +require consistently super low latencies (few milliseconds) for all records, +Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint +snapshots are still drawn as soon as an operator has seen the checkpoint +barrier from each input. + +When the alignment is skipped, an operator keeps processing all inputs, even +after some checkpoint barriers for checkpoint *n* arrived. That way, the +operator also processes elements that belong to checkpoint *n+1* before the +state snapshot for checkpoint *n* was taken. On a restore, these records will +occur as duplicates, because they are both included in the state snapshot of +checkpoint *n*, and will be replayed as part of the data after checkpoint *n*. + +{{< hint info >}} +Alignment happens only for operators with multiple predecessors +(joins) as well as operators with multiple senders (after a stream +repartitioning/shuffle). Because of that, dataflows with only embarrassingly +parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually +give *exactly once* guarantees even in *at least once* mode. +{{< /hint >}} + +{{< top >}} + +## State and Fault Tolerance in Batch Programs + +Flink executes [batch programs]({{< ref "docs/dev/dataset/overview" >}}) as a special case of +streaming programs, where the streams are bounded (finite number of elements). +A *DataSet* is treated internally as a stream of data. The concepts above thus +apply to batch programs in the same way as well as they apply to streaming +programs, with minor exceptions: + + - [Fault tolerance for batch programs]({{< ref "docs/dev/execution/task_failure_recovery" >}}) + does not use checkpointing. Recovery happens by fully replaying the + streams. That is possible, because inputs are bounded. This pushes the + cost more towards the recovery, but makes the regular processing cheaper, + because it avoids checkpoints. + + - Stateful operations in the DataSet API use simplified in-memory/out-of-core + data structures, rather than key/value indexes. + + - The DataSet API introduces special synchronized (superstep-based) + iterations, which are only possible on bounded streams. For details, check + out the [iteration docs]({{< ref "docs/dev/dataset/iterations" >}}). + +{{< top >}} diff --git a/docs/content.zh/docs/concepts/time.md b/docs/content.zh/docs/concepts/time.md new file mode 100644 index 0000000000000..473aa0d8c1895 --- /dev/null +++ b/docs/content.zh/docs/concepts/time.md @@ -0,0 +1,204 @@ +--- +title: 及时流处理 +weight: 3 +type: docs +--- + + +# 及时流处理 + +## Introduction + +Timely stream processing is an extension of [stateful stream processing]({{< ref "docs/concepts/stateful-stream-processing" >}}) in which time plays some role in the +computation. 
Among other things, this is the case when you do time series +analysis, when doing aggregations based on certain time periods (typically +called windows), or when you do event processing where the time when an event +occurred is important. + +In the following sections we will highlight some of the topics that you should +consider when working with timely Flink Applications. + +{{< top >}} + +## Notions of Time: Event Time and Processing Time + +When referring to time in a streaming program (for example to define windows), +one can refer to different notions of *time*: + +- **Processing time:** Processing time refers to the system time of the machine + that is executing the respective operation. + + When a streaming program runs on processing time, all time-based operations + (like time windows) will use the system clock of the machines that run the + respective operator. An hourly processing time window will include all + records that arrived at a specific operator between the times when the system + clock indicated the full hour. For example, if an application begins running + at 9:15am, the first hourly processing time window will include events + processed between 9:15am and 10:00am, the next window will include events + processed between 10:00am and 11:00am, and so on. + + Processing time is the simplest notion of time and requires no coordination + between streams and machines. It provides the best performance and the + lowest latency. However, in distributed and asynchronous environments + processing time does not provide determinism, because it is susceptible to + the speed at which records arrive in the system (for example from the message + queue), to the speed at which the records flow between operators inside the + system, and to outages (scheduled, or otherwise). + +- **Event time:** Event time is the time that each individual event occurred on + its producing device. This time is typically embedded within the records + before they enter Flink, and that *event timestamp* can be extracted from + each record. In event time, the progress of time depends on the data, not on + any wall clocks. Event time programs must specify how to generate *Event Time + Watermarks*, which is the mechanism that signals progress in event time. This + watermarking mechanism is described in a later section, + [below](#event-time-and-watermarks). + + In a perfect world, event time processing would yield completely consistent + and deterministic results, regardless of when events arrive, or their + ordering. However, unless the events are known to arrive in-order (by + timestamp), event time processing incurs some latency while waiting for + out-of-order events. As it is only possible to wait for a finite period of + time, this places a limit on how deterministic event time applications can + be. + + Assuming all of the data has arrived, event time operations will behave as + expected, and produce correct and consistent results even when working with + out-of-order or late events, or when reprocessing historic data. For example, + an hourly event time window will contain all records that carry an event + timestamp that falls into that hour, regardless of the order in which they + arrive, or when they are processed. (See the section on [late + events](#late-elements) for more information.) + + Note that sometimes when event time programs are processing live data in + real-time, they will use some *processing time* operations in order to + guarantee that they are progressing in a timely fashion. 
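+
+As a minimal sketch of how the event-time notion shows up in code (the tuple-based event type,
+the field used as timestamp, and the 5-second out-of-orderness bound are all illustrative
+assumptions), timestamps and watermarks are attached to a stream like this:
+
+```java
+import java.time.Duration;
+
+import org.apache.flink.api.common.eventtime.WatermarkStrategy;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+// (word, event timestamp in milliseconds since the epoch) – stand-ins for real events
+DataStream<Tuple2<String, Long>> events = env.fromElements(
+        Tuple2.of("a", 1_000L), Tuple2.of("b", 3_000L), Tuple2.of("a", 2_000L));
+
+DataStream<Tuple2<String, Long>> withEventTime = events.assignTimestampsAndWatermarks(
+        WatermarkStrategy
+                // tolerate events that arrive up to 5 seconds out of order
+                .<Tuple2<String, Long>>forBoundedOutOfOrderness(Duration.ofSeconds(5))
+                // tell Flink which field carries the event-time timestamp
+                .withTimestampAssigner((event, previousTimestamp) -> event.f1));
+```
+
+Downstream time-based operations, such as event-time windows, then use these timestamps and the
+generated watermarks instead of the wall clock of the processing machines.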
+ +{{< img src="/fig/event_processing_time.svg" alt="Event Time and Processing Time" width="80%" >}} + +{{< top >}} + +## Event Time and Watermarks + +*Note: Flink implements many techniques from the Dataflow Model. For a good +introduction to event time and watermarks, have a look at the articles below.* + + - [Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by Tyler Akidau + - The [Dataflow Model paper](https://research.google.com/pubs/archive/43864.pdf) + + +A stream processor that supports *event time* needs a way to measure the +progress of event time. For example, a window operator that builds hourly +windows needs to be notified when event time has passed beyond the end of an +hour, so that the operator can close the window in progress. + +*Event time* can progress independently of *processing time* (measured by wall +clocks). For example, in one program the current *event time* of an operator +may trail slightly behind the *processing time* (accounting for a delay in +receiving the events), while both proceed at the same speed. On the other +hand, another streaming program might progress through weeks of event time with +only a few seconds of processing, by fast-forwarding through some historic data +already buffered in a Kafka topic (or another message queue). + +------ + +The mechanism in Flink to measure progress in event time is **watermarks**. +Watermarks flow as part of the data stream and carry a timestamp *t*. A +*Watermark(t)* declares that event time has reached time *t* in that stream, +meaning that there should be no more elements from the stream with a timestamp +*t' <= t* (i.e. events with timestamps older or equal to the watermark). + +The figure below shows a stream of events with (logical) timestamps, and +watermarks flowing inline. In this example the events are in order (with +respect to their timestamps), meaning that the watermarks are simply periodic +markers in the stream. + +{{< img src="/fig/stream_watermark_in_order.svg" alt="A data stream with events (in order) and watermarks" width="65%" >}} + +Watermarks are crucial for *out-of-order* streams, as illustrated below, where +the events are not ordered by their timestamps. In general a watermark is a +declaration that by that point in the stream, all events up to a certain +timestamp should have arrived. Once a watermark reaches an operator, the +operator can advance its internal *event time clock* to the value of the +watermark. + +{{< img src="/fig/stream_watermark_out_of_order.svg" alt="A data stream with events (out of order) and watermarks" width="65%" >}} + +Note that event time is inherited by a freshly created stream element (or +elements) from either the event that produced them or from watermark that +triggered creation of those elements. + +### Watermarks in Parallel Streams + +Watermarks are generated at, or directly after, source functions. Each parallel +subtask of a source function usually generates its watermarks independently. +These watermarks define the event time at that particular parallel source. + +As the watermarks flow through the streaming program, they advance the event +time at the operators where they arrive. Whenever an operator advances its +event time, it generates a new watermark downstream for its successor +operators. + +Some operators consume multiple input streams; a union, for example, or +operators following a *keyBy(...)* or *partition(...)* function. 
Such an +operator's current event time is the minimum of its input streams' event times. +As its input streams update their event times, so does the operator. + +The figure below shows an example of events and watermarks flowing through +parallel streams, and operators tracking event time. + +{{< img src="/fig/parallel_streams_watermarks.svg" alt="Parallel data streams and operators with events and watermarks" class="center" width="80%" >}} + +## Lateness + +It is possible that certain elements will violate the watermark condition, +meaning that even after the *Watermark(t)* has occurred, more elements with +timestamp *t' <= t* will occur. In fact, in many real world setups, certain +elements can be arbitrarily delayed, making it impossible to specify a time by +which all elements of a certain event timestamp will have occurred. +Furthermore, even if the lateness can be bounded, delaying the watermarks by +too much is often not desirable, because it causes too much delay in the +evaluation of event time windows. + +For this reason, streaming programs may explicitly expect some *late* elements. +Late elements are elements that arrive after the system's event time clock (as +signaled by the watermarks) has already passed the time of the late element's +timestamp. See [Allowed Lateness]({{< ref "docs/dev/datastream/operators/windows" >}}#allowed-lateness) for more information on +how to work with late elements in event time windows. + +## Windowing + +Aggregating events (e.g., counts, sums) works differently on streams than in +batch processing. For example, it is impossible to count all elements in a +stream, because streams are in general infinite (unbounded). Instead, +aggregates on streams (counts, sums, etc), are scoped by **windows**, such as +*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*. + +Windows can be *time driven* (example: every 30 seconds) or *data driven* +(example: every 100 elements). One typically distinguishes different types of +windows, such as *tumbling windows* (no overlap), *sliding windows* (with +overlap), and *session windows* (punctuated by a gap of inactivity). + +{{< img src="/fig/windows.svg" alt="Time- and Count Windows" class="offset" width="80%" >}} + +Please check out this [blog post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for +additional examples of windows or take a look a [window documentation]({{< ref "docs/dev/datastream/operators/windows" >}}) of the DataStream API. + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/_index.md b/docs/content.zh/docs/connectors/_index.md new file mode 100644 index 0000000000000..ee1ddc1be0fb5 --- /dev/null +++ b/docs/content.zh/docs/connectors/_index.md @@ -0,0 +1,25 @@ +--- +title: Connectors +icon: +bold: true +bookCollapseSection: true +weight: 6 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/connectors/dataset.md b/docs/content.zh/docs/connectors/dataset.md new file mode 100644 index 0000000000000..2f6bd97d4d7fd --- /dev/null +++ b/docs/content.zh/docs/connectors/dataset.md @@ -0,0 +1,190 @@ +--- +title: "DataSet Connectors" +weight: 11 +type: docs +aliases: + - /zh/dev/batch/connectors.html +--- + + +# DataSet Connectors + +* TOC + + +## Reading from and writing to file systems + +The Apache Flink project supports multiple [file systems]({{< ref "docs/deployment/filesystems/overview" >}}) that can be used as backing stores +for input and output connectors. 
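+
+For example, a DataSet program reading from and writing back to such a file system could look
+like the following sketch (the paths are placeholders):
+
+```java
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.core.fs.FileSystem;
+
+ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+
+// read lines from any supported file system (local, HDFS, S3, ...)
+DataSet<String> lines = env.readTextFile("hdfs:///path/to/input");
+
+// write the data back out, overwriting any existing output files
+lines.writeAsText("hdfs:///path/to/output", FileSystem.WriteMode.OVERWRITE);
+
+env.execute("File system example");
+```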
+ +## Connecting to other systems using Input/OutputFormat wrappers for Hadoop + +Apache Flink allows users to access many different systems as data sources or sinks. +The system is designed for very easy extensibility. Similar to Apache Hadoop, Flink has the concept +of so called `InputFormat`s and `OutputFormat`s. + +One implementation of these `InputFormat`s is the `HadoopInputFormat`. This is a wrapper that allows +users to use all existing Hadoop input formats with Flink. + +This section shows some examples for connecting Flink to other systems. +[Read more about Hadoop compatibility in Flink]({{< ref "docs/dev/dataset/hadoop_compatibility" >}}). + +## Avro support in Flink + +Flink has extensive built-in support for [Apache Avro](http://avro.apache.org/). This allows to easily read from Avro files with Flink. +Also, the serialization framework of Flink is able to handle classes generated from Avro schemas. Be sure to include the Flink Avro dependency to the pom.xml of your project. + +```xml + + org.apache.flink + flink-avro + {{ site.version }} + +``` + +In order to read data from an Avro file, you have to specify an `AvroInputFormat`. + +**Example**: + +```java +AvroInputFormat users = new AvroInputFormat(in, User.class); +DataSet usersDS = env.createInput(users); +``` + +Note that `User` is a POJO generated by Avro. Flink also allows to perform string-based key selection of these POJOs. For example: + +```java +usersDS.groupBy("name") +``` + + +Note that using the `GenericData.Record` type is possible with Flink, but not recommended. Since the record contains the full schema, its very data intensive and thus probably slow to use. + +Flink's POJO field selection also works with POJOs generated from Avro. However, the usage is only possible if the field types are written correctly to the generated class. If a field is of type `Object` you can not use the field as a join or grouping key. +Specifying a field in Avro like this `{"name": "type_double_test", "type": "double"},` works fine, however specifying it as a UNION-type with only one field (`{"name": "type_double_test", "type": ["double"]},`) will generate a field of type `Object`. Note that specifying nullable types (`{"name": "type_double_test", "type": ["null", "double"]},`) is possible! + + + +### Access Microsoft Azure Table Storage + +_Note: This example works starting from Flink 0.6-incubating_ + +This example is using the `HadoopInputFormat` wrapper to use an existing Hadoop input format implementation for accessing [Azure's Table Storage](https://azure.microsoft.com/en-us/documentation/articles/storage-introduction/). + +1. Download and compile the `azure-tables-hadoop` project. The input format developed by the project is not yet available in Maven Central, therefore, we have to build the project ourselves. +Execute the following commands: + +```bash +git clone https://github.com/mooso/azure-tables-hadoop.git +cd azure-tables-hadoop +mvn clean install +``` + +2. Setup a new Flink project using the quickstarts: + +```bash +curl https://flink.apache.org/q/quickstart.sh | bash +``` + +3. Add the following dependencies (in the `` section) to your `pom.xml` file: + +```xml + + org.apache.flink + flink-hadoop-compatibility{{ site.scala_version_suffix }} + {{site.version}} + + + com.microsoft.hadoop + microsoft-hadoop-azure + 0.0.4 + +``` + +`flink-hadoop-compatibility` is a Flink package that provides the Hadoop input format wrappers. +`microsoft-hadoop-azure` is adding the project we've build before to our project. 
+ +The project is now prepared for starting to code. We recommend to import the project into an IDE, such as Eclipse or IntelliJ. (Import as a Maven project!). +Browse to the code of the `Job.java` file. Its an empty skeleton for a Flink job. + +Paste the following code into it: + +```java +import java.util.Map; +import org.apache.flink.api.common.functions.MapFunction; +import org.apache.flink.api.java.DataSet; +import org.apache.flink.api.java.ExecutionEnvironment; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.hadoopcompatibility.mapreduce.HadoopInputFormat; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import com.microsoft.hadoop.azure.AzureTableConfiguration; +import com.microsoft.hadoop.azure.AzureTableInputFormat; +import com.microsoft.hadoop.azure.WritableEntity; +import com.microsoft.windowsazure.storage.table.EntityProperty; + +public class AzureTableExample { + + public static void main(String[] args) throws Exception { + // set up the execution environment + final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + + // create a AzureTableInputFormat, using a Hadoop input format wrapper + HadoopInputFormat hdIf = new HadoopInputFormat(new AzureTableInputFormat(), Text.class, WritableEntity.class, new Job()); + + // set the Account URI, something like: https://apacheflink.table.core.windows.net + hdIf.getConfiguration().set(AzureTableConfiguration.Keys.ACCOUNT_URI.getKey(), "TODO"); + // set the secret storage key here + hdIf.getConfiguration().set(AzureTableConfiguration.Keys.STORAGE_KEY.getKey(), "TODO"); + // set the table name here + hdIf.getConfiguration().set(AzureTableConfiguration.Keys.TABLE_NAME.getKey(), "TODO"); + + DataSet> input = env.createInput(hdIf); + // a little example how to use the data in a mapper. + DataSet fin = input.map(new MapFunction, String>() { + @Override + public String map(Tuple2 arg0) throws Exception { + System.err.println("--------------------------------\nKey = "+arg0.f0); + WritableEntity we = arg0.f1; + + for(Map.Entry prop : we.getProperties().entrySet()) { + System.err.println("key="+prop.getKey() + " ; value (asString)="+prop.getValue().getValueAsString()); + } + + return arg0.f0.toString(); + } + }); + + // emit result (this works only locally) + fin.print(); + + // execute program + env.execute("Azure Example"); + } +} +``` + +The example shows how to access an Azure table and turn data into Flink's `DataSet` (more specifically, the type of the set is `DataSet>`). With the `DataSet`, you can apply all known transformations to the DataSet. + +## Access MongoDB + +This [GitHub repository documents how to use MongoDB with Apache Flink (starting from 0.7-incubating)](https://github.com/okkam-it/flink-mongodb-test). 
+ +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/_index.md b/docs/content.zh/docs/connectors/datastream/_index.md new file mode 100644 index 0000000000000..c4ae18206f407 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/_index.md @@ -0,0 +1,23 @@ +--- +title: DataStream Connectors +bookCollapseSection: true +weight: 1 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/connectors/datastream/cassandra.md b/docs/content.zh/docs/connectors/datastream/cassandra.md new file mode 100644 index 0000000000000..97eb6daf907bb --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/cassandra.md @@ -0,0 +1,284 @@ +--- +title: Cassandra +weight: 3 +type: docs +aliases: + - /zh/dev/connectors/cassandra.html + - /zh/apis/streaming/connectors/cassandra.html +--- + + +# Apache Cassandra Connector + +This connector provides sinks that writes data into a [Apache Cassandra](https://cassandra.apache.org/) database. + + + +To use this connector, add the following dependency to your project: + +{{< artifact flink-connector-cassandra withScalaVersion >}} + +Note that the streaming connectors are currently __NOT__ part of the binary distribution. See how to link with them for cluster execution [here]({{< ref "docs/dev/datastream/project-configuration" >}}). + +## Installing Apache Cassandra +There are multiple ways to bring up a Cassandra instance on local machine: + +1. Follow the instructions from [Cassandra Getting Started page](http://cassandra.apache.org/doc/latest/getting_started/index.html). +2. Launch a container running Cassandra from [Official Docker Repository](https://hub.docker.com/_/cassandra/) + +## Cassandra Sinks + +### Configurations + +Flink's Cassandra sink are created by using the static CassandraSink.addSink(DataStream input) method. +This method returns a CassandraSinkBuilder, which offers methods to further configure the sink, and finally `build()` the sink instance. + +The following configuration methods can be used: + +1. _setQuery(String query)_ + * Sets the upsert query that is executed for every record the sink receives. + * The query is internally treated as CQL statement. + * __DO__ set the upsert query for processing __Tuple__ data type. + * __DO NOT__ set the query for processing __POJO__ data types. +2. _setClusterBuilder()_ + * Sets the cluster builder that is used to configure the connection to cassandra with more sophisticated settings such as consistency level, retry policy and etc. +3. _setHost(String host[, int port])_ + * Simple version of setClusterBuilder() with host/port information to connect to Cassandra instances +4. _setMapperOptions(MapperOptions options)_ + * Sets the mapper options that are used to configure the DataStax ObjectMapper. + * Only applies when processing __POJO__ data types. +5. _setMaxConcurrentRequests(int maxConcurrentRequests, Duration timeout)_ + * Sets the maximum allowed number of concurrent requests with a timeout for acquiring permits to execute. + * Only applies when __enableWriteAheadLog()__ is not configured. +6. _enableWriteAheadLog([CheckpointCommitter committer])_ + * An __optional__ setting + * Allows exactly-once processing for non-deterministic algorithms. +7. _setFailureHandler([CassandraFailureHandler failureHandler])_ + * An __optional__ setting + * Sets the custom failure handler. +8. _build()_ + * Finalizes the configuration and constructs the CassandraSink instance. 
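+
+Put together, a sink configured through a cluster builder could look like the following sketch
+(contact point, keyspace, table and query are placeholders):
+
+```java
+import com.datastax.driver.core.Cluster;
+
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.connectors.cassandra.CassandraSink;
+import org.apache.flink.streaming.connectors.cassandra.ClusterBuilder;
+
+DataStream<Tuple2<String, Long>> result = ...;
+
+CassandraSink.addSink(result)
+    // CQL upsert executed for every incoming Tuple2
+    .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);")
+    // the more general alternative to setHost(), e.g. for consistency level or retry policy
+    .setClusterBuilder(new ClusterBuilder() {
+        @Override
+        protected Cluster buildCluster(Cluster.Builder builder) {
+            return builder.addContactPoint("127.0.0.1").withPort(9042).build();
+        }
+    })
+    .build();
+```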
+
+### Write-ahead Log
+
+A checkpoint committer stores additional information about completed checkpoints
+in some resource. This information is used to prevent a full replay of the last
+completed checkpoint in case of a failure.
+You can use a `CassandraCommitter` to store this information in a separate table in Cassandra.
+Note that this table will NOT be cleaned up by Flink.
+
+Flink can provide exactly-once guarantees if the query is idempotent (meaning it can be applied multiple
+times without changing the result) and checkpointing is enabled. In case of a failure, the failed
+checkpoint will be replayed completely.
+
+Furthermore, for non-deterministic programs the write-ahead log has to be enabled. For such a program,
+the replayed checkpoint may be completely different from the previous attempt, which may leave the
+database in an inconsistent state, since part of the first attempt may already have been written.
+The write-ahead log guarantees that the replayed checkpoint is identical to the first attempt.
+Note that enabling this feature will have an adverse impact on latency.
+
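+A write-ahead-log sink is enabled through the same builder; a minimal sketch, reusing the
+`result` stream and a `ClusterBuilder` (here assigned to a hypothetical `clusterBuilder`
+variable) as shown above, and leaving the optional `CheckpointCommitter` at its default:
+
+```java
+CassandraSink.addSink(result)
+    .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);")
+    .setClusterBuilder(clusterBuilder)
+    // buffer records in a write-ahead log; they are written to Cassandra once their checkpoint completes
+    .enableWriteAheadLog()
+    .build();
+```
+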

    Note: The write-ahead log functionality is currently experimental. In many cases it is sufficient to use the connector without enabling it. Please report problems to the development mailing list.

    + +### Checkpointing and Fault Tolerance +With checkpointing enabled, Cassandra Sink guarantees at-least-once delivery of action requests to C* instance. + +More details on [checkpoints docs]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) and [fault tolerance guarantee docs]({{< ref "docs/connectors/datastream/guarantees" >}}) + +## Examples + +The Cassandra sinks currently support both Tuple and POJO data types, and Flink automatically detects which type of input is used. For general use case of those streaming data type, please refer to [Supported Data Types]({{< ref "docs/dev/serialization/types_serialization" >}}#supported-data-types). We show two implementations based on [SocketWindowWordCount](https://github.com/apache/flink/blob/master/flink-examples/flink-examples-streaming/src/main/java/org/apache/flink/streaming/examples/socket/SocketWindowWordCount.java), for Pojo and Tuple data types respectively. + +In all these examples, we assumed the associated Keyspace `example` and Table `wordcount` have been created. + +{{< tabs "ffc5c4d4-7872-479c-bfa6-206b9e96f6f3" >}} +{{< tab "CQL" >}} +```sql +CREATE KEYSPACE IF NOT EXISTS example + WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}; + +CREATE TABLE IF NOT EXISTS example.wordcount ( + word text, + count bigint, + PRIMARY KEY(word) +); +``` +{{< /tab >}} +{{< /tabs >}} + +### Cassandra Sink Example for Streaming Tuple Data Type +While storing the result with Java/Scala Tuple data type to a Cassandra sink, it is required to set a CQL upsert statement (via setQuery('stmt')) to persist each record back to the database. With the upsert query cached as `PreparedStatement`, each Tuple element is converted to parameters of the statement. + +For details about `PreparedStatement` and `BoundStatement`, please visit [DataStax Java Driver manual](https://docs.datastax.com/en/developer/java-driver/2.1/manual/statements/prepared/) + +{{< tabs "1a84c6a0-0b2f-4f96-8cf8-43ec6dd3bc5d" >}} +{{< tab "Java" >}} +```java +// get the execution environment +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +// get input data by connecting to the socket +DataStream text = env.socketTextStream(hostname, port, "\n"); + +// parse the data, group it, window it, and aggregate the counts +DataStream> result = text + .flatMap(new FlatMapFunction>() { + @Override + public void flatMap(String value, Collector> out) { + // normalize and split the line + String[] words = value.toLowerCase().split("\\s"); + + // emit the pairs + for (String word : words) { + //Do not accept empty word, since word is defined as primary key in C* table + if (!word.isEmpty()) { + out.collect(new Tuple2(word, 1L)); + } + } + } + }) + .keyBy(value -> value.f0) + .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) + .sum(1); + +CassandraSink.addSink(result) + .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);") + .setHost("127.0.0.1") + .build(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment + +// get input data by connecting to the socket +val text: DataStream[String] = env.socketTextStream(hostname, port, '\n') + +// parse the data, group it, window it, and aggregate the counts +val result: DataStream[(String, Long)] = text + // split up the lines in pairs (2-tuples) containing: (word,1) + .flatMap(_.toLowerCase.split("\\s")) + .filter(_.nonEmpty) + .map((_, 1L)) + // group by 
the tuple field "0" and sum up tuple field "1" + .keyBy(_._1) + .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) + .sum(1) + +CassandraSink.addSink(result) + .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);") + .setHost("127.0.0.1") + .build() + +result.print().setParallelism(1) +``` +{{< /tab >}} +{{< /tabs >}} + + +### Cassandra Sink Example for Streaming POJO Data Type +An example of streaming a POJO data type and store the same POJO entity back to Cassandra. In addition, this POJO implementation needs to follow [DataStax Java Driver Manual](http://docs.datastax.com/en/developer/java-driver/2.1/manual/object_mapper/creating/) to annotate the class as each field of this entity is mapped to an associated column of the designated table using the DataStax Java Driver `com.datastax.driver.mapping.Mapper` class. + +The mapping of each table column can be defined through annotations placed on a field declaration in the Pojo class. For details of the mapping, please refer to CQL documentation on [Definition of Mapped Classes](http://docs.datastax.com/en/developer/java-driver/3.1/manual/object_mapper/creating/) and [CQL Data types](https://docs.datastax.com/en/cql/3.1/cql/cql_reference/cql_data_types_c.html) + +{{< tabs "d65ca6f5-acb2-4f2c-b5b6-d986eafca765" >}} +{{< tab "Java" >}} +```java +// get the execution environment +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +// get input data by connecting to the socket +DataStream text = env.socketTextStream(hostname, port, "\n"); + +// parse the data, group it, window it, and aggregate the counts +DataStream result = text + .flatMap(new FlatMapFunction() { + public void flatMap(String value, Collector out) { + // normalize and split the line + String[] words = value.toLowerCase().split("\\s"); + + // emit the pairs + for (String word : words) { + if (!word.isEmpty()) { + //Do not accept empty word, since word is defined as primary key in C* table + out.collect(new WordCount(word, 1L)); + } + } + } + }) + .keyBy(WordCount::getWord) + .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) + + .reduce(new ReduceFunction() { + @Override + public WordCount reduce(WordCount a, WordCount b) { + return new WordCount(a.getWord(), a.getCount() + b.getCount()); + } + }); + +CassandraSink.addSink(result) + .setHost("127.0.0.1") + .setMapperOptions(() -> new Mapper.Option[]{Mapper.Option.saveNullFields(true)}) + .build(); + + +@Table(keyspace = "example", name = "wordcount") +public class WordCount { + + @Column(name = "word") + private String word = ""; + + @Column(name = "count") + private long count = 0; + + public WordCount() {} + + public WordCount(String word, long count) { + this.setWord(word); + this.setCount(count); + } + + public String getWord() { + return word; + } + + public void setWord(String word) { + this.word = word; + } + + public long getCount() { + return count; + } + + public void setCount(long count) { + this.count = count; + } + + @Override + public String toString() { + return getWord() + " : " + getCount(); + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/elasticsearch.md b/docs/content.zh/docs/connectors/datastream/elasticsearch.md new file mode 100644 index 0000000000000..2559151b00ea9 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/elasticsearch.md @@ -0,0 +1,471 @@ +--- +title: Elasticsearch +weight: 5 +type: docs +aliases: + - /zh/dev/connectors/elasticsearch.html + - 
/zh/apis/streaming/connectors/elasticsearch.html + - /zh/dev/connectors/elasticsearch2.html + - /zh/apis/streaming/connectors/elasticsearch2.html +--- + + +# Elasticsearch Connector + +This connector provides sinks that can request document actions to an +[Elasticsearch](https://elastic.co/) Index. To use this connector, add one +of the following dependencies to your project, depending on the version +of the Elasticsearch installation: + + + + + + + + + + + + + + + + + + + + + + +
    Elasticsearch versionMaven Dependency
    5.x{{< artifact flink-connector-elasticsearch5 withScalaVersion >}}
    6.x{{< artifact flink-connector-elasticsearch6 withScalaVersion >}}
    7 and later versions{{< artifact flink-connector-elasticsearch7 withScalaVersion >}}
    + +Note that the streaming connectors are currently not part of the binary +distribution. See [here]({{< ref "docs/dev/datastream/project-configuration" >}}) for information +about how to package the program with the libraries for cluster execution. + +## Installing Elasticsearch + +Instructions for setting up an Elasticsearch cluster can be found +[here](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html). +Make sure to set and remember a cluster name. This must be set when +creating an `ElasticsearchSink` for requesting document actions against your cluster. + +## Elasticsearch Sink + +The `ElasticsearchSink` uses a `TransportClient` (before 6.x) or `RestHighLevelClient` (starting with 6.x) to communicate with an +Elasticsearch cluster. + +The example below shows how to configure and create a sink: + +{{< tabs "51732edd-4218-470e-adad-b1ebb4021ae4" >}} +{{< tab "java, 5.x" >}} +```java +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction; +import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer; +import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink; + +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.client.Requests; + +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +DataStream input = ...; + +Map config = new HashMap<>(); +config.put("cluster.name", "my-cluster-name"); +// This instructs the sink to emit after every element, otherwise they would be buffered +config.put("bulk.flush.max.actions", "1"); + +List transportAddresses = new ArrayList<>(); +transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), 9300)); +transportAddresses.add(new InetSocketAddress(InetAddress.getByName("10.2.3.1"), 9300)); + +input.addSink(new ElasticsearchSink<>(config, transportAddresses, new ElasticsearchSinkFunction() { + public IndexRequest createIndexRequest(String element) { + Map json = new HashMap<>(); + json.put("data", element); + + return Requests.indexRequest() + .index("my-index") + .type("my-type") + .source(json); + } + + @Override + public void process(String element, RuntimeContext ctx, RequestIndexer indexer) { + indexer.add(createIndexRequest(element)); + } +}));``` +{{< /tab >}} +{{< tab "java, Elasticsearch 6.x and above" >}} +```java +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction; +import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer; +import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink; + +import org.apache.http.HttpHost; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.client.Requests; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +DataStream input = ...; + +List httpHosts = new ArrayList<>(); +httpHosts.add(new HttpHost("127.0.0.1", 9200, "http")); +httpHosts.add(new HttpHost("10.2.3.1", 9200, "http")); + +// use a ElasticsearchSink.Builder to create an ElasticsearchSink +ElasticsearchSink.Builder esSinkBuilder = new ElasticsearchSink.Builder<>( + httpHosts, + new ElasticsearchSinkFunction() { 
+ public IndexRequest createIndexRequest(String element) { + Map json = new HashMap<>(); + json.put("data", element); + + return Requests.indexRequest() + .index("my-index") + .type("my-type") + .source(json); + } + + @Override + public void process(String element, RuntimeContext ctx, RequestIndexer indexer) { + indexer.add(createIndexRequest(element)); + } + } +); + +// configuration for the bulk requests; this instructs the sink to emit after every element, otherwise they would be buffered +esSinkBuilder.setBulkFlushMaxActions(1); + +// provide a RestClientFactory for custom configuration on the internally created REST client +esSinkBuilder.setRestClientFactory( + restClientBuilder -> { + restClientBuilder.setDefaultHeaders(...) + restClientBuilder.setMaxRetryTimeoutMillis(...) + restClientBuilder.setPathPrefix(...) + restClientBuilder.setHttpClientConfigCallback(...) + } +); + +// finally, build and add the sink to the job's pipeline +input.addSink(esSinkBuilder.build()); +``` +{{< /tab >}} +{{< tab "scala, 5.x" >}} +```scala +import org.apache.flink.api.common.functions.RuntimeContext +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction +import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer +import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink + +import org.elasticsearch.action.index.IndexRequest +import org.elasticsearch.client.Requests + +import java.net.InetAddress +import java.net.InetSocketAddress +import java.util.ArrayList +import java.util.HashMap +import java.util.List +import java.util.Map + +val input: DataStream[String] = ... + +val config = new java.util.HashMap[String, String] +config.put("cluster.name", "my-cluster-name") +// This instructs the sink to emit after every element, otherwise they would be buffered +config.put("bulk.flush.max.actions", "1") + +val transportAddresses = new java.util.ArrayList[InetSocketAddress] +transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), 9300)) +transportAddresses.add(new InetSocketAddress(InetAddress.getByName("10.2.3.1"), 9300)) + +input.addSink(new ElasticsearchSink(config, transportAddresses, new ElasticsearchSinkFunction[String] { + def createIndexRequest(element: String): IndexRequest = { + val json = new java.util.HashMap[String, String] + json.put("data", element) + + return Requests.indexRequest() + .index("my-index") + .type("my-type") + .source(json) + } +})) +``` +{{< /tab >}} +{{< tab "scala, Elasticsearch 6.x and above" >}} +```scala +import org.apache.flink.api.common.functions.RuntimeContext +import org.apache.flink.streaming.api.datastream.DataStream +import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction +import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer +import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink + +import org.apache.http.HttpHost +import org.elasticsearch.action.index.IndexRequest +import org.elasticsearch.client.Requests + +import java.util.ArrayList +import java.util.List + +val input: DataStream[String] = ... 
+ +val httpHosts = new java.util.ArrayList[HttpHost] +httpHosts.add(new HttpHost("127.0.0.1", 9200, "http")) +httpHosts.add(new HttpHost("10.2.3.1", 9200, "http")) + +val esSinkBuilder = new ElasticsearchSink.Builder[String]( + httpHosts, + new ElasticsearchSinkFunction[String] { + def process(element: String, ctx: RuntimeContext, indexer: RequestIndexer) { + val json = new java.util.HashMap[String, String] + json.put("data", element) + + val rqst: IndexRequest = Requests.indexRequest + .index("my-index") + .`type`("my-type") + .source(json) + + indexer.add(rqst) + } + } +) + +// configuration for the bulk requests; this instructs the sink to emit after every element, otherwise they would be buffered +esSinkBuilder.setBulkFlushMaxActions(1) + +// provide a RestClientFactory for custom configuration on the internally created REST client +esSinkBuilder.setRestClientFactory(new RestClientFactory { + override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = { + restClientBuilder.setDefaultHeaders(...) + restClientBuilder.setMaxRetryTimeoutMillis(...) + restClientBuilder.setPathPrefix(...) + restClientBuilder.setHttpClientConfigCallback(...) + } +}) + +// finally, build and add the sink to the job's pipeline +input.addSink(esSinkBuilder.build) +``` +{{< /tab >}} +{{< /tabs >}} + +For Elasticsearch versions that still uses the now deprecated `TransportClient` to communicate +with the Elasticsearch cluster (i.e., versions equal or below 5.x), note how a `Map` of `String`s +is used to configure the `ElasticsearchSink`. This config map will be directly +forwarded when creating the internally used `TransportClient`. +The configuration keys are documented in the Elasticsearch documentation +[here](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html). +Especially important is the `cluster.name` parameter that must correspond to +the name of your cluster. + +For Elasticsearch 6.x and above, internally, the `RestHighLevelClient` is used for cluster communication. +By default, the connector uses the default configurations for the REST client. To have custom +configuration for the REST client, users can provide a `RestClientFactory` implementation when +setting up the `ElasticsearchClient.Builder` that builds the sink. + +Also note that the example only demonstrates performing a single index +request for each incoming element. Generally, the `ElasticsearchSinkFunction` +can be used to perform multiple requests of different types (ex., +`DeleteRequest`, `UpdateRequest`, etc.). + +Internally, each parallel instance of the Flink Elasticsearch Sink uses +a `BulkProcessor` to send action requests to the cluster. +This will buffer elements before sending them in bulk to the cluster. The `BulkProcessor` +executes bulk requests one at a time, i.e. there will be no two concurrent +flushes of the buffered actions in progress. + +### Elasticsearch Sinks and Fault Tolerance + +With Flink’s checkpointing enabled, the Flink Elasticsearch Sink guarantees +at-least-once delivery of action requests to Elasticsearch clusters. It does +so by waiting for all pending action requests in the `BulkProcessor` at the +time of checkpoints. This effectively assures that all requests before the +checkpoint was triggered have been successfully acknowledged by Elasticsearch, before +proceeding to process more records sent to the sink. + +More details on checkpoints and fault tolerance are in the [fault tolerance docs]({{< ref "docs/learn-flink/fault_tolerance" >}}). 
+ +To use fault tolerant Elasticsearch Sinks, checkpointing of the topology needs to be enabled at the execution environment: + +{{< tabs "d00d1e93-4844-40d7-b0ec-9ec37e73145e" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.enableCheckpointing(5000); // checkpoint every 5000 msecs +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +env.enableCheckpointing(5000) // checkpoint every 5000 msecs +``` +{{< /tab >}} +{{< /tabs >}} + +

+NOTE: Users can disable flushing by calling
+disableFlushOnCheckpoint() on the created ElasticsearchSink. Be aware
+that this essentially means the sink no longer provides any strong
+delivery guarantees, even with checkpointing enabled for the topology.
+
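+A minimal sketch of what that call could look like, reusing the `esSinkBuilder` and `input` from the example above:
+
+```java
+ElasticsearchSink<String> esSink = esSinkBuilder.build();
+// trades the at-least-once guarantee for not blocking checkpoints on pending requests
+esSink.disableFlushOnCheckpoint();
+input.addSink(esSink);
+```
+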

    + +### Handling Failing Elasticsearch Requests + +Elasticsearch action requests may fail due to a variety of reasons, including +temporarily saturated node queue capacity or malformed documents to be indexed. +The Flink Elasticsearch Sink allows the user to specify how request +failures are handled, by simply implementing an `ActionRequestFailureHandler` and +providing it to the constructor. + +Below is an example: + +{{< tabs "ddb958b3-5dd5-476e-b946-ace3335628b2" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +input.addSink(new ElasticsearchSink<>( + config, transportAddresses, + new ElasticsearchSinkFunction() {...}, + new ActionRequestFailureHandler() { + @Override + void onFailure(ActionRequest action, + Throwable failure, + int restStatusCode, + RequestIndexer indexer) throw Throwable { + + if (ExceptionUtils.findThrowable(failure, EsRejectedExecutionException.class).isPresent()) { + // full queue; re-add document for indexing + indexer.add(action); + } else if (ExceptionUtils.findThrowable(failure, ElasticsearchParseException.class).isPresent()) { + // malformed document; simply drop request without failing sink + } else { + // for all other failures, fail the sink + // here the failure is simply rethrown, but users can also choose to throw custom exceptions + throw failure; + } + } +})); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[String] = ... + +input.addSink(new ElasticsearchSink( + config, transportAddresses, + new ElasticsearchSinkFunction[String] {...}, + new ActionRequestFailureHandler { + @throws(classOf[Throwable]) + override def onFailure(ActionRequest action, + Throwable failure, + int restStatusCode, + RequestIndexer indexer) { + + if (ExceptionUtils.findThrowable(failure, EsRejectedExecutionException.class).isPresent()) { + // full queue; re-add document for indexing + indexer.add(action) + } else if (ExceptionUtils.findThrowable(failure, ElasticsearchParseException.class).isPresent()) { + // malformed document; simply drop request without failing sink + } else { + // for all other failures, fail the sink + // here the failure is simply rethrown, but users can also choose to throw custom exceptions + throw failure + } + } +})) +``` +{{< /tab >}} +{{< /tabs >}} + +The above example will let the sink re-add requests that failed due to +queue capacity saturation and drop requests with malformed documents, without +failing the sink. For all other failures, the sink will fail. If a `ActionRequestFailureHandler` +is not provided to the constructor, the sink will fail for any kind of error. + +Note that `onFailure` is called for failures that still occur only after the +`BulkProcessor` internally finishes all backoff retry attempts. +By default, the `BulkProcessor` retries to a maximum of 8 attempts with +an exponential backoff. For more information on the behaviour of the +internal `BulkProcessor` and how to configure it, please see the following section. + +By default, if a failure handler is not provided, the sink uses a +`NoOpFailureHandler` that simply fails for all kinds of exceptions. The +connector also provides a `RetryRejectedExecutionFailureHandler` implementation +that always re-add requests that have failed due to queue capacity saturation. + +
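+For example, a sketch of wiring in the provided `RetryRejectedExecutionFailureHandler`, reusing `config`, `transportAddresses`, and the sink function from the examples above, could look like this:
+
+```java
+input.addSink(new ElasticsearchSink<>(
+    config,
+    transportAddresses,
+    new ElasticsearchSinkFunction<String>() {...},
+    // re-adds requests rejected due to saturated node queues instead of failing the sink
+    new RetryRejectedExecutionFailureHandler()));
+```
+
+With the 6.x and above builder, the handler can similarly be passed via `setFailureHandler(...)` before building the sink.
+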

+IMPORTANT: Re-adding failed requests to the internal BulkProcessor
+leads to longer checkpoints, as the sink also has to wait for the
+re-added requests to be flushed when checkpointing.
+For example, when using RetryRejectedExecutionFailureHandler, checkpoints
+will need to wait until Elasticsearch node queues have enough capacity for
+all the pending requests. This also means that if re-added requests never
+succeed, the checkpoint will never finish.
+

    + +### Configuring the Internal Bulk Processor + +The internal `BulkProcessor` can be further configured for its behaviour +on how buffered action requests are flushed, by setting the following values in +the provided `Map`: + + * **bulk.flush.max.actions**: Maximum amount of actions to buffer before flushing. + * **bulk.flush.max.size.mb**: Maximum size of data (in megabytes) to buffer before flushing. + * **bulk.flush.interval.ms**: Interval at which to flush regardless of the amount or size of buffered actions. + +For versions 2.x and above, configuring how temporary request errors are +retried is also supported: + + * **bulk.flush.backoff.enable**: Whether or not to perform retries with backoff delay for a flush + if one or more of its actions failed due to a temporary `EsRejectedExecutionException`. + * **bulk.flush.backoff.type**: The type of backoff delay, either `CONSTANT` or `EXPONENTIAL` + * **bulk.flush.backoff.delay**: The amount of delay for backoff. For constant backoff, this + is simply the delay between each retry. For exponential backoff, this is the initial base delay. + * **bulk.flush.backoff.retries**: The amount of backoff retries to attempt. + +More information about Elasticsearch can be found [here](https://elastic.co). + +## Packaging the Elasticsearch Connector into an Uber-Jar + +For the execution of your Flink program, it is recommended to build a +so-called uber-jar (executable jar) containing all your dependencies +(see [here]({{< ref "docs/dev/datastream/project-configuration" >}}) for further information). + +Alternatively, you can put the connector's jar file into Flink's `lib/` folder to make it available +system-wide, i.e. for all job being run. + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/file_sink.md b/docs/content.zh/docs/connectors/datastream/file_sink.md new file mode 100644 index 0000000000000..329c7cf1e764c --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/file_sink.md @@ -0,0 +1,742 @@ +--- +title: File Sink +weight: 6 +type: docs +aliases: + - /zh/dev/connectors/file_sink.html + - /zh/apis/streaming/connectors/filesystem_sink.html +--- + + +# File Sink + +这个连接器提供了一个在流和批模式下统一的 Sink 来将分区文件写入到支持 [Flink `FileSystem`]({{< ref "docs/deployment/filesystems/overview" >}}) 接口的文件系统中,它对于流和批模式可以提供相同的一致性语义保证。File Sink 是现有的 [Streaming File Sink]({{< ref "docs/connectors/datastream/streamfile_sink" >}}) 的一个升级版本,后者仅在流模式下提供了精确一致性。 + +File Sink 会将数据写入到桶中。由于输入流可能是无界的,因此每个桶中的数据被划分为多个有限大小的文件。如何分桶是可以配置的,默认使用基于时间的分桶策略,这种策略每个小时创建一个新的桶,桶中包含的文件将记录所有该小时内从流中接收到的数据。 + +桶目录中的实际输出数据会被划分为多个部分文件(part file),每一个接收桶数据的 Sink Subtask ,至少包含一个部分文件(part file)。额外的部分文件(part file)将根据滚动策略创建,滚动策略是可以配置的。对于行编码格式(参考 [File Formats](#file-formats) )默认的策略是根据文件大小和超时时间来滚动文件。超时时间指打开文件的最长持续时间,以及文件关闭前的最长非活动时间。批量编码格式必须在每次 Checkpoint 时滚动文件,但是用户也可以指定额外的基于文件大小和超时时间的策略。 + +{{< hint info >}} +重要: 在流模式下使用 FileSink 时需要启用 Checkpoint ,每次做 Checkpoint 时写入完成。如果 Checkpoint 被禁用,部分文件(part file)将永远处于 'in-progress' 或 'pending' 状态,下游系统无法安全地读取。 +{{< /hint >}} + + +{{< img src="/fig/streamfilesink_bucketing.png" >}} + +## 文件格式 + + `FileSink` 支持行编码格式和批量编码格式,比如 [Apache Parquet](http://parquet.apache.org) 。 +这两种变体随附了各自的构建器,可以使用以下静态方法创建: + + - Row-encoded sink: `FileSink.forRowFormat(basePath, rowEncoder)` + - Bulk-encoded sink: `FileSink.forBulkFormat(basePath, bulkWriterFactory)` + +创建行或批量编码的 Sink 时,我们需要指定存储桶的基本路径和数据的编码逻辑。 + +更多配置操作以及不同数据格式的实现请参考 `FileSink` 。 + +### 行编码格式 + +行编码格式需要指定一个 `Encoder` 。Encoder 负责为每个处于 In-progress 状态文件的`OutputStream` 序列化数据。 + +`除了桶分配器之外,RowFormatBuilder` 
还允许用户指定: + + - Custom `RollingPolicy`:自定义滚动策略以覆盖默认的 DefaultRollingPolicy。 + - bucketCheckInterval (默认为1分钟):毫秒间隔,用于基于时间的滚动策略。 + +字符串元素写入示例: + + +{{< tabs "946da1d5-b046-404e-ab80-a5a5d251d8ee" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.serialization.SimpleStringEncoder; +import org.apache.flink.core.fs.Path; +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy; + +DataStream input = ...; + +final FileSink sink = FileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder("UTF-8")) + .withRollingPolicy( + DefaultRollingPolicy.builder() + .withRolloverInterval(TimeUnit.MINUTES.toMillis(15)) + .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) + .withMaxPartSize(1024 * 1024 * 1024) + .build()) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.serialization.SimpleStringEncoder +import org.apache.flink.core.fs.Path +import org.apache.flink.connector.file.sink.FileSink +import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy + +val input: DataStream[String] = ... + +val sink: FileSink[String] = FileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder[String]("UTF-8")) + .withRollingPolicy( + DefaultRollingPolicy.builder() + .withRolloverInterval(TimeUnit.MINUTES.toMillis(15)) + .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) + .withMaxPartSize(1024 * 1024 * 1024) + .build()) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +这个例子创建了一个简单的 Sink ,将记录分配给默认的一小时时间桶。它还指定了一个滚动策略,该策略在以下三种情况下滚动处于 In-progress 状态的部分文件(part file): + + - 它至少包含 15 分钟的数据 + - 最近 5 分钟没有收到新的记录 + - 文件大小达到 1GB (写入最后一条记录后) + +### 批量编码格式 + +批量编码 Sink 的创建与行编码 Sink 相似,不过在这里我们不是指定编码器 `Encoder` 而是指定 `BulkWriter.Factory` 。 +`BulkWriter` 定义了如何添加、刷新元素,以及如何批量编码。 + +Flink 有四个内置的 BulkWriter Factory : + + - `ParquetWriterFactory` + - `AvroWriterFactory` + - `SequenceFileWriterFactory` + - `CompressWriterFactory` + - `OrcBulkWriterFactory` + +{{< hint info >}} +重要: 批量编码模式仅支持 OnCheckpointRollingPolicy 策略, 在每次 checkpoint 的时候滚动文件。 +重要: 批量编码模式必须使用继承自 CheckpointRollingPolicy 的滚动策略, 这些策略必须在每次 checkpoint 的时候滚动文件,但是用户也可以进一步指定额外的基于文件大小和超时时间的策略。 +{{< /hint >}} + +#### Parquet 格式 + +Flink 包含为不同 Avro 类型,创建 ParquetWriterFactory 的便捷方法,更多信息请参考 `ParquetAvroWriters` 。 + +要编写其他 Parquet 兼容的数据格式,用户需要创建 ParquetWriterFactory 并实现 `ParquetBuilder` 接口。 + +在应用中使用 Parquet 批量编码器,你需要添加以下依赖: + +{{< artifact flink-parquet withScalaVersion >}} + +这个例子使用 FileSink 将 Avro 数据写入 Parquet 格式: + +{{< tabs "825da2a2-4bdf-4f2d-9138-2e99a72bb9d4" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.parquet.avro.ParquetAvroWriters; +import org.apache.avro.Schema; + + +Schema schema = ...; +DataStream input = ...; + +final FileSink sink = FileSink + .forBulkFormat(outputBasePath, ParquetAvroWriters.forGenericRecord(schema)) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.parquet.avro.ParquetAvroWriters +import org.apache.avro.Schema + +val schema: Schema = ... +val input: DataStream[GenericRecord] = ... 
+ +val sink: FileSink[GenericRecord] = FileSink + .forBulkFormat(outputBasePath, ParquetAvroWriters.forGenericRecord(schema)) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +类似的,将 Protobuf 数据写入到 Parquet 格式可以通过: + +{{< tabs "7f22c88d-e7dd-4299-aa23-02afc61a6319" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.parquet.protobuf.ParquetProtoWriters; + +// ProtoRecord is a generated protobuf Message class. +DataStream input = ...; + +final FileSink sink = FileSink + .forBulkFormat(outputBasePath, ParquetProtoWriters.forType(ProtoRecord.class)) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.parquet.protobuf.ParquetProtoWriters + +// ProtoRecord is a generated protobuf Message class. +val input: DataStream[ProtoRecord] = ... + +val sink: FileSink[ProtoRecord] = FileSink + .forBulkFormat(outputBasePath, ParquetProtoWriters.forType(classOf[ProtoRecord])) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Avro格式 + +Flink 也提供了将数据写入 Avro 文件的内置支持。对于创建 AvroWriterFactory 的快捷方法,更多信息可以参考 +`AvroWriters`. + +使用Avro相关的Writer需要在项目中添加以下依赖: + +{{< artifact flink-avro >}} + +将数据写入 Avro 文件的 FileSink 算子可以通过如下方式创建: + +{{< tabs "237658d5-98c7-43c7-9844-268df7ba7afc" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.avro.AvroWriters; +import org.apache.avro.Schema; + + +Schema schema = ...; +DataStream input = ...; + +final FileSink sink = FileSink + .forBulkFormat(outputBasePath, AvroWriters.forGenericRecord(schema)) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.formats.avro.AvroWriters +import org.apache.avro.Schema + +val schema: Schema = ... +val input: DataStream[GenericRecord] = ... + +val sink: FileSink[GenericRecord] = FileSink + .forBulkFormat(outputBasePath, AvroWriters.forGenericRecord(schema)) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +如果想要创建自定义的 Avro Writer,例如启用压缩等,用户可以实现 `AvroBuilder` +接口并自行创建一个 `AvroWriterFactory` 实例: + +{{< tabs "732055fb-ae45-49dd-81a1-8aadbb9e3d65" >}} +{{< tab "Java" >}} +```java +AvroWriterFactory factory = new AvroWriterFactory<>((AvroBuilder
<Address>) out -> {
+    Schema schema = ReflectData.get().getSchema(Address.class);
+    DatumWriter<Address> datumWriter = new ReflectDatumWriter<>(schema);
+
+    DataFileWriter<Address> dataFileWriter = new DataFileWriter<>(datumWriter);
+    dataFileWriter.setCodec(CodecFactory.snappyCodec());
+    dataFileWriter.create(schema, out);
+    return dataFileWriter;
+});
+
+DataStream<Address>
    stream = ... +stream.sinkTo(FileSink.forBulkFormat( + outputBasePath, + factory).build()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val factory = new AvroWriterFactory[Address](new AvroBuilder[Address]() { + override def createWriter(out: OutputStream): DataFileWriter[Address] = { + val schema = ReflectData.get.getSchema(classOf[Address]) + val datumWriter = new ReflectDatumWriter[Address](schema) + + val dataFileWriter = new DataFileWriter[Address](datumWriter) + dataFileWriter.setCodec(CodecFactory.snappyCodec) + dataFileWriter.create(schema, out) + dataFileWriter + } +}) + +val stream: DataStream[Address] = ... +stream.sinkTo(FileSink.forBulkFormat( + outputBasePath, + factory).build()); +``` +{{< /tab >}} +{{< /tabs >}} + +#### ORC Format + +为了使用基于批量编码的 ORC 格式,Flink提供了 `OrcBulkWriterFactory` ,它需要用户提供一个 `Vectorizer` 的具体实现。 + +和其它基于列式存储的批量编码格式类似,Flink中的 `OrcBulkWriter` 将数据按批写出。它通过 ORC 的 VectorizedRowBatch 来实现这一点。 + +由于输入数据必须先缓存为一个完整的 `VectorizedRowBatch` ,用户需要继承 `Vectorizer` 抽像类并且实现其中的 `vectorize(T element, VectorizedRowBatch batch)` 方法。方法参数中传入的 `VectorizedRowBatch` 使用户只需将输入 `element` 转化为 `ColumnVectors` 并将它存储到所提供的 `VectorizedRowBatch` 实例中。 + +例如,如果输入元素的类型是 `Person` 并且它的定义如下: + +{{< tabs "8d9329a5-d67d-4c17-b940-03616c0bd5d6" >}} +{{< tab "Java" >}} +```java + +class Person { + private final String name; + private final int age; + ... +} + +``` +{{< /tab >}} +{{< /tabs >}} + +那么用户可以采用如下方式在子类中将 `Person` 对象转化为 `VectorizedRowBatch` : + +{{< tabs "2462164c-3dfc-414c-8bab-e2e8256266d9" >}} +{{< tab "Java" >}} +```java +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; + +public class PersonVectorizer extends Vectorizer implements Serializable { + public PersonVectorizer(String schema) { + super(schema); + } + @Override + public void vectorize(Person element, VectorizedRowBatch batch) throws IOException { + BytesColumnVector nameColVector = (BytesColumnVector) batch.cols[0]; + LongColumnVector ageColVector = (LongColumnVector) batch.cols[1]; + int row = batch.size++; + nameColVector.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8)); + ageColVector.vector[row] = element.getAge(); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.nio.charset.StandardCharsets +import org.apache.hadoop.hive.ql.exec.vector.{BytesColumnVector, LongColumnVector} + +class PersonVectorizer(schema: String) extends Vectorizer[Person](schema) { + + override def vectorize(element: Person, batch: VectorizedRowBatch): Unit = { + val nameColVector = batch.cols(0).asInstanceOf[BytesColumnVector] + val ageColVector = batch.cols(1).asInstanceOf[LongColumnVector] + nameColVector.setVal(batch.size + 1, element.getName.getBytes(StandardCharsets.UTF_8)) + ageColVector.vector(batch.size + 1) = element.getAge + } + +} + +``` +{{< /tab >}} +{{< /tabs >}} + +为了在应用中使用 ORC 批量编码,用户需要添加如下依赖: + +{{< artifact flink-orc withScalaVersion >}} + +然后使用 ORC 格式的 `FileSink` 可以通过如下方式创建: + +{{< tabs "4bc2aa30-6ea9-461f-aa24-8c36856edfcb" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.orc.writer.OrcBulkWriterFactory; + +String schema = "struct<_col0:string,_col1:int>"; +DataStream input = ...; + +final OrcBulkWriterFactory writerFactory = new OrcBulkWriterFactory<>(new PersonVectorizer(schema)); + +final FileSink sink = FileSink + .forBulkFormat(outputBasePath, 
writerFactory) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.orc.writer.OrcBulkWriterFactory + +val schema: String = "struct<_col0:string,_col1:int>" +val input: DataStream[Person] = ... +val writerFactory = new OrcBulkWriterFactory(new PersonVectorizer(schema)); + +val sink: FileSink[Person] = FileSink + .forBulkFormat(outputBasePath, writerFactory) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +用户还可以通过 Hadoop `Configuration` 和 `Properties` 来设置 OrcBulkWriterFactory 中涉及的 Hadoop 属性和 ORC Writer 属性: + +{{< tabs "79765e6f-43bf-47ac-801c-2f7da9ac4f87" >}} +{{< tab "Java" >}} +```java +String schema = ...; +Configuration conf = ...; +Properties writerProperties = new Properties(); + +writerProps.setProperty("orc.compress", "LZ4"); +// 其它 ORC 支持的属性也可以类似设置。 + +final OrcBulkWriterFactory writerFactory = new OrcBulkWriterFactory<>( + new PersonVectorizer(schema), writerProperties, conf); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val schema: String = ... +val conf: Configuration = ... +val writerProperties: Properties = new Properties() + +writerProps.setProperty("orc.compress", "LZ4") +// 其它 ORC 支持的属性也可以类似设置。 + +val writerFactory = new OrcBulkWriterFactory( + new PersonVectorizer(schema), writerProperties, conf) +``` +{{< /tab >}} +{{< /tabs >}} + +完整的 ORC Writer 的属性可以参考 [相关文档](https://orc.apache.org/docs/hive-config.html). + +给 ORC 文件添加自定义元数据可以通过在实现的 `vectorize(...)` 方法中调用 `addUserMetadata(...)` 实现: + +{{< tabs "df5c8b4f-9db0-41b0-89e7-a74a3b473b35" >}} +{{< tab "Java" >}} +```java + +public class PersonVectorizer extends Vectorizer implements Serializable { + @Override + public void vectorize(Person element, VectorizedRowBatch batch) throws IOException { + ... + String metadataKey = ...; + ByteBuffer metadataValue = ...; + this.addUserMetadata(metadataKey, metadataValue); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +class PersonVectorizer(schema: String) extends Vectorizer[Person](schema) { + + override def vectorize(element: Person, batch: VectorizedRowBatch): Unit = { + ... + val metadataKey: String = ... + val metadataValue: ByteBuffer = ... + addUserMetadata(metadataKey, metadataValue) + } + +} + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Hadoop SequenceFile 格式 + +在应用中使用 `SequenceFile` 批量编码器,你需要添加以下依赖: + +{{< artifact flink-sequence-file >}} + +简单的 `SequenceFile` 写入示例: + +{{< tabs "addcc4bc-bd9c-473a-9d5a-d9d9b3efd7d2" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; + + +DataStream> input = ...; +Configuration hadoopConf = HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); +final FileSink> sink = FileSink + .forBulkFormat( + outputBasePath, + new SequenceFileWriterFactory<>(hadoopConf, LongWritable.class, Text.class)) + .build(); + +input.sinkTo(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.connector.file.sink.FileSink; +import org.apache.flink.configuration.GlobalConfiguration +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.SequenceFile +import org.apache.hadoop.io.Text; + +val input: DataStream[(LongWritable, Text)] = ... 
+val hadoopConf: Configuration = HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()) +val sink: FileSink[(LongWritable, Text)] = FileSink + .forBulkFormat( + outputBasePath, + new SequenceFileWriterFactory(hadoopConf, LongWritable.class, Text.class)) + .build() + +input.sinkTo(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +`SequenceFileWriterFactory` 支持附加构造函数参数指定压缩设置。 + +## 桶分配 + +桶分配逻辑定义了如何将数据结构化为基本输出目录中的子目录 + +行格式和批量格式都使用 `DateTimeBucketAssigner` 作为默认的分配器。 +默认情况下,DateTimeBucketAssigner 基于系统默认时区每小时创建一个桶,格式如下: `yyyy-MM-dd--HH` 。日期格式(即桶的大小)和时区都可以手动配置。 + +我们可以在格式构建器上调用 `.withBucketAssigner(assigner)` 来自定义 `BucketAssigner` 。 + +Flink 有两个内置的 BucketAssigners : + + - `DateTimeBucketAssigner:默认基于时间的分配器` + - `BasePathBucketAssigner` :将所有部分文件(part file)存储在基本路径中的分配器(单个全局桶) + +## 滚动策略 + +在流模式下,滚动策略 `RollingPolicy` 定义了指定的文件在何时关闭(closed)并将其变为 Pending 状态,随后变为 Finished 状态。处于 Pending 状态的文件会在下一次 Checkpoint 时变为 Finished 状态,通过设置 Checkpoint 间隔时间,可以控制部分文件(part file)对下游读取者可用的速度、大小和数量。在批模式下,临时文件只会在作业处理完所有输入数据后才会变成 Finished 状态,此时滚动策略可以用来控制每个文件的大小。 + +Flink 有两个内置的滚动策略: + + - `DefaultRollingPolicy` + - `OnCheckpointRollingPolicy` + +## 部分文件(part file) 生命周期 + +为了在下游系统中使用 FileSink 的输出,我们需要了解输出文件的命名规则和生命周期。 + +部分文件(part file)可以处于以下三种状态之一: + 1. **In-progress** :当前文件正在写入中。 + 2. **Pending** :当处于 In-progress 状态的文件关闭(closed)了,就变为 Pending 状态。 + 3. **Finished** :在成功的 Checkpoint 后(流模式)或作业处理完所有输入数据后(批模式),Pending 状态将变为 Finished 状态。 + +处于 Finished 状态的文件不会再被修改,可以被下游系统安全地读取。 + +{{< hint info >}} +重要: 部分文件的索引在每个 subtask 内部是严格递增的(按文件创建顺序)。但是索引并不总是连续的。当 Job 重启后,所有部分文件的索引从 `max part index + 1` 开始, +这里的 `max part index` 是所有 subtask 中索引的最大值。 +{{< /hint >}} + +对于每个活动的桶,Writer 在任何时候都只有一个处于 In-progress 状态的部分文件(part file),但是可能有几个 Penging 和 Finished 状态的部分文件(part file)。 + +**部分文件(part file)例子** + +为了更好地理解这些文件的生命周期,让我们来看一个包含 2 个 Sink Subtask 的简单例子: + +``` +└── 2019-08-25--12 + ├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + └── part-81fc4980-a6af-41c8-9937-9939408a734b-0.inprogress.ea65a428-a1d0-4a0b-bbc5-7a436a75e575 +``` + +当部分文件 `part-81fc4980-a6af-41c8-9937-9939408a734b-0` 被滚动(假设它变得太大了)时,它将成为 Pending 状态,但是它还没有被重命名。然后 Sink 会创建一个新的部分文件: `part-81fc4980-a6af-41c8-9937-9939408a734b-1`: + +``` +└── 2019-08-25--12 + ├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-81fc4980-a6af-41c8-9937-9939408a734b-0.inprogress.ea65a428-a1d0-4a0b-bbc5-7a436a75e575 + └── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + + `part-81fc4980-a6af-41c8-9937-9939408a734b-0` 现在处于 Pending 状态等待完成,在下一次成功的 Checkpoint 后,它会变成 Finished 状态: + +``` +└── 2019-08-25--12 + ├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-81fc4980-a6af-41c8-9937-9939408a734b-0 + └── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + +根据分桶策略创建新的桶,但是这并不会影响当前处于 In-progress 状态的文件: + +``` +└── 2019-08-25--12 + ├── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-81fc4980-a6af-41c8-9937-9939408a734b-0 + └── part-81fc4980-a6af-41c8-9937-9939408a734b-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +└── 2019-08-25--13 + └── part-4005733d-a830-4323-8291-8866de98b582-0.inprogress.2b475fec-1482-4dea-9946-eb4353b475f1 +``` + +因为分桶策略基于每条记录进行评估,所以旧桶仍然可以接受新的记录。 + +### 部分文件的配置项 + +已经完成的文件和进行中的文件仅能通过文件名格式进行区分。 + +默认情况下,文件命名格式如下所示: + - **In-progress / Pending:** 
`part--.inprogress.uid` + - **FINISHED:** `part--` + +其中 uid 是在 Sink 的各个 task 在启动时随机生成的 id,这些 id 是不支持容错的,在 task 重启后 id 会重新生成。 + +Flink 允许用户通过 `OutputFileConfig` 指定部分文件名的前缀和后缀。 +举例来说,前缀设置为 "prefix" 以及后缀设置为 ".ext" 之后,Sink 创建的文件名如下所示: + +``` +└── 2019-08-25--12 + ├── prefix-4005733d-a830-4323-8291-8866de98b582-0.ext + ├── prefix-4005733d-a830-4323-8291-8866de98b582-1.ext.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── prefix-81fc4980-a6af-41c8-9937-9939408a734b-0.ext + └── prefix-81fc4980-a6af-41c8-9937-9939408a734b-1.ext.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + +用户可以通过如下方式设置 `OutputFileConfig`: + +{{< tabs "074e85ae-45fa-4280-a017-1c836d7b583e" >}} +{{< tab "Java" >}} +```java + +OutputFileConfig config = OutputFileConfig + .builder() + .withPartPrefix("prefix") + .withPartSuffix(".ext") + .build(); + +FileSink> sink = FileSink + .forRowFormat((new Path(outputPath), new SimpleStringEncoder<>("UTF-8")) + .withBucketAssigner(new KeyBucketAssigner()) + .withRollingPolicy(OnCheckpointRollingPolicy.build()) + .withOutputFileConfig(config) + .build(); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val config = OutputFileConfig + .builder() + .withPartPrefix("prefix") + .withPartSuffix(".ext") + .build() + +val sink = FileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder[String]("UTF-8")) + .withBucketAssigner(new KeyBucketAssigner()) + .withRollingPolicy(OnCheckpointRollingPolicy.build()) + .withOutputFileConfig(config) + .build() + +``` +{{< /tab >}} +{{< /tabs >}} + +## 重要注意事项 + +### 通用注意事项 + +重要提示 1: 使用 Hadoop < 2.7 时,请使用 `OnCheckpointRollingPolicy` 滚动策略,该策略会在每次检查点时进行文件滚动。 +这样做的原因是如果部分文件的生命周期跨多个检查点,当 `FileSink` 从之前的检查点进行恢复时会调用文件系统的 `truncate()` 方法清理 in-progress 文件中未提交的数据。 +Hadoop 2.7 之前的版本不支持这个方法,因此 Flink 会报异常。 + +重要提示 2: 鉴于 Flink 的 sink 以及 UDF 通常不会区分作业的正常结束(比如有限流)和异常终止,因此正常结束作业的最后一批 in-progress 文件不会被转换到 "完成" 状态。 + +重要提示 3: Flink 以及 `FileSink` 不会覆盖已经提交的数据。因此如果尝试从一个包含 in-progress 文件的旧 checkpoint/savepoint 恢复, +且这些 in-progress 文件会被接下来的成功 checkpoint 提交,Flink 会因为无法找到 in-progress 文件而抛异常,从而恢复失败。 + +重要提示 4: 目前 `FileSink` 只支持三种文件系统: HDFS、S3和Local。如果配置了不支持的文件系统,在执行的时候 Flink 会抛出异常。 + +### Batch 模式 + +重要提示 1: 尽管负责写出数据的 Writer 会使用用户提定的并发,负责提交文件的 Committer 将固定并发度为1。 + +Important Note 2: 批模式下只有在所有输入都被处理后 Pending 文件才会被提交,即转为 Finished 状态。 + +Important Note 3: 在高可用模式下,如果在 Committer 提交文件时发生了 JobManager 重启,已提交的数据可能会被重复产生。这一问题将在后续版本中修复。 + +### S3 特有的注意事项 + +重要提示 1: 对于 S3,`FileSink` 只支持基于 [Hadoop](https://hadoop.apache.org/) +的文件系统实现,不支持基于 [Presto](https://prestodb.io/) 的实现。如果想使用 `FileSink` 向 S3 写入数据并且将 +checkpoint 放在基于 Presto 的文件系统,建议明确指定 *"s3a://"* (for Hadoop)作为sink的目标路径方案,并且为 checkpoint 路径明确指定 *"s3p://"* (for Presto)。 +如果 Sink 和 checkpoint 都使用 *"s3://"* 路径的话,可能会导致不可预知的行为,因为双方的实现都在“监听”这个路径。 + +重要提示 2: `FileSink` 使用 S3 的 [Multi-part Upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html) +(后续使用MPU代替)特性可以保证精确一次的语义。这个特性支持以独立的块(因此被称为"multi-part")模式上传文件,当 MPU 的所有部分文件 +成功上传之后,可以合并成原始文件。对于失效的 MPUs,S3 提供了一个基于桶生命周期的规则,用户可以用这个规则来丢弃在指定时间内未完成的MPU。 +如果在一些部分文件还未上传时触发 savepoint,并且这个规则设置的比较严格,这意味着相关的 MPU在作业重启之前可能会超时。后续的部分文件没 +有写入到 savepoint, 那么在 Flink 作业从 savepoint 恢复时,会因为拿不到缺失的部分文件,导致任务失败并抛出异常。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/guarantees.md b/docs/content.zh/docs/connectors/datastream/guarantees.md new file mode 100644 index 0000000000000..1f60466d64651 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/guarantees.md @@ -0,0 +1,141 @@ +--- +title: 容错保证 +weight: 1 +type: docs +aliases: + - 
/zh/dev/connectors/guarantees.html
+---
+
+# Data Source 和 Sink 的容错保证
+
+当程序出现错误的时候,Flink 的容错机制能恢复并继续运行程序。这种错误包括机器硬件故障、网络故障、瞬态程序故障等等。
+
+只有当 source 参与了快照机制的时候,Flink 才能保证对自定义状态的精确一次更新。下表列举了 Flink 与其自带连接器的状态更新的保证。
+
+请阅读各个连接器的文档来了解容错保证的细节。
+
+| Source | Guarantees | Notes |
+| ------ | ---------- | ----- |
+| Apache Kafka | 精确一次 | 根据你的版本用恰当的 Kafka 连接器 |
+| AWS Kinesis Streams | 精确一次 | |
+| RabbitMQ | 至多一次 (v 0.10) / 精确一次 (v 1.0) | |
+| Twitter Streaming API | 至多一次 | |
+| Google PubSub | 至少一次 | |
+| Collections | 精确一次 | |
+| Files | 精确一次 | |
+| Sockets | 至多一次 | |
+
+为了保证端到端精确一次的数据交付(在精确一次的状态语义上更进一步),sink 需要参与 checkpointing 机制。下表列举了 Flink 与其自带 sink 的交付保证(假设精确一次状态更新)。
+
+| Sink | Guarantees | Notes |
+| ---- | ---------- | ----- |
+| Elasticsearch | 至少一次 | |
+| Kafka producer | 至少一次 / 精确一次 | 当使用事务生产者时,保证精确一次 (v 0.11+) |
+| Cassandra sink | 至少一次 / 精确一次 | 只有当更新是幂等时,保证精确一次 |
+| AWS Kinesis Streams | 至少一次 | |
+| File sinks | 精确一次 | |
+| Socket sinks | 至少一次 | |
+| Standard output | 至少一次 | |
+| Redis sink | 至少一次 | |
    + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/jdbc.md b/docs/content.zh/docs/connectors/datastream/jdbc.md new file mode 100644 index 0000000000000..9bfb923e8d231 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/jdbc.md @@ -0,0 +1,61 @@ +--- +title: JDBC +weight: 10 +type: docs +aliases: + - /zh/dev/connectors/jdbc.html +--- + + +# JDBC Connector + +该连接器可以向 JDBC 数据库写入数据。 + +添加下面的依赖以便使用该连接器(同时添加 JDBC 驱动): + +{{< artifact flink-connector-jdbc withScalaVersion >}} + +注意该连接器目前还 __不是__ 二进制发行版的一部分,如何在集群中运行请参考 [这里]({{< ref "docs/dev/datastream/project-configuration" >}})。 + +已创建的 JDBC Sink 能够保证至少一次的语义。 +更有效的精确执行一次可以通过 upsert 语句或幂等更新实现。 + +用法示例: +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env + .fromElements(...) + .addSink(JdbcSink.sink( + "insert into books (id, title, author, price, qty) values (?,?,?,?,?)", + (ps, t) -> { + ps.setInt(1, t.id); + ps.setString(2, t.title); + ps.setString(3, t.author); + ps.setDouble(4, t.price); + ps.setInt(5, t.qty); + }, + new JdbcConnectionOptions.JdbcConnectionOptionsBuilder() + .withUrl(getDbMetadata().getUrl()) + .withDriverName(getDbMetadata().getDriverClass()) + .build())); +env.execute(); +``` + +更多细节请查看 API documentation 。 diff --git a/docs/content.zh/docs/connectors/datastream/kafka.md b/docs/content.zh/docs/connectors/datastream/kafka.md new file mode 100644 index 0000000000000..763995c631d8d --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/kafka.md @@ -0,0 +1,512 @@ +--- +title: Kafka +weight: 2 +type: docs +aliases: + - /zh/dev/connectors/kafka.html + - /zh/apis/streaming/connectors/kafka.html +--- + + +# Apache Kafka 连接器 + +Flink 提供了 [Apache Kafka](https://kafka.apache.org) 连接器,用于从 Kafka topic 中读取或者向其中写入数据,可提供精确一次的处理语义。 + + + +## 依赖 + +Apache Flink 集成了通用的 Kafka 连接器,它会尽力与 Kafka client 的最新版本保持同步。该连接器使用的 Kafka client 版本可能会在 Flink 版本之间发生变化。 +当前 Kafka client 向后兼容 0.10.0 或更高版本的 Kafka broker。 +有关 Kafka 兼容性的更多细节,请参考 [Kafka 官方文档](https://kafka.apache.org/protocol.html#protocol_compatibility)。 + +{{< artifact flink-connector-kafka withScalaVersion >}} + +Flink 目前的流连接器还不是二进制发行版的一部分。 +[在此处]({{< ref "docs/dev/datastream/project-configuration" >}})可以了解到如何链接它们,从而在集群中运行。 + + + +## Kafka Consumer + +Flink 的 Kafka consumer 称为 `FlinkKafkaConsumer`。它提供对一个或多个 Kafka topics 的访问。 + +构造函数接受以下参数: + +1. Topic 名称或者名称列表 +2. 用于反序列化 Kafka 数据的 DeserializationSchema 或者 KafkaDeserializationSchema +3. Kafka 消费者的属性。需要以下属性: + - "bootstrap.servers"(以逗号分隔的 Kafka broker 列表) + - "group.id" 消费组 ID + +{{< tabs "fdf41307-604d-426f-9863-666250ce0cdc" >}} +{{< tab "Java" >}} +```java +Properties properties = new Properties(); +properties.setProperty("bootstrap.servers", "localhost:9092"); +properties.setProperty("group.id", "test"); +DataStream stream = env + .addSource(new FlinkKafkaConsumer<>("topic", new SimpleStringSchema(), properties)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val properties = new Properties() +properties.setProperty("bootstrap.servers", "localhost:9092") +properties.setProperty("group.id", "test") +val stream = env + .addSource(new FlinkKafkaConsumer[String]("topic", new SimpleStringSchema(), properties)) +``` +{{< /tab >}} +{{< /tabs >}} + + + +### `DeserializationSchema` + +Flink Kafka Consumer 需要知道如何将 Kafka 中的二进制数据转换为 Java 或者 Scala 对象。`KafkaDeserializationSchema` 允许用户指定这样的 schema,每条 Kafka 中的消息会调用 `T deserialize(ConsumerRecord record)` 反序列化。 + +为了方便使用,Flink 提供了以下几种 schemas: + +1. 
`TypeInformationSerializationSchema`(和 `TypeInformationKeyValueSerializationSchema`) 基于 Flink 的 `TypeInformation` 创建 `schema`。 + 如果该数据的读和写都发生在 Flink 中,那么这将是非常有用的。此 schema 是其他通用序列化方法的高性能 Flink 替代方案。 + +2. `JsonDeserializationSchema`(和 `JSONKeyValueDeserializationSchema`)将序列化的 JSON 转化为 ObjectNode 对象,可以使用 `objectNode.get("field").as(Int/String/...)()` 来访问某个字段。 + KeyValue objectNode 包含一个含所有字段的 key 和 values 字段,以及一个可选的"metadata"字段,可以访问到消息的 offset、partition、topic 等信息。 + +3. `AvroDeserializationSchema` 使用静态提供的 schema 读取 Avro 格式的序列化数据。 + 它能够从 Avro 生成的类(`AvroDeserializationSchema.forSpecific(...)`)中推断出 schema,或者可以与 `GenericRecords` + 一起使用手动提供的 schema(用 `AvroDeserializationSchema.forGeneric(...)`)。此反序列化 schema 要求序列化记录不能包含嵌入式架构! + + - 此模式还有一个版本,可以在 [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/docs/index.html) 中查找编写器的 schema(用于编写记录的 schema)。 + - 使用这些反序列化 schema 记录将读取从 schema 注册表检索到的 schema 转换为静态提供的 schema(或者通过 `ConfluentRegistryAvroDeserializationSchema.forGeneric(...)` 或 `ConfluentRegistryAvroDeserializationSchema.forSpecific(...)`)。 + + - 您还可以使用AWS实现的[AWS Glue Schema Registry](https://docs.aws.amazon.com/glue/latest/dg/schema-registry.html)来查找编写器的 schema 。相似地,反序列化的记录会读取从 AWS Glue Schema Registry 检索到的 schema 并转换为静态提供的 schema + (或者通过 `GlueSchemaRegistryAvroDeserializationSchema.forGeneric(...)` 或 `GlueSchemaRegistryAvroDeserializationSchema.forSpecific(...)`)。有关 AWS Glue Schema Registry 与 Apache Flink 适配的更多信息,请参见 + [Use Case: Amazon Kinesis Data Analytics for Apache Flink](https://docs.aws.amazon.com/glue/latest/dg/schema-registry-integrations.html#schema-registry-integrations-kinesis-data-analytics-apache-flink). + +
    要使用此反序列化 schema 必须添加以下依赖: + +{{< tabs "28c9b976-d85a-4d98-ad0b-7ca427c85b57" >}} +{{< tab "AvroDeserializationSchema" >}} +```xml + + org.apache.flink + flink-avro + {{site.version }} + +``` +{{< /tab >}} +{{< tab "ConfluentRegistryAvroDeserializationSchema" >}} +```xml + + org.apache.flink + flink-avro-confluent-registry + {{site.version }} + +``` +{{< /tab >}} +{{< tab "GlueSchemaRegistryAvroDeserializationSchema" >}} +```xml + + org.apache.flink + flink-avro-glue-schema-registry + {{site.version }} + +``` +{{< /tab >}} +{{< /tabs >}} + +当遇到因一些原因而无法反序列化的损坏消息时,反序列化 schema 会返回 `null`,以允许 Flink Kafka 消费者悄悄地跳过损坏的消息。请注意,由于 consumer 的容错能力(请参阅下面的部分以获取更多详细信息),在损坏的消息上失败作业将使 consumer 尝试再次反序列化消息。因此,如果反序列化仍然失败,则 consumer 将在该损坏的消息上进入不间断重启和失败的循环。 + + + +### 配置 Kafka Consumer 开始消费的位置 + +Flink Kafka Consumer 允许通过配置来确定 Kafka 分区的起始位置。 + +{{< tabs "dd71055b-6b2d-4e61-8c4b-5e93aeaf939a" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +FlinkKafkaConsumer myConsumer = new FlinkKafkaConsumer<>(...); +myConsumer.setStartFromEarliest(); // 尽可能从最早的记录开始 +myConsumer.setStartFromLatest(); // 从最新的记录开始 +myConsumer.setStartFromTimestamp(...); // 从指定的时间开始(毫秒) +myConsumer.setStartFromGroupOffsets(); // 默认的方法 + +DataStream stream = env.addSource(myConsumer); +... +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +val myConsumer = new FlinkKafkaConsumer[String](...) +myConsumer.setStartFromEarliest() // 尽可能从最早的记录开始 +myConsumer.setStartFromLatest() // 从最新的记录开始 +myConsumer.setStartFromTimestamp(...) // 从指定的时间开始(毫秒) +myConsumer.setStartFromGroupOffsets() // 默认的方法 + +val stream = env.addSource(myConsumer) +... +``` +{{< /tab >}} +{{< /tabs >}} + +Flink Kafka Consumer 的所有版本都具有上述明确的起始位置配置方法。 + + * `setStartFromGroupOffsets`(默认方法):从 Kafka brokers 中的 consumer 组(consumer 属性中的 `group.id` 设置)提交的偏移量中开始读取分区。 + 如果找不到分区的偏移量,那么将会使用配置中的 `auto.offset.reset` 设置。 + * `setStartFromEarliest()` 或者 `setStartFromLatest()`:从最早或者最新的记录开始消费,在这些模式下,Kafka 中的 committed offset 将被忽略,不会用作起始位置。 + * `setStartFromTimestamp(long)`:从指定的时间戳开始。对于每个分区,其时间戳大于或等于指定时间戳的记录将用作起始位置。如果一个分区的最新记录早于指定的时间戳,则只从最新记录读取该分区数据。在这种模式下,Kafka 中的已提交 offset 将被忽略,不会用作起始位置。 + +你也可以为每个分区指定 consumer 应该开始消费的具体 offset: + +{{< tabs "3fc8a5ad-77df-4ebb-bc02-d954d1eb29a7" >}} +{{< tab "Java" >}} +```java +Map specificStartOffsets = new HashMap<>(); +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 0), 23L); +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 1), 31L); +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 2), 43L); + +myConsumer.setStartFromSpecificOffsets(specificStartOffsets); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val specificStartOffsets = new java.util.HashMap[KafkaTopicPartition, java.lang.Long]() +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 0), 23L) +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 1), 31L) +specificStartOffsets.put(new KafkaTopicPartition("myTopic", 2), 43L) + +myConsumer.setStartFromSpecificOffsets(specificStartOffsets) +``` +{{< /tab >}} +{{< /tabs >}} + +上面的例子中使用的配置是指定从 `myTopic` 主题的 0 、1 和 2 分区的指定偏移量开始消费。offset 值是 consumer 应该为每个分区读取的下一条消息。请注意:如果 consumer 需要读取在提供的 offset 映射中没有指定 offset 的分区,那么它将回退到该特定分区的默认组偏移行为(即 `setStartFromGroupOffsets()`)。 + + +请注意:当 Job 从故障中自动恢复或使用 savepoint 手动恢复时,这些起始位置配置方法不会影响消费的起始位置。在恢复时,每个 Kafka 分区的起始位置由存储在 savepoint 或 checkpoint 中的 offset 确定(有关 checkpointing 的信息,请参阅下一节,以便为 consumer 启用容错功能)。 + + + +### Kafka 
Consumer 和容错 + +伴随着启用 Flink 的 checkpointing 后,Flink Kafka Consumer 将使用 topic 中的记录,并以一致的方式定期检查其所有 Kafka offset 和其他算子的状态。如果 Job 失败,Flink 会将流式程序恢复到最新 checkpoint 的状态,并从存储在 checkpoint 中的 offset 开始重新消费 Kafka 中的消息。 + +因此,设置 checkpoint 的间隔定义了程序在发生故障时最多需要返回多少。 + +为了使 Kafka Consumer 支持容错,需要在 [执行环境]({{< ref "docs/deployment/config" >}}#execution-checkpointing-interval) 中启用拓扑的 checkpointing。 + +如果未启用 checkpoint,那么 Kafka consumer 将定期向 Zookeeper 提交 offset。 + + + +### Kafka Consumer Topic 和分区发现 + + + +#### 分区发现 + +Flink Kafka Consumer 支持发现动态创建的 Kafka 分区,并使用精准一次的语义保证去消耗它们。在初始检索分区元数据之后(即,当 Job 开始运行时)发现的所有分区将从最早可能的 offset 中消费。 + +默认情况下,是禁用了分区发现的。若要启用它,请在提供的属性配置中为 `flink.partition-discovery.interval-millis` 设置大于 0 的值,表示发现分区的间隔是以毫秒为单位的。 + + + +#### Topic 发现 + +在更高的级别上,Flink Kafka Consumer 还能够使用正则表达式基于 Topic 名称的模式匹配来发现 Topic。请看下面的例子: + +{{< tabs "46a10932-ea0f-4cba-aa4e-e12930963406" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +Properties properties = new Properties(); +properties.setProperty("bootstrap.servers", "localhost:9092"); +properties.setProperty("group.id", "test"); + +FlinkKafkaConsumer myConsumer = new FlinkKafkaConsumer<>( + java.util.regex.Pattern.compile("test-topic-[0-9]"), + new SimpleStringSchema(), + properties); + +DataStream stream = env.addSource(myConsumer); +... +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +val properties = new Properties() +properties.setProperty("bootstrap.servers", "localhost:9092") +properties.setProperty("group.id", "test") + +val myConsumer = new FlinkKafkaConsumer[String]( + java.util.regex.Pattern.compile("test-topic-[0-9]"), + new SimpleStringSchema, + properties) + +val stream = env.addSource(myConsumer) +... 
+``` +{{< /tab >}} +{{< /tabs >}} + +在上面的例子中,当 Job 开始运行时,Consumer 将订阅名称与指定正则表达式匹配的所有主题(以 `test-topic` 开头并以单个数字结尾)。 + +要允许 consumer 在作业开始运行后发现动态创建的主题,那么请为 `flink.partition-discovery.interval-millis` 设置非负值。这允许 consumer 发现名称与指定模式匹配的新主题的分区。 + + + +### Kafka Consumer 提交 Offset 的行为配置 + +Flink Kafka Consumer 允许有配置如何将 offset 提交回 Kafka broker 的行为。请注意:Flink Kafka Consumer 不依赖于提交的 offset 来实现容错保证。提交的 offset 只是一种方法,用于公开 consumer 的进度以便进行监控。 + +配置 offset 提交行为的方法是否相同,取决于是否为 job 启用了 checkpointing。 + + - *禁用 Checkpointing:* 如果禁用了 checkpointing,则 Flink Kafka Consumer 依赖于内部使用的 Kafka client 自动定期 offset 提交功能。 + 因此,要禁用或启用 offset 的提交,只需将 `enable.auto.commit` 或者 `auto.commit.interval.ms` 的Key 值设置为提供的 `Properties` 配置中的适当值。 + + - *启用 Checkpointing:* 如果启用了 checkpointing,那么当 checkpointing 完成时,Flink Kafka Consumer 将提交的 offset 存储在 checkpoint 状态中。 + 这确保 Kafka broker 中提交的 offset 与 checkpoint 状态中的 offset 一致。 + 用户可以通过调用 consumer 上的 `setCommitOffsetsOnCheckpoints(boolean)` 方法来禁用或启用 offset 的提交(默认情况下,这个值是 true )。 + 注意,在这个场景中,`Properties` 中的自动定期 offset 提交设置会被完全忽略。 + + + +### Kafka Consumer 和 时间戳抽取以及 watermark 发送 + +在许多场景中,记录的时间戳是(显式或隐式)嵌入到记录本身中。此外,用户可能希望定期或以不规则的方式 Watermark,例如基于 Kafka 流中包含当前事件时间的 watermark 的特殊记录。对于这些情况,Flink Kafka Consumer 允许指定 `AssignerWithPeriodicWatermarks` 或 `AssignerWithPunctuatedWatermarks`。 + +你可以按照[此处]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}})的说明指定自定义时间戳抽取器或者 Watermark 发送器,或者使用 [内置的]({{< ref "docs/dev/datastream/event-time/built_in" >}})。你也可以通过以下方式将其传递给你的 consumer: + +{{< tabs "c706ebfc-5d9b-49b2-8899-f3ac259a55cc" >}} +{{< tab "Java" >}} +```java +Properties properties = new Properties(); +properties.setProperty("bootstrap.servers", "localhost:9092"); +properties.setProperty("group.id", "test"); + +FlinkKafkaConsumer myConsumer = + new FlinkKafkaConsumer<>("topic", new SimpleStringSchema(), properties); +myConsumer.assignTimestampsAndWatermarks( + WatermarkStrategy. + .forBoundedOutOfOrderness(Duration.ofSeconds(20))); + +DataStream stream = env.addSource(myConsumer); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val properties = new Properties() +properties.setProperty("bootstrap.servers", "localhost:9092") +properties.setProperty("group.id", "test") + +val myConsumer = + new FlinkKafkaConsumer("topic", new SimpleStringSchema(), properties); +myConsumer.assignTimestampsAndWatermarks( + WatermarkStrategy. + .forBoundedOutOfOrderness(Duration.ofSeconds(20))) + +val stream = env.addSource(myConsumer) +``` +{{< /tab >}} +{{< /tabs >}} + +**请注意**:如果 watermark assigner 依赖于从 Kafka 读取的消息来上涨其 watermark (通常就是这种情况),那么所有主题和分区都需要有连续的消息流。否则,整个应用程序的 watermark 将无法上涨,所有基于时间的算子(例如时间窗口或带有计时器的函数)也无法运行。单个的 Kafka 分区也会导致这种反应。考虑设置适当的 [idelness timeouts]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}#dealing-with-idle-sources) 来缓解这个问题。 + + + +## Kafka Producer + +Flink Kafka Producer 被称为 `FlinkKafkaProducer`。它允许将消息流写入一个或多个 Kafka topic。 + +构造器接收下列参数: + +1. 事件被写入的默认输出 topic +2. 序列化数据写入 Kafka 的 SerializationSchema / KafkaSerializationSchema +3. Kafka client 的 Properties。下列 property 是必须的: + * “bootstrap.servers” (逗号分隔 Kafka broker 列表) +4. 
容错语义 + +{{< tabs "f6c1b77e-6b17-4fd3-837a-c9257e6c7c00" >}} +{{< tab "Java" >}} +```java +DataStream stream = ...; + +Properties properties = new Properties(); +properties.setProperty("bootstrap.servers", "localhost:9092"); + +FlinkKafkaProducer myProducer = new FlinkKafkaProducer( + "my-topic", // 目标 topic + new SimpleStringSchema() // 序列化 schema + properties, // producer 配置 + FlinkKafkaProducer.Semantic.EXACTLY_ONCE); // 容错 + +stream.addSink(myProducer); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val stream: DataStream[String] = ... + +val properties = new Properties +properties.setProperty("bootstrap.servers", "localhost:9092") + +val myProducer = new FlinkKafkaProducer[String]( + "my-topic", // 目标 topic + new SimpleStringSchema(), // 序列化 schema + properties, // producer 配置 + FlinkKafkaProducer.Semantic.EXACTLY_ONCE) // 容错 + +stream.addSink(myProducer) +``` +{{< /tab >}} +{{< /tabs >}} + + + +## `SerializationSchema` + +Flink Kafka Producer 需要知道如何将 Java/Scala 对象转化为二进制数据。 + +`KafkaSerializationSchema` 允许用户指定这样的 schema。它会为每个记录调用 `ProducerRecord serialize(T element, @Nullable Long timestamp)` 方法,产生一个写入到 Kafka 的 `ProducerRecord`。 + +用户可以对如何将数据写到 Kafka 进行细粒度的控制。你可以通过 producer record: + +* 设置 header 值 +* 为每个 record 定义 key +* 指定数据的自定义分区 + + + +### Kafka Producer 和容错 + +启用 Flink 的 checkpointing 后,`FlinkKafkaProducer` 可以提供精确一次的语义保证。 + +除了启用 Flink 的 checkpointing,你也可以通过将适当的 `semantic` 参数传递给 `FlinkKafkaProducer` 来选择三种不同的操作模式: + +* `Semantic.NONE`:Flink 不会有任何语义的保证,产生的记录可能会丢失或重复。 +* `Semantic.AT_LEAST_ONCE`(默认设置):可以保证不会丢失任何记录(但是记录可能会重复) +* `Semantic.EXACTLY_ONCE`:使用 Kafka 事务提供精确一次语义。无论何时,在使用事务写入 Kafka 时,都要记得为所有消费 Kafka 消息的应用程序设置所需的 `isolation.level`(`read_committed` 或 `read_uncommitted` - 后者是默认值)。 + + + +##### 注意事项 + +`Semantic.EXACTLY_ONCE` 模式依赖于事务提交的能力。事务提交发生于触发 checkpoint 之前,以及从 checkpoint 恢复之后。如果从 Flink 应用程序崩溃到完全重启的时间超过了 Kafka 的事务超时时间,那么将会有数据丢失(Kafka 会自动丢弃超出超时时间的事务)。考虑到这一点,请根据预期的宕机时间来合理地配置事务超时时间。 + +默认情况下,Kafka broker 将 `transaction.max.timeout.ms` 设置为 15 分钟。此属性不允许为大于其值的 producer 设置事务超时时间。 +默认情况下,`FlinkKafkaProducer` 将 producer config 中的 `transaction.timeout.ms` 属性设置为 1 小时,因此在使用 `Semantic.EXACTLY_ONCE` 模式之前应该增加 `transaction.max.timeout.ms` 的值。 + +在 `KafkaConsumer` 的 `read_committed` 模式中,任何未结束(既未中止也未完成)的事务将阻塞来自给定 Kafka topic 的未结束事务之后的所有读取数据。 +换句话说,在遵循如下一系列事件之后: + +1. 用户启动了 `transaction1` 并使用它写了一些记录 +2. 用户启动了 `transaction2` 并使用它编写了一些其他记录 +3. 
用户提交了 `transaction2` + +即使 `transaction2` 中的记录已提交,在提交或中止 `transaction1` 之前,消费者也不会看到这些记录。这有 2 层含义: + + * 首先,在 Flink 应用程序的正常工作期间,用户可以预料 Kafka 主题中生成的记录的可见性会延迟,相当于已完成 checkpoint 之间的平均时间。 + * 其次,在 Flink 应用程序失败的情况下,此应用程序正在写入的供消费者读取的主题将被阻塞,直到应用程序重新启动或配置的事务超时时间过去后,才恢复正常。此标注仅适用于有多个 agent 或者应用程序写入同一 Kafka 主题的情况。 + +**注意**:`Semantic.EXACTLY_ONCE` 模式为每个 `FlinkKafkaProducer` 实例使用固定大小的 KafkaProducer 池。每个 checkpoint 使用其中一个 producer。如果并发 checkpoint 的数量超过池的大小,`FlinkKafkaProducer` 将抛出异常,并导致整个应用程序失败。请合理地配置最大池大小和最大并发 checkpoint 数量。 + +**注意**:`Semantic.EXACTLY_ONCE` 会尽一切可能不留下任何逗留的事务,否则会阻塞其他消费者从这个 Kafka topic 中读取数据。但是,如果 Flink 应用程序在第一次 checkpoint 之前就失败了,那么在重新启动此类应用程序后,系统中不会有先前池大小(pool size)相关的信息。因此,在第一次 checkpoint 完成前对 Flink 应用程序进行缩容,且并发数缩容倍数大于安全系数 `FlinkKafkaProducer.SAFE_SCALE_DOWN_FACTOR` 的值的话,是不安全的。 + + + +## Kafka 连接器指标 + +Flink 的 Kafka 连接器通过 Flink 的 [metric 系统]({{< ref "docs/ops/metrics" >}}) 提供一些指标来分析 Kafka Connector 的状况。Producer 通过 Flink 的 metrics 系统为所有支持的版本导出 Kafka 的内部指标。consumer 从 Kafka 0.10 版本开始导出所有指标。Kafka 在其[文档](http://kafka.apache.org/documentation/#selector_monitoring)中列出了所有导出的指标。 + +除了这些指标之外,所有 consumer 都暴露了每个主题分区的 `current-offsets` 和 `committed-offsets`。`current-offsets` 是指分区中的当前偏移量。指的是我们成功检索和发出的最后一个元素的偏移量。`committed-offsets` 是最后提交的偏移量。这为用户提供了 at-least-once 语义,用于提交给 Zookeeper 或 broker 的偏移量。对于 Flink 的偏移检查点,系统提供精准一次语义。 + +提交给 ZK 或 broker 的偏移量也可以用来跟踪 Kafka consumer 的读取进度。每个分区中提交的偏移量和最近偏移量之间的差异称为 *consumer lag*。如果 Flink 拓扑消耗来自 topic 的数据的速度比添加新数据的速度慢,那么延迟将会增加,consumer 将会滞后。对于大型生产部署,我们建议监视该指标,以避免增加延迟。 + + + +## 启用 Kerberos 身份验证 + +Flink 通过 Kafka 连接器提供了一流的支持,可以对 Kerberos 配置的 Kafka 安装进行身份验证。只需在 `flink-conf.yaml` 中配置 Flink。像这样为 Kafka 启用 Kerberos 身份验证: + +1. 通过设置以下内容配置 Kerberos 票据 + - `security.kerberos.login.use-ticket-cache`:默认情况下,这个值是 `true`,Flink 将尝试在 `kinit` 管理的票据缓存中使用 Kerberos 票据。注意!在 YARN 上部署的 Flink jobs 中使用 Kafka 连接器时,使用票据缓存的 Kerberos 授权将不起作用。使用 Mesos 进行部署时也是如此,因为 Mesos 部署不支持使用票据缓存进行授权。 + - `security.kerberos.login.keytab` 和 `security.kerberos.login.principal`:要使用 Kerberos keytabs,需为这两个属性设置值。 + +2. 将 `KafkaClient` 追加到 `security.kerberos.login.contexts`:这告诉 Flink 将配置的 Kerberos 票据提供给 Kafka 登录上下文以用于 Kafka 身份验证。 + +一旦启用了基于 Kerberos 的 Flink 安全性后,只需在提供的属性配置中包含以下两个设置(通过传递给内部 Kafka 客户端),即可使用 Flink Kafka Consumer 或 Producer 向 Kafk a进行身份验证: + +- 将 `security.protocol` 设置为 `SASL_PLAINTEXT`(默认为 `NONE`):用于与 Kafka broker 进行通信的协议。使用独立 Flink 部署时,也可以使用 `SASL_SSL`;请在[此处](https://kafka.apache.org/documentation/#security_configclients)查看如何为 SSL 配置 Kafka 客户端。 +- 将 `sasl.kerberos.service.name` 设置为 `kafka`(默认为 `kafka`):此值应与用于 Kafka broker 配置的 `sasl.kerberos.service.name` 相匹配。客户端和服务器配置之间的服务名称不匹配将导致身份验证失败。 + +有关 Kerberos 安全性 Flink 配置的更多信息,请参见[这里]({{< ref "docs/deployment/config" >}}})。你也可以在[这里]({{< ref "docs/deployment/security/security-kerberos" >}})进一步了解 Flink 如何在内部设置基于 kerberos 的安全性。 + + + +## 升级到最近的连接器版本 + +通用的升级步骤概述见 [升级 Jobs 和 Flink 版本指南]({{< ref "docs/ops/upgrading" >}})。对于 Kafka,你还需要遵循这些步骤: + +* 不要同时升级 Flink 和 Kafka 连接器 +* 确保你对 Consumer 设置了 `group.id` +* 在 Consumer 上设置 `setCommitOffsetsOnCheckpoints(true)`,以便读 offset 提交到 Kafka。务必在停止和恢复 savepoint 前执行此操作。你可能需要在旧的连接器版本上进行停止/重启循环来启用此设置。 +* 在 Consumer 上设置 `setStartFromGroupOffsets(true)`,以便我们从 Kafka 获取读 offset。这只会在 Flink 状态中没有读 offset 时生效,这也是为什么下一步非要重要的原因。 +* 修改 source/sink 分配到的 `uid`。这会确保新的 source/sink 不会从旧的 sink/source 算子中读取状态。 +* 使用 `--allow-non-restored-state` 参数启动新 job,因为我们在 savepoint 中仍然有先前连接器版本的状态。 + + + +## 问题排查 + +
+如果你在使用 Flink 时对 Kafka 有问题,请记住,Flink 只封装 `KafkaConsumer` 或 `KafkaProducer`,你的问题可能独立于 Flink,有时可以通过升级 Kafka broker 程序、重新配置 Kafka broker 程序或在 Flink 中重新配置 `KafkaConsumer` 或 `KafkaProducer` 来解决。下面列出了一些常见问题的示例。
+
    + + + +### 数据丢失 + +根据你的 Kafka 配置,即使在 Kafka 确认写入后,你仍然可能会遇到数据丢失。特别要记住在 Kafka 的配置中设置以下属性: + +- `acks` +- `log.flush.interval.messages` +- `log.flush.interval.ms` +- `log.flush.*` + +上述选项的默认值是很容易导致数据丢失的。请参考 Kafka 文档以获得更多的解释。 + + + +### UnknownTopicOrPartitionException + +导致此错误的一个可能原因是正在进行新的 leader 选举,例如在重新启动 Kafka broker 之后或期间。这是一个可重试的异常,因此 Flink job 应该能够重启并恢复正常运行。也可以通过更改 producer 设置中的 `retries` 属性来规避。但是,这可能会导致重新排序消息,反过来可以通过将 `max.in.flight.requests.per.connection` 设置为 1 来避免不需要的消息。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/kinesis.md b/docs/content.zh/docs/connectors/datastream/kinesis.md new file mode 100644 index 0000000000000..3088f0b9a94d1 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/kinesis.md @@ -0,0 +1,701 @@ +--- +title: Kinesis +weight: 4 +type: docs +aliases: + - /zh/dev/connectors/kinesis.html + - /zh/apis/streaming/connectors/kinesis.html +--- + + +# Amazon Kinesis Data Streams Connector + +The Kinesis connector provides access to [Amazon AWS Kinesis Streams](http://aws.amazon.com/kinesis/streams/). + +To use the connector, add the following Maven dependency to your project: + +{{< artifact flink-connector-kinesis withScalaVersion >}} + +{{< hint warning >}} +**Attention** Prior to Flink version 1.10.0 the `flink-connector-kinesis{{< scala_version >}}` has a dependency on code licensed under the [Amazon Software License](https://aws.amazon.com/asl/). +Linking to the prior versions of flink-connector-kinesis will include this code into your application. +{{< /hint >}} + +Due to the licensing issue, the `flink-connector-kinesis{{< scala_version >}}` artifact is not deployed to Maven central for the prior versions. Please see the version specific documentation for further information. + +## Using the Amazon Kinesis Streams Service +Follow the instructions from the [Amazon Kinesis Streams Developer Guide](https://docs.aws.amazon.com/streams/latest/dev/learning-kinesis-module-one-create-stream.html) +to setup Kinesis streams. + +## Configuring Access to Kinesis with IAM +Make sure to create the appropriate IAM policy to allow reading / writing to / from the Kinesis streams. See examples [here](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html). + +Depending on your deployment you would choose a different Credentials Provider to allow access to Kinesis. +By default, the `AUTO` Credentials Provider is used. +If the access key ID and secret key are set in the configuration, the `BASIC` provider is used. + +A specific Credentials Provider can **optionally** be set by using the `AWSConfigConstants.AWS_CREDENTIALS_PROVIDER` setting. + +Supported Credential Providers are: +* `AUTO` - Using the default AWS Credentials Provider chain that searches for credentials in the following order: `ENV_VARS`, `SYS_PROPS`, `WEB_IDENTITY_TOKEN`, `PROFILE` and EC2/ECS credentials provider. +* `BASIC` - Using access key ID and secret key supplied as configuration. +* `ENV_VAR` - Using `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` environment variables. +* `SYS_PROP` - Using Java system properties aws.accessKeyId and aws.secretKey. +* `PROFILE` - Use AWS credentials profile file to create the AWS credentials. +* `ASSUME_ROLE` - Create AWS credentials by assuming a role. The credentials for assuming the role must be supplied. +* `WEB_IDENTITY_TOKEN` - Create AWS credentials by assuming a role using Web Identity Token. 
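+
+For example, a minimal sketch of picking a provider explicitly (the `ENV_VAR` choice below is just an illustration) could look like this:
+
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+// read credentials from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables
+consumerConfig.put(AWSConfigConstants.AWS_CREDENTIALS_PROVIDER, "ENV_VAR");
+```
+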
+ +## Kinesis Consumer + +The `FlinkKinesisConsumer` is an exactly-once parallel streaming data source that subscribes to multiple AWS Kinesis +streams within the same AWS service region, and can transparently handle resharding of streams while the job is running. Each subtask of the consumer is +responsible for fetching data records from multiple Kinesis shards. The number of shards fetched by each subtask will +change as shards are closed and created by Kinesis. + +Before consuming data from Kinesis streams, make sure that all streams are created with the status "ACTIVE" in the AWS dashboard. + +{{< tabs "58b6c235-48ee-4cf7-aabc-41e0679a3370" >}} +{{< tab "Java" >}} +```java +Properties consumerConfig = new Properties(); +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +consumerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id"); +consumerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key"); +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST"); + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStream kinesis = env.addSource(new FlinkKinesisConsumer<>( + "kinesis_stream_name", new SimpleStringSchema(), consumerConfig)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val consumerConfig = new Properties() +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +consumerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id") +consumerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key") +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST") + +val env = StreamExecutionEnvironment.getExecutionEnvironment + +val kinesis = env.addSource(new FlinkKinesisConsumer[String]( + "kinesis_stream_name", new SimpleStringSchema, consumerConfig)) +``` +{{< /tab >}} +{{< /tabs >}} + +The above is a simple example of using the consumer. Configuration for the consumer is supplied with a `java.util.Properties` +instance, the configuration keys for which can be found in `AWSConfigConstants` (AWS-specific parameters) and +`ConsumerConfigConstants` (Kinesis consumer parameters). The example +demonstrates consuming a single Kinesis stream in the AWS region "us-east-1". The AWS credentials are supplied using the basic method in which +the AWS access key ID and secret access key are directly supplied in the configuration. Also, data is being consumed +from the newest position in the Kinesis stream (the other option will be setting `ConsumerConfigConstants.STREAM_INITIAL_POSITION` +to `TRIM_HORIZON`, which lets the consumer start reading the Kinesis stream from the earliest record possible). + +Other optional configuration keys for the consumer can be found in `ConsumerConfigConstants`. + +Note that the configured parallelism of the Flink Kinesis Consumer source +can be completely independent of the total number of shards in the Kinesis streams. +When the number of shards is larger than the parallelism of the consumer, +then each consumer subtask can subscribe to multiple shards; otherwise +if the number of shards is smaller than the parallelism of the consumer, +then some consumer subtasks will simply be idle and wait until it gets assigned +new shards (i.e., when the streams are resharded to increase the +number of shards for higher provisioned Kinesis service throughput). 
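+To make the relationship between parallelism and shard count concrete, the following sketch (the parallelism value is arbitrary) pins the source to 4 subtasks; whether the stream currently has 2 or 40 shards, the job graph stays the same and only the shard-to-subtask assignment changes:
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+DataStream<String> kinesis = env
+    .addSource(new FlinkKinesisConsumer<>(
+        "kinesis_stream_name", new SimpleStringSchema(), consumerConfig))
+    // 4 subtasks regardless of the shard count; idle subtasks pick up shards
+    // created later by resharding
+    .setParallelism(4);
+```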
+ +Also note that the assignment of shards to subtasks may not be optimal when +shard IDs are not consecutive (as result of dynamic re-sharding in Kinesis). +For cases where skew in the assignment leads to significant imbalanced consumption, +a custom implementation of `KinesisShardAssigner` can be set on the consumer. + +### The `DeserializationSchema` + +Flink Kinesis Consumer also needs a schema to know how to turn the binary data in a Kinesis Data Stream into Java objects. +The `KinesisDeserializationSchema` allows users to specify such a schema. The `T deserialize(byte[] recordValue, String partitionKey, String seqNum, long approxArrivalTimestamp, String stream, String shardId)` +method gets called for each Kinesis record. + +For convenience, Flink provides the following schemas out of the box: + +1. `TypeInformationSerializationSchema` which creates a schema based on a Flink's `TypeInformation`. + This is useful if the data is both written and read by Flink. + This schema is a performant Flink-specific alternative to other generic serialization approaches. + +2. `AvroDeserializationSchema` which reads data serialized with Avro format using a statically provided schema. It can + infer the schema from Avro generated classes (`AvroDeserializationSchema.forSpecific(...)`) or it can work with `GenericRecords` + with a manually provided schema (with `AvroDeserializationSchema.forGeneric(...)`). This deserialization schema expects that + the serialized records DO NOT contain embedded schema. + + - You can use [AWS Glue Schema Registry](https://docs.aws.amazon.com/glue/latest/dg/schema-registry.html) + to retrieve the writer’s schema. Similarly, the deserialization record will be read with the schema from AWS Glue Schema Registry and transformed + (either through `GlueSchemaRegistryAvroDeserializationSchema.forGeneric(...)` or `GlueSchemaRegistryAvroDeserializationSchema.forSpecific(...)`). + For more information on integrating the AWS Glue Schema Registry with Apache Flink see + [Use Case: Amazon Kinesis Data Analytics for Apache Flink](https://docs.aws.amazon.com/glue/latest/dg/schema-registry-integrations.html#schema-registry-integrations-kinesis-data-analytics-apache-flink). + +
    To use this deserialization schema one has to add the following additional dependency: + +{{< tabs "8c6721c7-4a48-496e-b0fe-6522cf6a5e13" >}} +{{< tab "AvroDeserializationSchema" >}} +{{< artifact flink-avro >}} +{{< /tab >}} +{{< tab "GlueSchemaRegistryAvroDeserializationSchema" >}} +{{< artifact flink-avro-glue-schema-registry >}} +{{< /tab >}} +{{< /tabs >}} + +### Configuring Starting Position + +The Flink Kinesis Consumer currently provides the following options to configure where to start reading Kinesis streams, simply by setting `ConsumerConfigConstants.STREAM_INITIAL_POSITION` to +one of the following values in the provided configuration properties (the naming of the options identically follows [the namings used by the AWS Kinesis Streams service](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html#API_GetShardIterator_RequestSyntax)): + +- `LATEST`: read all shards of all streams starting from the latest record. +- `TRIM_HORIZON`: read all shards of all streams starting from the earliest record possible (data may be trimmed by Kinesis depending on the retention settings). +- `AT_TIMESTAMP`: read all shards of all streams starting from a specified timestamp. The timestamp must also be specified in the configuration +properties by providing a value for `ConsumerConfigConstants.STREAM_INITIAL_TIMESTAMP`, in one of the following date pattern : + - a non-negative double value representing the number of seconds that has elapsed since the Unix epoch (for example, `1459799926.480`). + - a user defined pattern, which is a valid pattern for `SimpleDateFormat` provided by `ConsumerConfigConstants.STREAM_TIMESTAMP_DATE_FORMAT`. + If `ConsumerConfigConstants.STREAM_TIMESTAMP_DATE_FORMAT` is not defined then the default pattern will be `yyyy-MM-dd'T'HH:mm:ss.SSSXXX` + (for example, timestamp value is `2016-04-04` and pattern is `yyyy-MM-dd` given by user or timestamp value is `2016-04-04T19:58:46.480-00:00` without given a pattern). + +### Fault Tolerance for Exactly-Once User-Defined State Update Semantics + +With Flink's checkpointing enabled, the Flink Kinesis Consumer will consume records from shards in Kinesis streams and +periodically checkpoint each shard's progress. In case of a job failure, Flink will restore the streaming program to the +state of the latest complete checkpoint and re-consume the records from Kinesis shards, starting from the progress that +was stored in the checkpoint. + +The interval of drawing checkpoints therefore defines how much the program may have to go back at most, in case of a failure. + +To use fault tolerant Kinesis Consumers, checkpointing of the topology needs to be enabled at the execution environment: + +{{< tabs "b1399ed7-5855-446d-9684-7a49de9b4c97" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.enableCheckpointing(5000); // checkpoint every 5000 msecs +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +env.enableCheckpointing(5000) // checkpoint every 5000 msecs +``` +{{< /tab >}} +{{< /tabs >}} + +Also note that Flink can only restart the topology if enough processing slots are available to restart the topology. +Therefore, if the topology fails due to loss of a TaskManager, there must still be enough slots available afterwards. +Flink on YARN supports automatic restart of lost YARN containers. 
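+Returning briefly to the starting-position options described earlier, a consumer that (re)starts reading from a fixed point in time could be configured roughly as follows (the timestamp value is illustrative); note that, as explained above, progress restored from a checkpoint takes precedence over this setting:
+
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "AT_TIMESTAMP");
+// Seconds since the Unix epoch; a date string matching
+// ConsumerConfigConstants.STREAM_TIMESTAMP_DATE_FORMAT would also work.
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_TIMESTAMP, "1459799926.480");
+```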
+ +### Using Enhanced Fan-Out + +[Enhanced Fan-Out (EFO)](https://aws.amazon.com/blogs/aws/kds-enhanced-fanout/) increases the maximum +number of concurrent consumers per Kinesis stream. +Without EFO, all concurrent consumers share a single read quota per shard. +Using EFO, each consumer gets a distinct dedicated read quota per shard, allowing read throughput to scale with the number of consumers. +Using EFO will [incur additional cost](https://aws.amazon.com/kinesis/data-streams/pricing/). + +In order to enable EFO two additional configuration parameters are required: + +- `RECORD_PUBLISHER_TYPE`: Determines whether to use `EFO` or `POLLING`. The default `RecordPublisher` is `POLLING`. +- `EFO_CONSUMER_NAME`: A name to identify the consumer. +For a given Kinesis data stream, each consumer must have a unique name. +However, consumer names do not have to be unique across data streams. +Reusing a consumer name will result in existing subscriptions being terminated. + +The code snippet below shows a simple example configurating an EFO consumer. + +{{< tabs "42345893-70c3-4678-a348-4c419b337eb1" >}} +{{< tab "Java" >}} +```java +Properties consumerConfig = new Properties(); +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST"); + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStream kinesis = env.addSource(new FlinkKinesisConsumer<>( + "kinesis_stream_name", new SimpleStringSchema(), consumerConfig)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val consumerConfig = new Properties() +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST") + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +val kinesis = env.addSource(new FlinkKinesisConsumer[String]( + "kinesis_stream_name", new SimpleStringSchema, consumerConfig)) +``` +{{< /tab >}} +{{< /tabs >}} + +#### EFO Stream Consumer Registration/Deregistration + +In order to use EFO, a stream consumer must be registered against each stream you wish to consume. +By default, the `FlinkKinesisConsumer` will register the stream consumer automatically when the Flink job starts. +The stream consumer will be registered using the name provided by the `EFO_CONSUMER_NAME` configuration. +`FlinkKinesisConsumer` provides three registration strategies: + +- Registration + - `LAZY` (default): Stream consumers are registered when the Flink job starts running. + If the stream consumer already exists, it will be reused. + This is the preferred strategy for the majority of applications. + However, jobs with parallelism greater than 1 will result in tasks competing to register and acquire the stream consumer ARN. + For jobs with very large parallelism this can result in an increased start-up time. 
+ The describe operation has a limit of 20 [transactions per second](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html), + this means application startup time will increase by roughly `parallelism/20 seconds`. + - `EAGER`: Stream consumers are registered in the `FlinkKinesisConsumer` constructor. + If the stream consumer already exists, it will be reused. + This will result in registration occurring when the job is constructed, + either on the Flink Job Manager or client environment submitting the job. + Using this strategy results in a single thread registering and retrieving the stream consumer ARN, + reducing startup time over `LAZY` (with large parallelism). + However, consider that the client environment will require access to the AWS services. + - `NONE`: Stream consumer registration is not performed by `FlinkKinesisConsumer`. + Registration must be performed externally using the [AWS CLI or SDK](https://aws.amazon.com/tools/) + to invoke [RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html). + Stream consumer ARNs should be provided to the job via the consumer configuration. +- Deregistration + - `LAZY|EAGER` (default): Stream consumers are deregistered when the job is shutdown gracefully. + In the event that a job terminates within executing the shutdown hooks, stream consumers will remain active. + In this situation the stream consumers will be gracefully reused when the application restarts. + - `NONE`: Stream consumer deregistration is not performed by `FlinkKinesisConsumer`. + +Below is an example configuration to use the `EAGER` registration strategy: + +{{< tabs "a85d716b-6c1c-46d8-9ee4-12d8380a0c06" >}} +{{< tab "Java" >}} +```java +Properties consumerConfig = new Properties(); +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST"); + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE, + ConsumerConfigConstants.EFORegistrationType.EAGER.name()); + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStream kinesis = env.addSource(new FlinkKinesisConsumer<>( + "kinesis_stream_name", new SimpleStringSchema(), consumerConfig)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val consumerConfig = new Properties() +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST") + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE, + ConsumerConfigConstants.EFORegistrationType.EAGER.name()); + +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +val kinesis = env.addSource(new FlinkKinesisConsumer[String]( + "kinesis_stream_name", new SimpleStringSchema, consumerConfig)) +``` +{{< /tab >}} +{{< /tabs >}} + +Below is an example configuration to use the `NONE` registration strategy: + +{{< tabs "00b46c87-7740-4263-8040-2aa7e2960513" >}} +{{< tab "Java" >}} +```java +Properties consumerConfig = new 
Properties(); +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST"); + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE, + ConsumerConfigConstants.EFORegistrationType.NONE.name()); +consumerConfig.put(ConsumerConfigConstants.efoConsumerArn("stream-name"), + "arn:aws:kinesis::>:stream//consumer/:"); + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStream kinesis = env.addSource(new FlinkKinesisConsumer<>( + "kinesis_stream_name", new SimpleStringSchema(), consumerConfig)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val consumerConfig = new Properties() +consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST") + +consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE, + ConsumerConfigConstants.RecordPublisherType.EFO.name()); +consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer"); + +consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE, + ConsumerConfigConstants.EFORegistrationType.NONE.name()); +consumerConfig.put(ConsumerConfigConstants.efoConsumerArn("stream-name"), + "arn:aws:kinesis::>:stream//consumer/:"); + +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +val kinesis = env.addSource(new FlinkKinesisConsumer[String]( + "kinesis_stream_name", new SimpleStringSchema, consumerConfig)) +``` +{{< /tab >}} +{{< /tabs >}} + +### Event Time for Consumed Records + +If streaming topologies choose to use the [event time notion]({{< ref "docs/concepts/time" >}}) for record +timestamps, an *approximate arrival timestamp* will be used by default. This timestamp is attached to records by Kinesis once they +were successfully received and stored by streams. Note that this timestamp is typically referred to as a Kinesis server-side +timestamp, and there are no guarantees about the accuracy or order correctness (i.e., the timestamps may not always be +ascending). + +Users can choose to override this default with a custom timestamp, as described [here]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}), +or use one from the [predefined ones]({{< ref "docs/dev/datastream/event-time/built_in" >}}). After doing so, +it can be passed to the consumer in the following way: + +{{< tabs "8fbaf5cb-3b76-4c62-a74e-db51b60f6600" >}} +{{< tab "Java" >}} +```java +FlinkKinesisConsumer consumer = new FlinkKinesisConsumer<>( + "kinesis_stream_name", + new SimpleStringSchema(), + kinesisConsumerConfig); +consumer.setPeriodicWatermarkAssigner(new CustomAssignerWithPeriodicWatermarks()); +DataStream stream = env + .addSource(consumer) + .print(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val consumer = new FlinkKinesisConsumer[String]( + "kinesis_stream_name", + new SimpleStringSchema(), + kinesisConsumerConfig); +consumer.setPeriodicWatermarkAssigner(new CustomAssignerWithPeriodicWatermarks()); +val stream = env + .addSource(consumer) + .print(); +``` +{{< /tab >}} +{{< /tabs >}} + +Internally, an instance of the assigner is executed per shard / consumer thread (see threading model below). 
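+As an illustration of what such an assigner might look like, below is a minimal sketch of the hypothetical `CustomAssignerWithPeriodicWatermarks` used above, based on the legacy `AssignerWithPeriodicWatermarks` interface. It reuses the Kinesis approximate arrival timestamp that the consumer passes in as `previousElementTimestamp` and allows five seconds of out-of-orderness; both choices are illustrative:
+
+```java
+import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
+import org.apache.flink.streaming.api.watermark.Watermark;
+
+public class CustomAssignerWithPeriodicWatermarks implements AssignerWithPeriodicWatermarks<String> {
+
+    private static final long MAX_OUT_OF_ORDERNESS_MS = 5000;
+
+    private long maxTimestampSeen = Long.MIN_VALUE;
+
+    @Override
+    public long extractTimestamp(String element, long previousElementTimestamp) {
+        // previousElementTimestamp carries the Kinesis approximate arrival timestamp
+        maxTimestampSeen = Math.max(maxTimestampSeen, previousElementTimestamp);
+        return previousElementTimestamp;
+    }
+
+    @Override
+    public Watermark getCurrentWatermark() {
+        // trail the highest timestamp seen so far by the allowed out-of-orderness
+        return new Watermark(maxTimestampSeen == Long.MIN_VALUE
+                ? Long.MIN_VALUE
+                : maxTimestampSeen - MAX_OUT_OF_ORDERNESS_MS);
+    }
+}
+```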
+When an assigner is specified, for each record read from Kinesis, the extractTimestamp(T element, long previousElementTimestamp) +is called to assign a timestamp to the record and getCurrentWatermark() to determine the new watermark for the shard. +The watermark of the consumer subtask is then determined as the minimum watermark of all its shards and emitted periodically. +The per shard watermark is essential to deal with varying consumption speed between shards, that otherwise could lead +to issues with downstream logic that relies on the watermark, such as incorrect late data dropping. + +By default, the watermark is going to stall if shards do not deliver new records. +The property `ConsumerConfigConstants.SHARD_IDLE_INTERVAL_MILLIS` can be used to avoid this potential issue through a +timeout that will allow the watermark to progress despite of idle shards. + +### Event Time Alignment for Shard Consumers + +The Flink Kinesis Consumer optionally supports synchronization between parallel consumer subtasks (and their threads) +to avoid the event time skew related problems described in [Event time synchronization across sources](https://issues.apache.org/jira/browse/FLINK-10886). + +To enable synchronization, set the watermark tracker on the consumer: + +
    +```java +JobManagerWatermarkTracker watermarkTracker = + new JobManagerWatermarkTracker("myKinesisSource"); +consumer.setWatermarkTracker(watermarkTracker); +``` +
    + +The `JobManagerWatermarkTracker` will use a global aggregate to synchronize the per subtask watermarks. Each subtask +uses a per shard queue to control the rate at which records are emitted downstream based on how far ahead of the global +watermark the next record in the queue is. + +The "emit ahead" limit is configured via `ConsumerConfigConstants.WATERMARK_LOOKAHEAD_MILLIS`. Smaller values reduce +the skew but also the throughput. Larger values will allow the subtask to proceed further before waiting for the global +watermark to advance. + +Another variable in the throughput equation is how frequently the watermark is propagated by the tracker. +The interval can be configured via `ConsumerConfigConstants.WATERMARK_SYNC_MILLIS`. +Smaller values reduce emitter waits and come at the cost of increased communication with the job manager. + +Since records accumulate in the queues when skew occurs, increased memory consumption needs to be expected. +How much depends on the average record size. With larger sizes, it may be necessary to adjust the emitter queue capacity via +`ConsumerConfigConstants.WATERMARK_SYNC_QUEUE_CAPACITY`. + +### Threading Model + +The Flink Kinesis Consumer uses multiple threads for shard discovery and data consumption. + +#### Shard Discovery + +For shard discovery, each parallel consumer subtask will have a single thread that constantly queries Kinesis for shard +information even if the subtask initially did not have shards to read from when the consumer was started. In other words, if +the consumer is run with a parallelism of 10, there will be a total of 10 threads constantly querying Kinesis regardless +of the total amount of shards in the subscribed streams. + +#### Polling (default) Record Publisher + +For `POLLING` data consumption, a single thread will be created to consume each discovered shard. Threads will terminate when the +shard it is responsible of consuming is closed as a result of stream resharding. In other words, there will always be +one thread per open shard. + +#### Enhanced Fan-Out Record Publisher + +For `EFO` data consumption the threading model is the same as `POLLING`, with additional thread pools to handle +asynchronous communication with Kinesis. AWS SDK v2.x `KinesisAsyncClient` uses additional threads for +Netty to handle IO and asynchronous response. Each parallel consumer subtask will have their own instance of the `KinesisAsyncClient`. +In other words, if the consumer is run with a parallelism of 10, there will be a total of 10 `KinesisAsyncClient` instances. +A separate client will be created and subsequently destroyed when registering and deregistering stream consumers. + +### Internally Used Kinesis APIs + +The Flink Kinesis Consumer uses the [AWS Java SDK](http://aws.amazon.com/sdk-for-java/) internally to call Kinesis APIs +for shard discovery and data consumption. Due to Amazon's [service limits for Kinesis Streams](http://docs.aws.amazon.com/streams/latest/dev/service-sizes-and-limits.html) +on the APIs, the consumer will be competing with other non-Flink consuming applications that the user may be running. +Below is a list of APIs called by the consumer with description of how the consumer uses the API, as well as information +on how to deal with any errors or warnings that the Flink Kinesis Consumer may have due to these service limits. 
+ +#### Shard Discovery + +- *[ListShards](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_ListShards.html)*: this is constantly called +by a single thread in each parallel consumer subtask to discover any new shards as a result of stream resharding. By default, +the consumer performs the shard discovery at an interval of 10 seconds, and will retry indefinitely until it gets a result +from Kinesis. If this interferes with other non-Flink consuming applications, users can slow down the consumer of +calling this API by setting a value for `ConsumerConfigConstants.SHARD_DISCOVERY_INTERVAL_MILLIS` in the supplied +configuration properties. This sets the discovery interval to a different value. Note that this setting directly impacts +the maximum delay of discovering a new shard and starting to consume it, as shards will not be discovered during the interval. + +#### Polling (default) Record Publisher + +- *[GetShardIterator](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html)*: this is called +only once when per shard consuming threads are started, and will retry if Kinesis complains that the transaction limit for the +API has exceeded, up to a default of 3 attempts. Note that since the rate limit for this API is per shard (not per stream), +the consumer itself should not exceed the limit. Usually, if this happens, users can either try to slow down any other +non-Flink consuming applications of calling this API, or modify the retry behaviour of this API call in the consumer by +setting keys prefixed by `ConsumerConfigConstants.SHARD_GETITERATOR_*` in the supplied configuration properties. + +- *[GetRecords](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetRecords.html)*: this is constantly called +by per shard consuming threads to fetch records from Kinesis. When a shard has multiple concurrent consumers (when there +are any other non-Flink consuming applications running), the per shard rate limit may be exceeded. By default, on each call +of this API, the consumer will retry if Kinesis complains that the data size / transaction limit for the API has exceeded, +up to a default of 3 attempts. Users can either try to slow down other non-Flink consuming applications, or adjust the throughput +of the consumer by setting the `ConsumerConfigConstants.SHARD_GETRECORDS_MAX` and +`ConsumerConfigConstants.SHARD_GETRECORDS_INTERVAL_MILLIS` keys in the supplied configuration properties. Setting the former +adjusts the maximum number of records each consuming thread tries to fetch from shards on each call (default is 10,000), while +the latter modifies the sleep interval between each fetch (default is 200). The retry behaviour of the +consumer when calling this API can also be modified by using the other keys prefixed by `ConsumerConfigConstants.SHARD_GETRECORDS_*`. + +#### Enhanced Fan-Out Record Publisher + +- *[SubscribeToShard](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_SubscribeToShard.html)*: this is called +by per shard consuming threads to obtain shard subscriptions. A shard subscription is typically active for 5 minutes, +but subscriptions will be reaquired if any recoverable errors are thrown. Once a subscription is acquired, the consumer +will receive a stream of [SubscribeToShardEvents](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_SubscribeToShardEvent.html)s. +Retry and backoff parameters can be configured using the `ConsumerConfigConstants.SUBSCRIBE_TO_SHARD_*` keys. 
+ +- *[DescribeStreamSummary](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamSummary.html)*: this is called +once per stream, during stream consumer registration. By default, the `LAZY` registration strategy will scale the +number of calls by the job parallelism. `EAGER` will invoke this once per stream and `NONE` will not invoke this API. +Retry and backoff parameters can be configured using the +`ConsumerConfigConstants.STREAM_DESCRIBE_*` keys. + +- *[DescribeStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html)*: +this is called during stream consumer registration and deregistration. For each stream this service will be invoked +periodically until the stream consumer is reported `ACTIVE`/`not found` for registration/deregistration. By default, +the `LAZY` registration strategy will scale the number of calls by the job parallelism. `EAGER` will call the service +once per stream for registration, and scale the number of calls by the job parallelism for deregistration. +`NONE` will not invoke this service. Retry and backoff parameters can be configured using the +`ConsumerConfigConstants.DESCRIBE_STREAM_CONSUMER_*` keys. + +- *[RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html)*: +this is called once per stream during stream consumer registration, unless the `NONE` registration strategy is configured. +Retry and backoff parameters can be configured using the `ConsumerConfigConstants.REGISTER_STREAM_*` keys. + +- *[DeregisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DeregisterStreamConsumer.html)*: +this is called once per stream during stream consumer deregistration, unless the `NONE` registration strategy is configured. +Retry and backoff parameters can be configured using the `ConsumerConfigConstants.DEREGISTER_STREAM_*` keys. + +## Kinesis Producer + +The `FlinkKinesisProducer` uses [Kinesis Producer Library (KPL)](http://docs.aws.amazon.com/streams/latest/dev/developing-producers-with-kpl.html) to put data from a Flink stream into a Kinesis stream. + +Note that the producer is not participating in Flink's checkpointing and doesn't provide exactly-once processing guarantees. Also, the Kinesis producer does not guarantee that records are written in order to the shards (See [here](https://github.com/awslabs/amazon-kinesis-producer/issues/23) and [here](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#API_PutRecord_RequestSyntax) for more details). + +In case of a failure or a resharding, data will be written again to Kinesis, leading to duplicates. This behavior is usually called "at-least-once" semantics. + +To put data into a Kinesis stream, make sure the stream is marked as "ACTIVE" in the AWS dashboard. + +For the monitoring to work, the user accessing the stream needs access to the CloudWatch service. 
+ +{{< tabs "6df3b696-c2ca-4f44-bea0-96cf8275d61c" >}} +{{< tab "Java" >}} +```java +Properties producerConfig = new Properties(); +// Required configs +producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id"); +producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key"); +// Optional configs +producerConfig.put("AggregationMaxCount", "4294967295"); +producerConfig.put("CollectionMaxCount", "1000"); +producerConfig.put("RecordTtl", "30000"); +producerConfig.put("RequestTimeout", "6000"); +producerConfig.put("ThreadPoolSize", "15"); + +// Disable Aggregation if it's not supported by a consumer +// producerConfig.put("AggregationEnabled", "false"); +// Switch KinesisProducer's threading model +// producerConfig.put("ThreadingModel", "PER_REQUEST"); + +FlinkKinesisProducer kinesis = new FlinkKinesisProducer<>(new SimpleStringSchema(), producerConfig); +kinesis.setFailOnError(true); +kinesis.setDefaultStream("kinesis_stream_name"); +kinesis.setDefaultPartition("0"); + +DataStream simpleStringStream = ...; +simpleStringStream.addSink(kinesis); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val producerConfig = new Properties() +// Required configs +producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id") +producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key") +// Optional KPL configs +producerConfig.put("AggregationMaxCount", "4294967295") +producerConfig.put("CollectionMaxCount", "1000") +producerConfig.put("RecordTtl", "30000") +producerConfig.put("RequestTimeout", "6000") +producerConfig.put("ThreadPoolSize", "15") + +// Disable Aggregation if it's not supported by a consumer +// producerConfig.put("AggregationEnabled", "false") +// Switch KinesisProducer's threading model +// producerConfig.put("ThreadingModel", "PER_REQUEST") + +val kinesis = new FlinkKinesisProducer[String](new SimpleStringSchema, producerConfig) +kinesis.setFailOnError(true) +kinesis.setDefaultStream("kinesis_stream_name") +kinesis.setDefaultPartition("0") + +val simpleStringStream = ... +simpleStringStream.addSink(kinesis) +``` +{{< /tab >}} +{{< /tabs >}} + +The above is a simple example of using the producer. To initialize `FlinkKinesisProducer`, users are required to pass in `AWS_REGION`, `AWS_ACCESS_KEY_ID`, and `AWS_SECRET_ACCESS_KEY` via a `java.util.Properties` instance. Users can also pass in KPL's configurations as optional parameters to customize the KPL underlying `FlinkKinesisProducer`. The full list of KPL configs and explanations can be found [here](https://github.com/awslabs/amazon-kinesis-producer/blob/master/java/amazon-kinesis-producer-sample/default_config.properties). The example demonstrates producing a single Kinesis stream in the AWS region "us-east-1". + +If users don't specify any KPL configs and values, `FlinkKinesisProducer` will use default config values of KPL, except `RateLimit`. `RateLimit` limits the maximum allowed put rate for a shard, as a percentage of the backend limits. KPL's default value is 150 but it makes KPL throw `RateLimitExceededException` too frequently and breaks Flink sink as a result. Thus `FlinkKinesisProducer` overrides KPL's default value to 100. + +Instead of a `SerializationSchema`, it also supports a `KinesisSerializationSchema`. The `KinesisSerializationSchema` allows to send the data to multiple streams. 
This is +done using the `KinesisSerializationSchema.getTargetStream(T element)` method. Returning `null` there will instruct the producer to write the element to the default stream. +Otherwise, the returned stream name is used. + +### Threading Model + +Since Flink 1.4.0, `FlinkKinesisProducer` switches its default underlying KPL from a one-thread-per-request mode to a thread-pool mode. KPL in thread-pool mode uses a queue and thread pool to execute requests to Kinesis. This limits the number of threads that KPL's native process may create, and therefore greatly lowers CPU utilization and improves efficiency. **Thus, We highly recommend Flink users use thread-pool model.** The default thread pool size is `10`. Users can set the pool size in `java.util.Properties` instance with key `ThreadPoolSize`, as shown in the above example. + +Users can still switch back to one-thread-per-request mode by setting a key-value pair of `ThreadingModel` and `PER_REQUEST` in `java.util.Properties`, as shown in the code commented out in above example. + +### Backpressure + +By default, `FlinkKinesisProducer` does not backpressure. Instead, records that +cannot be sent because of the rate restriction of 1 MB per second per shard are +buffered in an unbounded queue and dropped when their `RecordTtl` expires. + +To avoid data loss, you can enable backpressuring by restricting the size of the +internal queue: + +``` +// 200 Bytes per record, 1 shard +kinesis.setQueueLimit(500); +``` + +The value for `queueLimit` depends on the expected record size. To choose a good +value, consider that Kinesis is rate-limited to 1MB per second per shard. If +less than one second's worth of records is buffered, then the queue may not be +able to operate at full capacity. With the default `RecordMaxBufferedTime` of +100ms, a queue size of 100kB per shard should be sufficient. The `queueLimit` +can then be computed via + +``` +queue limit = (number of shards * queue size per shard) / record size +``` + +E.g. for 200Bytes per record and 8 shards, a queue limit of 4000 is a good +starting point. If the queue size limits throughput (below 1MB per second per +shard), try increasing the queue limit slightly. + + +## Using Custom Kinesis Endpoints + +It is sometimes desirable to have Flink operate as a consumer or producer against a Kinesis VPC endpoint or a non-AWS +Kinesis endpoint such as [Kinesalite](https://github.com/mhart/kinesalite); this is especially useful when performing +functional testing of a Flink application. The AWS endpoint that would normally be inferred by the AWS region set in the +Flink configuration must be overridden via a configuration property. + +To override the AWS endpoint, set the `AWSConfigConstants.AWS_ENDPOINT` and `AWSConfigConstants.AWS_REGION` properties. The region will be used to sign the endpoint URL. 
+ +{{< tabs "bcadd466-8416-4d3c-a6a7-c46eee0cbd4a" >}} +{{< tab "Java" >}} +```java +Properties producerConfig = new Properties(); +producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1"); +producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id"); +producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key"); +producerConfig.put(AWSConfigConstants.AWS_ENDPOINT, "http://localhost:4567"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val producerConfig = new Properties() +producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1") +producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id") +producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key") +producerConfig.put(AWSConfigConstants.AWS_ENDPOINT, "http://localhost:4567") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/nifi.md b/docs/content.zh/docs/connectors/datastream/nifi.md new file mode 100644 index 0000000000000..ffcbbfc715f40 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/nifi.md @@ -0,0 +1,128 @@ +--- +title: NiFi +weight: 8 +type: docs +aliases: + - /zh/dev/connectors/nifi.html +--- + + +# Apache NiFi 连接器 + +[Apache NiFi](https://nifi.apache.org/) 连接器提供了可以读取和写入的 Source 和 Sink。 +使用这个连接器,需要在工程中添加下面的依赖: + +{{< artifact flink-connector-nifi withScalaVersion >}} + +注意这些连接器目前还没有包含在二进制发行版中。添加依赖、打包配置以及集群运行的相关信息请参考 [这里]({{< ref "docs/dev/datastream/project-configuration" >}})。 + +#### 安装 Apache NiFi + +安装 Apache NiFi 集群请参考 [这里](https://nifi.apache.org/docs/nifi-docs/html/administration-guide.html#how-to-install-and-start-nifi)。 + +#### Apache NiFi Source + +该连接器提供了一个 Source 可以用来从 Apache NiFi 读取数据到 Apache Flink。 + +`NiFiSource(…)` 类有两个构造方法。 + +- `NiFiSource(SiteToSiteConfig config)` - 构造一个 `NiFiSource(…)` ,需要指定参数 SiteToSiteConfig ,采用默认的等待时间 1000 ms。 + +- `NiFiSource(SiteToSiteConfig config, long waitTimeMs)` - 构造一个 `NiFiSource(…)`,需要指定参数 SiteToSiteConfig 和等待时间(单位为毫秒)。 + +示例: + +{{< tabs "44ccc35b-83c3-464f-9464-995d4981f4d9" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + +SiteToSiteClientConfig clientConfig = new SiteToSiteClient.Builder() + .url("http://localhost:8080/nifi") + .portName("Data for Flink") + .requestBatchCount(5) + .buildConfig(); + +SourceFunction nifiSource = new NiFiSource(clientConfig); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment() + +val clientConfig: SiteToSiteClientConfig = new SiteToSiteClient.Builder() + .url("http://localhost:8080/nifi") + .portName("Data for Flink") + .requestBatchCount(5) + .buildConfig() + +val nifiSource = new NiFiSource(clientConfig) +``` +{{< /tab >}} +{{< /tabs >}} + +数据从 Apache NiFi Output Port 读取,Apache NiFi Output Port 也被称为 "Data for Flink",是 Apache NiFi Site-to-site 协议配置的一部分。 + +#### Apache NiFi Sink + +该连接器提供了一个 Sink 可以用来把 Apache Flink 的数据写入到 Apache NiFi。 + +`NiFiSink(…)` 类只有一个构造方法。 + +- `NiFiSink(SiteToSiteClientConfig, NiFiDataPacketBuilder)` 构造一个 `NiFiSink(…)`,需要指定 `SiteToSiteConfig` 和 `NiFiDataPacketBuilder` 参数 ,`NiFiDataPacketBuilder` 可以将Flink数据转化成可以被NiFi识别的 `NiFiDataPacket`. 
+ +示例: + +{{< tabs "599dbd31-e2a4-4203-a428-0a4c95c8fd07" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + +SiteToSiteClientConfig clientConfig = new SiteToSiteClient.Builder() + .url("http://localhost:8080/nifi") + .portName("Data from Flink") + .requestBatchCount(5) + .buildConfig(); + +SinkFunction nifiSink = new NiFiSink<>(clientConfig, new NiFiDataPacketBuilder() {...}); + +streamExecEnv.addSink(nifiSink); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment() + +val clientConfig: SiteToSiteClientConfig = new SiteToSiteClient.Builder() + .url("http://localhost:8080/nifi") + .portName("Data from Flink") + .requestBatchCount(5) + .buildConfig() + +val nifiSink: NiFiSink[NiFiDataPacket] = new NiFiSink[NiFiDataPacket](clientConfig, new NiFiDataPacketBuilder() {...}) + +streamExecEnv.addSink(nifiSink) +``` +{{< /tab >}} +{{< /tabs >}} + +更多关于 [Apache NiFi](https://nifi.apache.org) Site-to-Site Protocol 的信息请参考 [这里](https://nifi.apache.org/docs/nifi-docs/html/user-guide.html#site-to-site)。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/overview.md b/docs/content.zh/docs/connectors/datastream/overview.md new file mode 100644 index 0000000000000..cc59b1d1d288f --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/overview.md @@ -0,0 +1,79 @@ +--- +title: 概览 +weight: 1 +type: docs +aliases: + - /zh/dev/connectors/ + - /zh/apis/connectors.html +--- + + +# DataStream Connectors + +## 预定义的 Source 和 Sink + +一些比较基本的 Source 和 Sink 已经内置在 Flink 里。 +[预定义 data sources]({{< ref "docs/dev/datastream/overview" >}}#data-sources) 支持从文件、目录、socket,以及 collections 和 iterators 中读取数据。 +[预定义 data sinks]({{< ref "docs/dev/datastream/overview" >}}#data-sinks) 支持把数据写入文件、标准输出(stdout)、标准错误输出(stderr)和 socket。 + +## 附带的连接器 + +连接器可以和多种多样的第三方系统进行交互。目前支持以下系统: + + * [Apache Kafka](kafka.html) (source/sink) + * [Apache Cassandra](cassandra.html) (sink) + * [Amazon Kinesis Streams](kinesis.html) (source/sink) + * [Elasticsearch](elasticsearch.html) (sink) + * [FileSystem(包括 Hadoop ) - 仅支持流](streamfile_sink.html) (sink) + * [FileSystem(包括 Hadoop ) - 流批统一](file_sink.html) (sink) + * [RabbitMQ](rabbitmq.html) (source/sink) + * [Apache NiFi](nifi.html) (source/sink) + * [Twitter Streaming API](twitter.html) (source) + * [Google PubSub](pubsub.html) (source/sink) + * [JDBC](jdbc.html) (sink) + +请记住,在使用一种连接器时,通常需要额外的第三方组件,比如:数据存储服务器或者消息队列。 +要注意这些列举的连接器是 Flink 工程的一部分,包含在发布的源码中,但是不包含在二进制发行版中。 +更多说明可以参考对应的子部分。 + +## Apache Bahir 中的连接器 + +Flink 还有些一些额外的连接器通过 [Apache Bahir](https://bahir.apache.org/) 发布, 包括: + + * [Apache ActiveMQ](https://bahir.apache.org/docs/flink/current/flink-streaming-activemq/) (source/sink) + * [Apache Flume](https://bahir.apache.org/docs/flink/current/flink-streaming-flume/) (sink) + * [Redis](https://bahir.apache.org/docs/flink/current/flink-streaming-redis/) (sink) + * [Akka](https://bahir.apache.org/docs/flink/current/flink-streaming-akka/) (sink) + * [Netty](https://bahir.apache.org/docs/flink/current/flink-streaming-netty/) (source) + +## 连接Fink的其他方法 + +### 异步 I/O + +使用connector并不是唯一可以使数据进入或者流出Flink的方式。 +一种常见的模式是从外部数据库或者 Web 服务查询数据得到初始数据流,然后通过 `Map` 或者 `FlatMap` 对初始数据流进行丰富和增强。 +Flink 提供了[异步 I/O]({{< ref "docs/dev/datastream/operators/asyncio" >}}) API 来让这个过程更加简单、高效和稳定。 + +### 可查询状态 + +当 Flink 应用程序需要向外部存储推送大量数据时会导致 I/O 瓶颈问题出现。在这种场景下,如果对数据的读操作远少于写操作,那么让外部应用从 Flink 拉取所需的数据会是一种更好的方式。 +[可查询状态]({{< ref 
"docs/dev/datastream/fault-tolerance/queryable_state" >}}) 接口可以实现这个功能,该接口允许被 Flink 托管的状态可以被按需查询。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/pubsub.md b/docs/content.zh/docs/connectors/datastream/pubsub.md new file mode 100644 index 0000000000000..f4194083aeba5 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/pubsub.md @@ -0,0 +1,153 @@ +--- +title: Google Cloud PubSub +weight: 8 +type: docs +aliases: + - /zh/dev/connectors/pubsub.html +--- + + +# Google Cloud PubSub + +这个连接器可向 [Google Cloud PubSub](https://cloud.google.com/pubsub) 读取与写入数据。添加下面的依赖来使用此连接器: + +{{< artifact flink-connector-pubsub withScalaVersion >}} + +

+{{< hint warning >}}
+注意:此连接器最近才加到 Flink 里,还未接受广泛测试。
+{{< /hint >}}
+

    + +注意连接器目前还不是二进制发行版的一部分,添加依赖、打包配置以及集群运行信息请参考[这里]({{< ref "docs/dev/datastream/project-configuration" >}}) + +## Consuming or Producing PubSubMessages + +连接器可以接收和发送 Google PubSub 的信息。和 Google PubSub 一样,这个连接器能够保证`至少一次`的语义。 + +### PubSub SourceFunction + +`PubSubSource` 类的对象由构建类来构建: `PubSubSource.newBuilder(...)` + +有多种可选的方法来创建 PubSubSource,但最低要求是要提供 Google Project、Pubsub 订阅和反序列化 PubSubMessages 的方法。 + +Example: + +{{< tabs "d2d1d8c9-12b6-4ca4-bf32-990c1c63d960" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + +DeserializationSchema deserializer = (...); +SourceFunction pubsubSource = PubSubSource.newBuilder() + .withDeserializationSchema(deserializer) + .withProjectName("project") + .withSubscriptionName("subscription") + .build(); + +streamExecEnv.addSource(source); +``` +{{< /tab >}} +{{< /tabs >}} + +当前还不支持 PubSub 的 source functions [pulls](https://cloud.google.com/pubsub/docs/pull) messages 和 [push endpoints](https://cloud.google.com/pubsub/docs/push)。 + +### PubSub Sink + +`PubSubSink` 类的对象由构建类来构建: `PubSubSink.newBuilder(...)` + +构建类的使用方式与 PubSubSource 类似。 + +Example: + +{{< tabs "2edf4665-456f-4380-8c5f-c5003cadf488" >}} +{{< tab "Java" >}} +```java +DataStream dataStream = (...); + +SerializationSchema serializationSchema = (...); +SinkFunction pubsubSink = PubSubSink.newBuilder() + .withSerializationSchema(serializationSchema) + .withProjectName("project") + .withSubscriptionName("subscription") + .build() + +dataStream.addSink(pubsubSink); +``` +{{< /tab >}} +{{< /tabs >}} + +### Google Credentials + +应用程序需要使用 [Credentials](https://cloud.google.com/docs/authentication/production) 来通过认证和授权才能使用 Google Cloud Platform 的资源,例如 PubSub。 + +上述的两个构建类都允许你提供 Credentials, 但是连接器默认会通过环境变量: [GOOGLE_APPLICATION_CREDENTIALS](https://cloud.google.com/docs/authentication/production#obtaining_and_providing_service_account_credentials_manually) 来获取 Credentials 的路径。 + +如果你想手动提供 Credentials,例如你想从外部系统读取 Credentials,你可以使用 `PubSubSource.newBuilder(...).withCredentials(...)`。 + +### 集成测试 + +在集成测试的时候,如果你不想直接连 PubSub 而是想读取和写入一个 docker container,可以参照 [PubSub testing locally](https://cloud.google.com/pubsub/docs/emulator)。 + +下面的例子展示了如何使用 source 来从仿真器读取信息并发送回去: + +{{< tabs "96e21898-1c58-4b39-a7ab-d0fa278df2ba" >}} +{{< tab "Java" >}} +```java +String hostAndPort = "localhost:1234"; +DeserializationSchema deserializationSchema = (...); +SourceFunction pubsubSource = PubSubSource.newBuilder() + .withDeserializationSchema(deserializationSchema) + .withProjectName("my-fake-project") + .withSubscriptionName("subscription") + .withPubSubSubscriberFactory(new PubSubSubscriberFactoryForEmulator(hostAndPort, "my-fake-project", "subscription", 10, Duration.ofSeconds(15), 100)) + .build(); +SerializationSchema serializationSchema = (...); +SinkFunction pubsubSink = PubSubSink.newBuilder() + .withSerializationSchema(serializationSchema) + .withProjectName("my-fake-project") + .withSubscriptionName("subscription") + .withHostAndPortForEmulator(hostAndPort) + .build(); + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.addSource(pubsubSource) + .addSink(pubsubSink); +``` +{{< /tab >}} +{{< /tabs >}} + +### 至少一次语义保证 + +#### SourceFunction + +有很多原因导致会一个信息会被多次发出,例如 Google PubSub 的故障。 + +另一个可能的原因是超过了确认的截止时间,即收到与确认信息之间的时间间隔。PubSubSource 只有在信息被成功快照之后才会确认以保证至少一次的语义。这意味着,如果你的快照间隔大于信息确认的截止时间,那么你订阅的信息很有可能会被多次处理。 + +因此,我们建议把快照的间隔设置得比信息确认截止时间更短。 + +参照 [PubSub](https://cloud.google.com/pubsub/docs/subscriber) 
来增加信息确认截止时间。 + +注意: `PubSubMessagesProcessedNotAcked` 显示了有多少信息正在等待下一个 checkpoint 还没被确认。 + +#### SinkFunction + +Sink function 会把准备发到 PubSub 的信息短暂地缓存以提高性能。每次 checkpoint 前,它会刷新缓冲区,并且只有当所有信息成功发送到 PubSub 之后,checkpoint 才会成功完成。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/rabbitmq.md b/docs/content.zh/docs/connectors/datastream/rabbitmq.md new file mode 100644 index 0000000000000..d5477ffa0ad2a --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/rabbitmq.md @@ -0,0 +1,182 @@ +--- +title: RabbitMQ +weight: 7 +type: docs +aliases: + - /zh/dev/connectors/rabbitmq.html +--- + + +# RabbitMQ 连接器 + +## RabbitMQ 连接器的许可证 + +Flink 的 RabbitMQ 连接器依赖了 "RabbitMQ AMQP Java Client",它基于三种协议下发行:Mozilla Public License 1.1 ("MPL")、GNU General Public License version 2 ("GPL") 和 Apache License version 2 ("ASL")。 + +Flink 自身既没有复用 "RabbitMQ AMQP Java Client" 的代码,也没有将 "RabbitMQ AMQP Java Client" 打二进制包。 + +如果用户发布的内容是基于 Flink 的 RabbitMQ 连接器的(进而重新发布了 "RabbitMQ AMQP Java Client" ),那么一定要注意这可能会受到 Mozilla Public License 1.1 ("MPL")、GNU General Public License version 2 ("GPL")、Apache License version 2 ("ASL") 协议的限制. + +## RabbitMQ 连接器 + +这个连接器可以访问 [RabbitMQ](http://www.rabbitmq.com/) 的数据流。使用这个连接器,需要在工程里添加下面的依赖: + +{{< artifact flink-connector-rabbitmq withScalaVersion >}} + +注意连接器现在没有包含在二进制发行版中。集群执行的相关信息请参考 [这里]({{< ref "docs/dev/datastream/project-configuration" >}}). + +### 安装 RabbitMQ +安装 RabbitMQ 请参考 [RabbitMQ 下载页面](http://www.rabbitmq.com/download.html)。安装完成之后,服务会自动拉起,应用程序就可以尝试连接到 RabbitMQ 了。 + +### RabbitMQ Source + +`RMQSource` 负责从 RabbitMQ 中消费数据,可以配置三种不同级别的保证: + +1. **精确一次**: 保证精确一次需要以下条件 - + - *开启 checkpointing*: 开启 checkpointing 之后,消息在 checkpoints + 完成之后才会被确认(然后从 RabbitMQ 队列中删除). + - *使用关联标识(Correlation ids)*: 关联标识是 RabbitMQ 的一个特性,消息写入 RabbitMQ 时在消息属性中设置。 + 从 checkpoint 恢复时有些消息可能会被重复处理,source 可以利用关联标识对消息进行去重。 + - *非并发 source*: 为了保证精确一次的数据投递,source 必须是非并发的(并行度设置为1)。 + 这主要是由于 RabbitMQ 分发数据时是从单队列向多个消费者投递消息的。 + +2. **至少一次**: 在 checkpointing 开启的条件下,如果没有使用关联标识或者 source 是并发的, +那么 source 就只能提供至少一次的保证。 + +3. **无任何保证**: 如果没有开启 checkpointing,source 就不能提供任何的数据投递保证。 +使用这种设置时,source 一旦接收到并处理消息,消息就会被自动确认。 + +下面是一个保证 exactly-once 的 RabbitMQ source 示例。 注释部分展示了更加宽松的保证应该如何配置。 + +{{< tabs "62892e62-eb37-4fed-aa0f-64690135b4d6" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +// checkpointing is required for exactly-once or at-least-once guarantees +env.enableCheckpointing(...); + +final RMQConnectionConfig connectionConfig = new RMQConnectionConfig.Builder() + .setHost("localhost") + .setPort(5000) + ... + .build(); + +final DataStream stream = env + .addSource(new RMQSource( + connectionConfig, // config for the RabbitMQ connection + "queueName", // name of the RabbitMQ queue to consume + true, // use correlation ids; can be false if only at-least-once is required + new SimpleStringSchema())) // deserialization schema to turn messages into Java objects + .setParallelism(1); // non-parallel source is only required for exactly-once +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment +// checkpointing is required for exactly-once or at-least-once guarantees +env.enableCheckpointing(...) + +val connectionConfig = new RMQConnectionConfig.Builder() + .setHost("localhost") + .setPort(5000) + ... 
+ .build + +val stream = env + .addSource(new RMQSource[String]( + connectionConfig, // config for the RabbitMQ connection + "queueName", // name of the RabbitMQ queue to consume + true, // use correlation ids; can be false if only at-least-once is required + new SimpleStringSchema)) // deserialization schema to turn messages into Java objects + .setParallelism(1) // non-parallel source is only required for exactly-once +``` +{{< /tab >}} +{{< /tabs >}} + +#### 服务质量 (QoS) / 消费者预取(Consumer Prefetch) + +RabbitMQ Source 通过 `RMQConnectionConfig` 类提供了一种简单的方式,来设置 source channel 上的 `basicQos`(见下方示例)。要注意的是这里的 prefetch count 是对单个 channel 设置的,并且由于每个并发的 source 都持有一个 connection/channel,因此这个值实际上会乘以 source 的并行度,来表示同一时间可以向这个 job 总共发送多少条未确认的消息。如果需要更复杂的配置,可以通过重写 `RMQSource#setupChannel(Connection)` 方法来实现手动配置。 + +{{< tabs "a899561b-bd98-4b65-a3a3-b81ee1f1f677" >}} +{{< tab "Java" >}} +```java +final RMQConnectionConfig connectionConfig = new RMQConnectionConfig.Builder() + .setPrefetchCount(30_000) + ... + .build(); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val connectionConfig = new RMQConnectionConfig.Builder() + .setPrefetchCount(30000) + ... + .build +``` +{{< /tab >}} +{{< /tabs >}} + +RabbitMQ Source 默认情况下是不设置 prefetch count 的,这意味着 RabbitMQ 服务器将会无限制地向 source 发送消息。因此在生产环境中,最好要设置它。当消费海量数据的队列并且启用 checkpointing 时,消息只有在做完 checkpoint 后才会被确认,因此也许需要对 prefetch count 做一些调整来减少不必要的循环。 + +更多关于 QoS 以及 prefetch 相关的内容可以参考 [这里](https://www.rabbitmq.com/confirms.html#channel-qos-prefetch). +更多关于在 AMQP 0-9-1 中可选的选项可以参考 [这里](https://www.rabbitmq.com/consumer-prefetch.html). + +### RabbitMQ Sink +该连接器提供了一个 `RMQSink` 类,用来向 RabbitMQ 队列发送数据。下面是设置 RabbitMQ sink 的代码示例: + +{{< tabs "b8966b60-db25-4853-8382-751bbe1a89c7" >}} +{{< tab "Java" >}} +```java +final DataStream stream = ... + +final RMQConnectionConfig connectionConfig = new RMQConnectionConfig.Builder() + .setHost("localhost") + .setPort(5000) + ... + .build(); + +stream.addSink(new RMQSink( + connectionConfig, // config for the RabbitMQ connection + "queueName", // name of the RabbitMQ queue to send messages to + new SimpleStringSchema())); // serialization schema to turn Java objects to messages +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val stream: DataStream[String] = ... + +val connectionConfig = new RMQConnectionConfig.Builder() + .setHost("localhost") + .setPort(5000) + ... + .build + +stream.addSink(new RMQSink[String]( + connectionConfig, // config for the RabbitMQ connection + "queueName", // name of the RabbitMQ queue to send messages to + new SimpleStringSchema)) // serialization schema to turn Java objects to messages +``` +{{< /tab >}} +{{< /tabs >}} + +更多关于 RabbitMQ 的信息请参考 [这里](http://www.rabbitmq.com/). 
+ +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/streamfile_sink.md b/docs/content.zh/docs/connectors/datastream/streamfile_sink.md new file mode 100644 index 0000000000000..01e62df07dfd3 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/streamfile_sink.md @@ -0,0 +1,738 @@ +--- +title: Streaming File Sink +weight: 6 +type: docs +aliases: + - /zh/dev/connectors/streamfile_sink.html +--- + + +# Streaming File Sink + + + +这个连接器提供了一个 Sink 来将分区文件写入到支持 [Flink `FileSystem`]({{< ref "docs/deployment/filesystems/overview" >}}) 接口的文件系统中。 + +Streaming File Sink 会将数据写入到桶中。由于输入流可能是无界的,因此每个桶中的数据被划分为多个有限大小的文件。如何分桶是可以配置的,默认使用基于时间的分桶策略,这种策略每个小时创建一个新的桶,桶中包含的文件将记录所有该小时内从流中接收到的数据。 + +桶目录中的实际输出数据会被划分为多个部分文件(part file),每一个接收桶数据的 Sink Subtask ,至少包含一个部分文件(part file)。额外的部分文件(part file)将根据滚动策略创建,滚动策略是可以配置的。默认的策略是根据文件大小和超时时间来滚动文件。超时时间指打开文件的最长持续时间,以及文件关闭前的最长非活动时间。 + +{{< hint info >}} +重要: 使用 StreamingFileSink 时需要启用 Checkpoint ,每次做 Checkpoint 时写入完成。如果 Checkpoint 被禁用,部分文件(part file)将永远处于 'in-progress' 或 'pending' 状态,下游系统无法安全地读取。 +{{< /hint >}} + +{{< img src="/fig/streamfilesink_bucketing.png" >}} + +## 文件格式 + + `StreamingFileSink` 支持行编码格式和批量编码格式,比如 [Apache Parquet](http://parquet.apache.org) 。 +这两种变体随附了各自的构建器,可以使用以下静态方法创建: + + - Row-encoded sink: `StreamingFileSink.forRowFormat(basePath, rowEncoder)` + - Bulk-encoded sink: `StreamingFileSink.forBulkFormat(basePath, bulkWriterFactory)` + +创建行或批量编码的 Sink 时,我们需要指定存储桶的基本路径和数据的编码逻辑。 + +更多配置操作以及不同数据格式的实现请参考 `StreamingFileSink` + +### 行编码格式 + +行编码格式需要指定一个 `Encoder` 。Encoder 负责为每个处于 In-progress 状态文件的`OutputStream` 序列化数据。 + +除了桶分配器之外,`RowFormatBuilder` 还允许用户指定: + + - Custom `RollingPolicy` :自定义滚动策略以覆盖默认的 DefaultRollingPolicy + - bucketCheckInterval (默认为1分钟):毫秒间隔,用于基于时间的滚动策略。 + +字符串元素写入示例: + + +{{< tabs "804d0538-5382-4b74-b389-8ab1403c804c" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.serialization.SimpleStringEncoder; +import org.apache.flink.core.fs.Path; +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy; + +DataStream input = ...; + +final StreamingFileSink sink = StreamingFileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder("UTF-8")) + .withRollingPolicy( + DefaultRollingPolicy.builder() + .withRolloverInterval(TimeUnit.MINUTES.toMillis(15)) + .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) + .withMaxPartSize(1024 * 1024 * 1024) + .build()) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.serialization.SimpleStringEncoder +import org.apache.flink.core.fs.Path +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy + +val input: DataStream[String] = ... 
+ +val sink: StreamingFileSink[String] = StreamingFileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder[String]("UTF-8")) + .withRollingPolicy( + DefaultRollingPolicy.builder() + .withRolloverInterval(TimeUnit.MINUTES.toMillis(15)) + .withInactivityInterval(TimeUnit.MINUTES.toMillis(5)) + .withMaxPartSize(1024 * 1024 * 1024) + .build()) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +这个例子创建了一个简单的 Sink ,将记录分配给默认的一小时时间桶。它还指定了一个滚动策略,该策略在以下三种情况下滚动处于 In-progress 状态的部分文件(part file): + + - 它至少包含 15 分钟的数据 + - 最近 5 分钟没有收到新的记录 + - 文件大小达到 1GB (写入最后一条记录后) + +### 批量编码格式 + +批量编码 Sink 的创建与行编码 Sink 相似,不过在这里我们不是指定编码器 `Encoder` 而是指定 BulkWriter.`Factory` 。 +`BulkWriter` 定义了如何添加、刷新元素,以及如何批量编码。 + +Flink 有四个内置的 BulkWriter Factory : + + - `ParquetWriterFactory` + - `AvroWriterFactory` + - `SequenceFileWriterFactory` + - `CompressWriterFactory` + - `OrcBulkWriterFactory` + +{{< hint info >}} +重要: 批量编码模式仅支持 OnCheckpointRollingPolicy 策略, 在每次 checkpoint 的时候切割文件。 +{{< /hint >}} + +#### Parquet 格式 + +Flink 包含为不同 Avro 类型,创建 ParquetWriterFactory 的便捷方法,更多信息请参考 `ParquetAvroWriters` 。 + +要编写其他 Parquet 兼容的数据格式,用户需要创建 ParquetWriterFactory 并实现 `ParquetBuilder` 接口。 + +在应用中使用 Parquet 批量编码器,你需要添加以下依赖: + +{{< artifact flink-parquet withScalaVersion >}} + +这个例子使用 StreamingFileSink 将 Avro 数据写入 Parquet 格式: + +{{< tabs "7f839bbf-4d61-48ef-81a6-5649d53fcfae" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.formats.parquet.avro.ParquetAvroWriters; +import org.apache.avro.Schema; + + +Schema schema = ...; +DataStream input = ...; + +final StreamingFileSink sink = StreamingFileSink + .forBulkFormat(outputBasePath, ParquetAvroWriters.forGenericRecord(schema)) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.formats.parquet.avro.ParquetAvroWriters +import org.apache.avro.Schema + +val schema: Schema = ... +val input: DataStream[GenericRecord] = ... + +val sink: StreamingFileSink[GenericRecord] = StreamingFileSink + .forBulkFormat(outputBasePath, ParquetAvroWriters.forGenericRecord(schema)) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +类似的,将 Protobuf 数据写入到 Parquet 格式可以通过: + +{{< tabs "6207391f-278a-4eed-91aa-3112a2934e54" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.formats.parquet.protobuf.ParquetProtoWriters; + +// ProtoRecord is a generated protobuf Message class. +DataStream input = ...; + +final StreamingFileSink sink = StreamingFileSink + .forBulkFormat(outputBasePath, ParquetProtoWriters.forType(ProtoRecord.class)) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.formats.parquet.protobuf.ParquetProtoWriters + +// ProtoRecord is a generated protobuf Message class. +val input: DataStream[ProtoRecord] = ... + +val sink: StreamingFileSink[ProtoRecord] = StreamingFileSink + .forBulkFormat(outputBasePath, ParquetProtoWriters.forType(classOf[ProtoRecord])) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Avro格式 + +Flink 也提供了将数据写入 Avro 文件的内置支持。对于创建 AvroWriterFactory 的快捷方法,更多信息可以参考 +`AvroWriters`. 
+ +使用Avro相关的Writer需要在项目中添加以下依赖: + +{{< artifact flink-avro >}} + +将数据写入 Avro 文件的 StreamingFileSink 算子可以通过如下方式创建: + +{{< tabs "2df2f4da-8346-4ce3-bb4c-bcca28b29811" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.formats.avro.AvroWriters; +import org.apache.avro.Schema; + + +Schema schema = ...; +DataStream input = ...; + +final StreamingFileSink sink = StreamingFileSink + .forBulkFormat(outputBasePath, AvroWriters.forGenericRecord(schema)) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.formats.avro.AvroWriters +import org.apache.avro.Schema + +val schema: Schema = ... +val input: DataStream[GenericRecord] = ... + +val sink: StreamingFileSink[GenericRecord] = StreamingFileSink + .forBulkFormat(outputBasePath, AvroWriters.forGenericRecord(schema)) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +如果想要创建自定义的 Avro Writer,例如启用压缩等,用户可以实现 `AvroBuilder` +接口并自行创建一个 `AvroWriterFactory` 实例: + +{{< tabs "bc3ef729-afac-4ca7-ae2e-85368368a61f" >}} +{{< tab "Java" >}} +```java +AvroWriterFactory factory = new AvroWriterFactory<>((AvroBuilder
<Address>) out -> {
+    Schema schema = ReflectData.get().getSchema(Address.class);
+    DatumWriter<Address> datumWriter = new ReflectDatumWriter<>(schema);
+
+    DataFileWriter<Address> dataFileWriter = new DataFileWriter<>(datumWriter);
+    dataFileWriter.setCodec(CodecFactory.snappyCodec());
+    dataFileWriter.create(schema, out);
+    return dataFileWriter;
+});
+
+DataStream<Address>
    stream = ... +stream.addSink(StreamingFileSink.forBulkFormat( + outputBasePath, + factory).build()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val factory = new AvroWriterFactory[Address](new AvroBuilder[Address]() { + override def createWriter(out: OutputStream): DataFileWriter[Address] = { + val schema = ReflectData.get.getSchema(classOf[Address]) + val datumWriter = new ReflectDatumWriter[Address](schema) + + val dataFileWriter = new DataFileWriter[Address](datumWriter) + dataFileWriter.setCodec(CodecFactory.snappyCodec) + dataFileWriter.create(schema, out) + dataFileWriter + } +}) + +val stream: DataStream[Address] = ... +stream.addSink(StreamingFileSink.forBulkFormat( + outputBasePath, + factory).build()); +``` +{{< /tab >}} +{{< /tabs >}} + +#### ORC Format + +To enable the data to be bulk encoded in ORC format, Flink offers `OrcBulkWriterFactory` +which takes a concrete implementation of `Vectorizer`. + +Like any other columnar format that encodes data in bulk fashion, Flink's `OrcBulkWriter` writes the input elements in batches. It uses +ORC's `VectorizedRowBatch` to achieve this. + +Since the input element has to be transformed to a `VectorizedRowBatch`, users have to extend the abstract `Vectorizer` +class and override the `vectorize(T element, VectorizedRowBatch batch)` method. As you can see, the method provides an +instance of `VectorizedRowBatch` to be used directly by the users so users just have to write the logic to transform the +input `element` to `ColumnVectors` and set them in the provided `VectorizedRowBatch` instance. + +For example, if the input element is of type `Person` which looks like: + +{{< tabs "a49d0a8c-1cd6-458a-a5a1-0ed645a1139d" >}} +{{< tab "Java" >}} +```java + +class Person { + private final String name; + private final int age; + ... 
+} + +``` +{{< /tab >}} +{{< /tabs >}} + +Then a child implementation to convert the element of type `Person` and set them in the `VectorizedRowBatch` can be like: + +{{< tabs "7198abfc-97cf-4a81-8100-1dfe233d5608" >}} +{{< tab "Java" >}} +```java +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; + +import java.io.IOException; +import java.io.Serializable; +import java.nio.charset.StandardCharsets; + +public class PersonVectorizer extends Vectorizer implements Serializable { + public PersonVectorizer(String schema) { + super(schema); + } + @Override + public void vectorize(Person element, VectorizedRowBatch batch) throws IOException { + BytesColumnVector nameColVector = (BytesColumnVector) batch.cols[0]; + LongColumnVector ageColVector = (LongColumnVector) batch.cols[1]; + int row = batch.size++; + nameColVector.setVal(row, element.getName().getBytes(StandardCharsets.UTF_8)); + ageColVector.vector[row] = element.getAge(); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.nio.charset.StandardCharsets +import org.apache.hadoop.hive.ql.exec.vector.{BytesColumnVector, LongColumnVector} + +class PersonVectorizer(schema: String) extends Vectorizer[Person](schema) { + + override def vectorize(element: Person, batch: VectorizedRowBatch): Unit = { + val nameColVector = batch.cols(0).asInstanceOf[BytesColumnVector] + val ageColVector = batch.cols(1).asInstanceOf[LongColumnVector] + nameColVector.setVal(batch.size + 1, element.getName.getBytes(StandardCharsets.UTF_8)) + ageColVector.vector(batch.size + 1) = element.getAge + } + +} + +``` +{{< /tab >}} +{{< /tabs >}} + +To use the ORC bulk encoder in an application, users need to add the following dependency: + +{{< artifact flink-orc withScalaVersion >}} + +And then a `StreamingFileSink` that writes data in ORC format can be created like this: + +{{< tabs "fa7db9c6-8ad4-4cbd-82f8-9ec94f3371e6" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.orc.writer.OrcBulkWriterFactory; + +String schema = "struct<_col0:string,_col1:int>"; +DataStream input = ...; + +final OrcBulkWriterFactory writerFactory = new OrcBulkWriterFactory<>(new PersonVectorizer(schema)); + +final StreamingFileSink sink = StreamingFileSink + .forBulkFormat(outputBasePath, writerFactory) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.orc.writer.OrcBulkWriterFactory + +val schema: String = "struct<_col0:string,_col1:int>" +val input: DataStream[Person] = ... +val writerFactory = new OrcBulkWriterFactory(new PersonVectorizer(schema)); + +val sink: StreamingFileSink[Person] = StreamingFileSink + .forBulkFormat(outputBasePath, writerFactory) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +OrcBulkWriterFactory can also take Hadoop `Configuration` and `Properties` so that a custom Hadoop configuration and ORC +writer properties can be provided. + +{{< tabs "97463c6c-ceeb-42c6-a281-f784d9cbafc6" >}} +{{< tab "Java" >}} +```java +String schema = ...; +Configuration conf = ...; +Properties writerProperties = new Properties(); + +writerProps.setProperty("orc.compress", "LZ4"); +// Other ORC supported properties can also be set similarly. 
+ +final OrcBulkWriterFactory writerFactory = new OrcBulkWriterFactory<>( + new PersonVectorizer(schema), writerProperties, conf); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val schema: String = ... +val conf: Configuration = ... +val writerProperties: Properties = new Properties() + +writerProps.setProperty("orc.compress", "LZ4") +// Other ORC supported properties can also be set similarly. + +val writerFactory = new OrcBulkWriterFactory( + new PersonVectorizer(schema), writerProperties, conf) +``` +{{< /tab >}} +{{< /tabs >}} + +The complete list of ORC writer properties can be found [here](https://orc.apache.org/docs/hive-config.html). + +Users who want to add user metadata to the ORC files can do so by calling `addUserMetadata(...)` inside the overriding +`vectorize(...)` method. + +{{< tabs "959c9327-80a3-4ef3-910a-e069b046f6d5" >}} +{{< tab "Java" >}} +```java + +public class PersonVectorizer extends Vectorizer implements Serializable { + @Override + public void vectorize(Person element, VectorizedRowBatch batch) throws IOException { + ... + String metadataKey = ...; + ByteBuffer metadataValue = ...; + this.addUserMetadata(metadataKey, metadataValue); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +class PersonVectorizer(schema: String) extends Vectorizer[Person](schema) { + + override def vectorize(element: Person, batch: VectorizedRowBatch): Unit = { + ... + val metadataKey: String = ... + val metadataValue: ByteBuffer = ... + addUserMetadata(metadataKey, metadataValue) + } + +} + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Hadoop SequenceFile 格式 + +在应用中使用 SequenceFile 批量编码器,你需要添加以下依赖: + +{{< artifact flink-sequence-file withScalaVersion >}} + +简单的 SequenceFile 写入示例: + +{{< tabs "466b0dac-1a2d-4472-8494-11764ef0a577" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; +import org.apache.flink.configuration.GlobalConfiguration; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; + + +DataStream> input = ...; +Configuration hadoopConf = HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()); +final StreamingFileSink> sink = StreamingFileSink + .forBulkFormat( + outputBasePath, + new SequenceFileWriterFactory<>(hadoopConf, LongWritable.class, Text.class)) + .build(); + +input.addSink(sink); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink +import org.apache.flink.configuration.GlobalConfiguration +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.io.LongWritable +import org.apache.hadoop.io.SequenceFile +import org.apache.hadoop.io.Text; + +val input: DataStream[(LongWritable, Text)] = ... 
+val hadoopConf: Configuration = HadoopUtils.getHadoopConfiguration(GlobalConfiguration.loadConfiguration()) +val sink: StreamingFileSink[(LongWritable, Text)] = StreamingFileSink + .forBulkFormat( + outputBasePath, + new SequenceFileWriterFactory(hadoopConf, LongWritable.class, Text.class)) + .build() + +input.addSink(sink) + +``` +{{< /tab >}} +{{< /tabs >}} + +SequenceFileWriterFactory 支持附加构造函数参数指定压缩设置。 + +## 桶分配 + +桶分配逻辑定义了如何将数据结构化为基本输出目录中的子目录 + +行格式和批量格式都使用 `DateTimeBucketAssigner` 作为默认的分配器。 +默认情况下,DateTimeBucketAssigner 基于系统默认时区每小时创建一个桶,格式如下: `yyyy-MM-dd--HH` 。日期格式(即桶的大小)和时区都可以手动配置。 + +我们可以在格式构建器上调用 `.withBucketAssigner(assigner)` 来自定义 `BucketAssigner` 。 + +Flink 有两个内置的 BucketAssigners : + + - `DateTimeBucketAssigner` :默认基于时间的分配器 + - `BasePathBucketAssigner` :将所有部分文件(part file)存储在基本路径中的分配器(单个全局桶) + +## 滚动策略 + +滚动策略 `RollingPolicy` 定义了指定的文件在何时关闭(closed)并将其变为 Pending 状态,随后变为 Finished 状态。处于 Pending 状态的文件会在下一次 Checkpoint 时变为 Finished 状态,通过设置 Checkpoint 间隔时间,可以控制部分文件(part file)对下游读取者可用的速度、大小和数量。 + +Flink 有两个内置的滚动策略: + + - `DefaultRollingPolicy` + - `OnCheckpointRollingPolicy` + +## 部分文件(part file) 生命周期 + +为了在下游系统中使用 StreamingFileSink 的输出,我们需要了解输出文件的命名规则和生命周期。 + +部分文件(part file)可以处于以下三种状态之一: + 1. **In-progress** :当前文件正在写入中 + 2. **Pending** :当处于 In-progress 状态的文件关闭(closed)了,就变为 Pending 状态 + 3. **Finished** :在成功的 Checkpoint 后,Pending 状态将变为 Finished 状态 + +处于 Finished 状态的文件不会再被修改,可以被下游系统安全地读取。 + +
    + 重要: 部分文件的索引在每个 subtask 内部是严格递增的(按文件创建顺序)。但是索引并不总是连续的。当 Job 重启后,所有部分文件的索引从 `max part index + 1` 开始, + 这里的 `max part index` 是所有 subtask 中索引的最大值。 +
    + +对于每个活动的桶,Writer 在任何时候都只有一个处于 In-progress 状态的部分文件(part file),但是可能有几个 Penging 和 Finished 状态的部分文件(part file)。 + +**部分文件(part file)例子** + +为了更好地理解这些文件的生命周期,让我们来看一个包含 2 个 Sink Subtask 的简单例子: + +``` +└── 2019-08-25--12 + ├── part-0-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + └── part-1-0.inprogress.ea65a428-a1d0-4a0b-bbc5-7a436a75e575 +``` + +当部分文件 `part-1-0` 被滚动(假设它变得太大了)时,它将成为 Pending 状态,但是它还没有被重命名。然后 Sink 会创建一个新的部分文件: `part-1-1`: + +``` +└── 2019-08-25--12 + ├── part-0-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-1-0.inprogress.ea65a428-a1d0-4a0b-bbc5-7a436a75e575 + └── part-1-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + + `part-1-0` 现在处于 Pending 状态等待完成,在下一次成功的 Checkpoint 后,它会变成 Finished 状态: + +``` +└── 2019-08-25--12 + ├── part-0-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-1-0 + └── part-1-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + +根据分桶策略创建新的桶,但是这并不会影响当前处于 In-progress 状态的文件: + +``` +└── 2019-08-25--12 + ├── part-0-0.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── part-1-0 + └── part-1-1.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +└── 2019-08-25--13 + └── part-0-2.inprogress.2b475fec-1482-4dea-9946-eb4353b475f1 +``` + +因为分桶策略基于每条记录进行评估,所以旧桶仍然可以接受新的记录。 + +### 部分文件的配置项 + +已经完成的文件和进行中的文件仅能通过文件名格式进行区分。 + +默认情况下,文件命名格式如下所示: + - **In-progress / Pending:** `part--.inprogress.uid` + - **FINISHED:** `part--` + +Flink 允许用户通过 `OutputFileConfig` 指定部分文件名的前缀和后缀。 +举例来说,前缀设置为 "prefix" 以及后缀设置为 ".ext" 之后,Sink 创建的文件名如下所示: + +``` +└── 2019-08-25--12 + ├── prefix-0-0.ext + ├── prefix-0-1.ext.inprogress.bd053eb0-5ecf-4c85-8433-9eff486ac334 + ├── prefix-1-0.ext + └── prefix-1-1.ext.inprogress.bc279efe-b16f-47d8-b828-00ef6e2fbd11 +``` + +用户可以通过如下方式设置 `OutputFileConfig`: + +{{< tabs "4ee26cce-7551-4a28-bc76-f8ba0e71ba80" >}} +{{< tab "Java" >}} +```java + +OutputFileConfig config = OutputFileConfig + .builder() + .withPartPrefix("prefix") + .withPartSuffix(".ext") + .build(); + +StreamingFileSink> sink = StreamingFileSink + .forRowFormat((new Path(outputPath), new SimpleStringEncoder<>("UTF-8")) + .withBucketAssigner(new KeyBucketAssigner()) + .withRollingPolicy(OnCheckpointRollingPolicy.build()) + .withOutputFileConfig(config) + .build(); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val config = OutputFileConfig + .builder() + .withPartPrefix("prefix") + .withPartSuffix(".ext") + .build() + +val sink = StreamingFileSink + .forRowFormat(new Path(outputPath), new SimpleStringEncoder[String]("UTF-8")) + .withBucketAssigner(new KeyBucketAssigner()) + .withRollingPolicy(OnCheckpointRollingPolicy.build()) + .withOutputFileConfig(config) + .build() + +``` +{{< /tab >}} +{{< /tabs >}} + +## 重要注意事项 + +### 通用注意事项 + +重要提示 1: 使用 Hadoop < 2.7 时,请使用 `OnCheckpointRollingPolicy` 滚动策略,该策略会在每次检查点时进行文件切割。 +这样做的原因是如果部分文件的生命周期跨多个检查点,当 `StreamingFileSink` 从之前的检查点进行恢复时会调用文件系统的 `truncate()` 方法清理 in-progress 文件中未提交的数据。 +Hadoop 2.7 之前的版本不支持这个方法,因此 Flink 会报异常。 + +重要提示 2: 鉴于 Flink 的 sink 以及 UDF 通常不会区分作业的正常结束(比如有限流)和异常终止,因此正常结束作业的最后一批 in-progress 文件不会被转换到 "完成" 状态。 + +重要提示 3: Flink 以及 `StreamingFileSink` 不会覆盖已经提交的数据。因此如果尝试从一个包含 in-progress 文件的旧 checkpoint/savepoint 恢复, +且这些 in-progress 文件会被接下来的成功 checkpoint 提交,Flink 会因为无法找到 in-progress 文件而抛异常,从而恢复失败。 + +重要提示 4: 目前 `StreamingFileSink` 只支持三种文件系统: HDFS、S3和Local。如果配置了不支持的文件系统,在执行的时候 Flink 会抛出异常。 + +### S3 特有的注意事项 + +重要提示 1: 对于 S3,`StreamingFileSink` 只支持基于 [Hadoop](https://hadoop.apache.org/) +的文件系统实现,不支持基于 [Presto](https://prestodb.io/) 的实现。如果想使用 `StreamingFileSink` 向 S3 写入数据并且将 +checkpoint 放在基于 Presto 
的文件系统,建议明确指定 *"s3a://"* (for Hadoop)作为sink的目标路径方案,并且为 checkpoint 路径明确指定 *"s3p://"* (for Presto)。 +如果 Sink 和 checkpoint 都使用 *"s3://"* 路径的话,可能会导致不可预知的行为,因为双方的实现都在“监听”这个路径。 + +重要提示 2: `StreamingFileSink` 使用 S3 的 [Multi-part Upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/mpuoverview.html) +(后续使用MPU代替)特性可以保证精确一次的语义。这个特性支持以独立的块(因此被称为"multi-part")模式上传文件,当 MPU 的所有部分文件 +成功上传之后,可以合并成原始文件。对于失效的 MPUs,S3 提供了一个基于桶生命周期的规则,用户可以用这个规则来丢弃在指定时间内未完成的MPU。 +如果在一些部分文件还未上传时触发 savepoint,并且这个规则设置的比较严格,这意味着相关的 MPU在作业重启之前可能会超时。后续的部分文件没 +有写入到 savepoint, 那么在 Flink 作业从 savepoint 恢复时,会因为拿不到缺失的部分文件,导致任务失败并抛出异常。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/datastream/twitter.md b/docs/content.zh/docs/connectors/datastream/twitter.md new file mode 100644 index 0000000000000..171dbc400b3e1 --- /dev/null +++ b/docs/content.zh/docs/connectors/datastream/twitter.md @@ -0,0 +1,80 @@ +--- +title: Twitter +weight: 9 +type: docs +aliases: + - /zh/dev/connectors/twitter.html +--- + + +# Twitter 连接器 + +[Twitter Streaming API](https://dev.twitter.com/docs/streaming-apis) 提供了访问 Twitter 的 tweets 流的能力。 +Flink Streaming 通过一个内置的 `TwitterSource` 类来创建到 tweets 流的连接。 +使用 Twitter 连接器,需要在工程中添加下面的依赖: + +{{< artifact flink-connector-twitter withScalaVersion >}} + +注意:当前的二进制发行版还没有这些连接器。集群执行请参考[这里]({{< ref "docs/dev/datastream/project-configuration" >}}). + +#### 认证 +使用 Twitter 流,用户需要先注册自己的程序,获取认证相关的必要信息。过程如下: + +#### 获取认证信息 +首先,需要一个 Twitter 账号。可以通过 [twitter.com/signup](https://twitter.com/signup) 免费注册, +或者在 Twitter 的 [Application Management](https://apps.twitter.com/) 登录,然后点击 "Create New App" + 按钮来注册应用,填写应用程序相关表格并且接受条款。选择应用程序之后,可以在 "API Keys" 标签页看到 API key 和 + API secret(对应于`TwitterSource`中的`twitter-source.consumerKey` 和 `twitter-source.consumerSecret` )。 +请保管好这些信息并且不要将其发布到public的仓库。 + + +#### 使用 +和其他的连接器不同的是,`TwitterSource` 没有任何其他依赖。下面的示例代码就可以优雅的运行: + +{{< tabs "976b5108-dc39-4108-ab77-d920b42f74d3" >}} +{{< tab "Java" >}} +```java +Properties props = new Properties(); +props.setProperty(TwitterSource.CONSUMER_KEY, ""); +props.setProperty(TwitterSource.CONSUMER_SECRET, ""); +props.setProperty(TwitterSource.TOKEN, ""); +props.setProperty(TwitterSource.TOKEN_SECRET, ""); +DataStream streamSource = env.addSource(new TwitterSource(props)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val props = new Properties() +props.setProperty(TwitterSource.CONSUMER_KEY, "") +props.setProperty(TwitterSource.CONSUMER_SECRET, "") +props.setProperty(TwitterSource.TOKEN, "") +props.setProperty(TwitterSource.TOKEN_SECRET, "") +val streamSource = env.addSource(new TwitterSource(props)) +``` +{{< /tab >}} +{{< /tabs >}} + +`TwitterSource` 会发出包含了JSON object的字符串,这样的字符串表示一个Tweet. 
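+
+下面是一个把 `TwitterSource` 输出的 JSON 字符串解析成推文正文的简单示意(仅为草图:假设项目中已引入 Jackson 依赖,`text` 字段名以 Twitter 实际返回的 JSON 为准,无法解析的消息直接丢弃):
+
+```java
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.util.Collector;
+
+// 将 TwitterSource 发出的 JSON 字符串解析为推文正文
+public class TweetTextExtractor implements FlatMapFunction<String, String> {
+
+    private transient ObjectMapper mapper;
+
+    @Override
+    public void flatMap(String value, Collector<String> out) {
+        if (mapper == null) {
+            mapper = new ObjectMapper();
+        }
+        try {
+            JsonNode tweet = mapper.readTree(value);
+            if (tweet.has("text")) {
+                out.collect(tweet.get("text").asText());
+            }
+        } catch (Exception e) {
+            // 忽略无法解析的消息
+        }
+    }
+}
+
+// 用法示意:streamSource.flatMap(new TweetTextExtractor());
+```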
+ +`flink-examples-streaming` 中的 `TwitterExample` 类是使用 `TwitterSource` 的完整示范。 + +`TwitterSource` 默认使用 `StatusesSampleEndpoint`。`StatusesSampleEndpoint` 会返回一个 Tweets 的随机抽样。用户可以通过实现 `TwitterSource.EndpointInitializer` 接口来自定义 endpoint。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/_index.md b/docs/content.zh/docs/connectors/table/_index.md new file mode 100644 index 0000000000000..4c3be5788e0ec --- /dev/null +++ b/docs/content.zh/docs/connectors/table/_index.md @@ -0,0 +1,23 @@ +--- +title: Table API Connectors +bookCollapseSection: true +weight: 2 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/connectors/table/blackhole.md b/docs/content.zh/docs/connectors/table/blackhole.md new file mode 100644 index 0000000000000..a96cbdf761f1d --- /dev/null +++ b/docs/content.zh/docs/connectors/table/blackhole.md @@ -0,0 +1,85 @@ +--- +title: BlackHole +weight: 15 +type: docs +aliases: + - /zh/dev/table/connectors/blackhole.html +--- + + +# BlackHole SQL 连接器 + +{{< label "Sink: Bounded" >}} +{{< label "Sink: UnBounded" >}} + +BlackHole 连接器允许接收所有输入记录。它被设计用于: + +- 高性能测试。 +- UDF 输出,而不是实质性 sink。 + +就像类 Unix 操作系统上的 /dev/null。 + +BlackHole 连接器是内置的。 + +如何创建 BlackHole 表 +---------------- + +```sql +CREATE TABLE blackhole_table ( + f0 INT, + f1 INT, + f2 STRING, + f3 DOUBLE +) WITH ( + 'connector' = 'blackhole' +); +``` + + +或者,可以基于现有模式使用 [LIKE 子句]({{< ref "docs/dev/table/sql/create" >}}#create-table) 创建。 + +```sql +CREATE TABLE blackhole_table WITH ('connector' = 'blackhole') +LIKE source_table (EXCLUDING ALL) +``` + +连接器选项 +---------------- + + + + + + + + + + + + + + + + + + + + +
+| 选项 | 是否必要 | 默认值 | 类型 | 描述 |
+| ---- | -------- | ------ | ---- | ---- |
+| connector | 必要 | (none) | String | 指定需要使用的连接器,此处应为 'blackhole'。 |
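+
+建好表之后,向 BlackHole 表写入数据的方式与普通 sink 相同。下面是一个简单的写入示意(假设 `source_table` 中有同名且类型兼容的字段):
+
+```sql
+INSERT INTO blackhole_table
+SELECT f0, f1, f2, f3 FROM source_table;
+```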
    diff --git a/docs/content.zh/docs/connectors/table/datagen.md b/docs/content.zh/docs/connectors/table/datagen.md new file mode 100644 index 0000000000000..287686faa74a7 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/datagen.md @@ -0,0 +1,149 @@ +--- +title: DataGen +weight: 13 +type: docs +aliases: + - /zh/dev/table/connectors/datagen.html +--- + + +# DataGen SQL 连接器 + +{{< label "Scan Source: 有界" >}} +{{< label "Scan Source: 无界" >}} + +DataGen 连接器允许按数据生成规则进行读取。 + +DataGen 连接器可以使用[计算列语法]({{< ref "docs/dev/table/sql/create" >}}#create-table)。 +这使您可以灵活地生成记录。 + +DataGen 连接器是内置的。 + +注意 不支持复杂类型: Array,Map,Row。 请用计算列构造这些类型。 + +怎么创建一个 DataGen 的表 +---------------- + +表的有界性:当表中字段的数据全部生成完成后,source 就结束了。 因此,表的有界性取决于字段的有界性。 + +每个列,都有两种生成数据的方法: + +- 随机生成器是默认的生成器,您可以指定随机生成的最大和最小值。char、varchar、string (类型)可以指定长度。它是无界的生成器。 + +- 序列生成器,您可以指定序列的起始和结束值。它是有界的生成器,当序列数字达到结束值,读取结束。 + +```sql +CREATE TABLE datagen ( + f_sequence INT, + f_random INT, + f_random_str STRING, + ts AS localtimestamp, + WATERMARK FOR ts AS ts +) WITH ( + 'connector' = 'datagen', + + -- optional options -- + + 'rows-per-second'='5', + + 'fields.f_sequence.kind'='sequence', + 'fields.f_sequence.start'='1', + 'fields.f_sequence.end'='1000', + + 'fields.f_random.min'='1', + 'fields.f_random.max'='1000', + + 'fields.f_random_str.length'='10' +) +``` + + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 参数 | 是否必选 | 默认值 | 数据类型 | 描述 |
+| ---- | -------- | ------ | -------- | ---- |
+| connector | 必须 | (none) | String | 指定要使用的连接器,这里是 'datagen'。 |
+| rows-per-second | 可选 | 10000 | Long | 每秒生成的行数,用以控制数据发出速率。 |
+| fields.#.kind | 可选 | random | String | 指定 '#' 字段的生成器。可以是 'sequence' 或 'random'。 |
+| fields.#.min | 可选 | (Minimum value of type) | (Type of field) | 随机生成器的最小值,适用于数字类型。 |
+| fields.#.max | 可选 | (Maximum value of type) | (Type of field) | 随机生成器的最大值,适用于数字类型。 |
+| fields.#.length | 可选 | 100 | Integer | 随机生成器生成字符的长度,适用于 char、varchar、string。 |
+| fields.#.start | 可选 | (none) | (Type of field) | 序列生成器的起始值。 |
+| fields.#.end | 可选 | (none) | (Type of field) | 序列生成器的结束值。 |
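+
+在测试时经常需要为一张已有的表生成模拟数据,这时可以结合 [LIKE 子句]({{< ref "docs/dev/table/sql/create" >}}#create-table) 使用 DataGen 连接器。下面是一个示意(假设 `Orders` 是一张已有的表,且只包含 DataGen 支持的字段类型):
+
+```sql
+CREATE TEMPORARY TABLE GenOrders
+WITH (
+    'connector' = 'datagen',
+    'rows-per-second' = '10'
+)
+LIKE Orders (EXCLUDING ALL);
+```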
    diff --git a/docs/content.zh/docs/connectors/table/downloads.md b/docs/content.zh/docs/connectors/table/downloads.md new file mode 100644 index 0000000000000..6409dd13942ed --- /dev/null +++ b/docs/content.zh/docs/connectors/table/downloads.md @@ -0,0 +1,48 @@ +--- +title: 下载页面 +weight: 100 +type: docs +bookToc: false +aliases: + - /dev/table/connectors/downloads.html +--- + + +# SQL Connectors 下载页面 + +{{< unstable >}} +{{< hint info >}} +Download links are available only for stable releases. +{{< /hint >}} +{{< /unstable >}} + +The page contains links to optional sql-client connectors and formats that are not part of the binary distribution. + +# 可选的 SQL formats +------------------- + +{{< sql_optional_formats >}} + +# 可选的 SQL 连接器 +------------------- + +{{< sql_optional_connectors >}} + + diff --git a/docs/content.zh/docs/connectors/table/elasticsearch.md b/docs/content.zh/docs/connectors/table/elasticsearch.md new file mode 100644 index 0000000000000..0d4f06fd79ec3 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/elasticsearch.md @@ -0,0 +1,266 @@ +--- +title: Elasticsearch +weight: 7 +type: docs +aliases: + - /zh/dev/table/connectors/elasticsearch.html +--- + + +# Elasticsearch SQL 连接器 + +{{< label "Sink: Batch" >}} +{{< label "Sink: Streaming Append & Upsert Mode" >}} + +Elasticsearch 连接器允许将数据写入到 Elasticsearch 引擎的索引中。本文档描述运行 SQL 查询时如何设置 Elasticsearch 连接器。 + +连接器可以工作在 upsert 模式,使用 DDL 中定义的主键与外部系统交换 UPDATE/DELETE 消息。 + +如果 DDL 中没有定义主键,那么连接器只能工作在 append 模式,只能与外部系统交换 INSERT 消息。 + +依赖 +------------ + +{{< sql_download_table "elastic" >}} + +如何创建 Elasticsearch 表 +---------------- + +以下示例展示了如何创建 Elasticsearch sink 表: + +```sql +CREATE TABLE myUserTable ( + user_id STRING, + user_name STRING + uv BIGINT, + pv BIGINT, + PRIMARY KEY (user_id) NOT ENFORCED +) WITH ( + 'connector' = 'elasticsearch-7', + 'hosts' = 'http://localhost:9200', + 'index' = 'users' +); +``` + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 参数 | 是否必选 | 默认值 | 数据类型 | 描述 |
+| ---- | -------- | ------ | -------- | ---- |
+| connector | 必选 | (none) | String | 指定要使用的连接器,有效值为:'elasticsearch-6'(连接到 Elasticsearch 6.x 的集群)、'elasticsearch-7'(连接到 Elasticsearch 7.x 及更高版本的集群)。 |
+| hosts | 必选 | (none) | String | 要连接到的一台或多台 Elasticsearch 主机,例如 'http://host_name:9092;http://host_name:9093'。 |
+| index | 必选 | (none) | String | Elasticsearch 中每条记录的索引。可以是一个静态索引(例如 'myIndex')或一个动态索引(例如 'index-{log_ts\|yyyy-MM-dd}')。更多详细信息,请参见下面的动态索引部分。 |
+| document-type | 6.x 版本中必选 | (none) | String | Elasticsearch 文档类型。在 elasticsearch-7 中不再需要。 |
+| document-id.key-delimiter | 可选 | _ | String | 复合键的分隔符(默认为 "_"),例如,指定为 "$" 将导致文档 ID 为 "KEY1$KEY2$KEY3"。 |
+| username | 可选 | (none) | String | 用于连接 Elasticsearch 实例的用户名。请注意,Elasticsearch 没有预绑定安全特性,但可以按照官方的安全指南启用相关特性来保护 Elasticsearch 集群。 |
+| password | 可选 | (none) | String | 用于连接 Elasticsearch 实例的密码。如果配置了 username,则此选项也必须配置为非空字符串。 |
+| failure-handler | 可选 | fail | String | 对 Elasticsearch 请求失败情况下的失败处理策略。有效策略为:'fail'(如果请求失败并因此导致作业失败,则抛出异常)、'ignore'(忽略失败并放弃请求)、'retry-rejected'(重新添加由于队列容量饱和而失败的请求)、自定义类名称(使用 ActionRequestFailureHandler 的子类进行失败处理)。 |
+| sink.flush-on-checkpoint | 可选 | true | Boolean | 是否在 checkpoint 时执行 flush。禁用后,在 checkpoint 时 sink 将不会等待所有的 pending 请求被 Elasticsearch 确认。因此,sink 不会为请求的 at-least-once 交付提供任何有力保证。 |
+| sink.bulk-flush.max-actions | 可选 | 1000 | Integer | 每个批量请求的最大缓冲操作数。可以设置为 '0' 来禁用它。 |
+| sink.bulk-flush.max-size | 可选 | 2mb | MemorySize | 每个批量请求的缓冲操作在内存中的最大值。单位必须为 MB。可以设置为 '0' 来禁用它。 |
+| sink.bulk-flush.interval | 可选 | 1s | Duration | flush 缓冲操作的间隔。可以设置为 '0' 来禁用它。注意,'sink.bulk-flush.max-size' 和 'sink.bulk-flush.max-actions' 都设置为 '0' 的这种 flush 间隔设置允许对缓冲操作进行完全异步处理。 |
+| sink.bulk-flush.backoff.strategy | 可选 | DISABLED | String | 指定在由于临时请求错误导致任何 flush 操作失败时如何执行重试。有效策略为:'DISABLED'(不执行重试,即第一次请求错误后失败)、'CONSTANT'(等待重试之间的回退延迟)、'EXPONENTIAL'(先等待回退延迟,然后在重试之间指数递增)。 |
+| sink.bulk-flush.backoff.max-retries | 可选 | 8 | Integer | 最大回退重试次数。 |
+| sink.bulk-flush.backoff.delay | 可选 | 50ms | Duration | 每次回退尝试之间的延迟。对于 CONSTANT 回退策略,该值是每次重试之间的延迟。对于 EXPONENTIAL 回退策略,该值是初始的延迟。 |
+| connection.max-retry-timeout | 可选 | (none) | Duration | 最大重试超时时间。 |
+| connection.path-prefix | 可选 | (none) | String | 添加到每个 REST 通信中的前缀字符串,例如,'/v1'。 |
+| format | 可选 | json | String | Elasticsearch 连接器支持指定格式。该格式必须生成一个有效的 json 文档。默认使用内置的 'json' 格式。更多详细信息,请参阅 [JSON Format]({{< ref "docs/connectors/table/formats/json" >}}) 页面。 |
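+
+下面再给出一个组合使用上述参数的建表示意(表名、字段以及参数取值均为示例假设):按天生成动态索引,并调小批量写入的缓冲规模。
+
+```sql
+CREATE TABLE daily_users (
+  user_id STRING,
+  user_name STRING,
+  uv BIGINT,
+  pv BIGINT,
+  log_ts TIMESTAMP(3),
+  PRIMARY KEY (user_id) NOT ENFORCED
+) WITH (
+  'connector' = 'elasticsearch-7',
+  'hosts' = 'http://localhost:9200',
+  'index' = 'users-{log_ts|yyyy-MM-dd}',
+  'sink.bulk-flush.max-actions' = '500',
+  'sink.bulk-flush.interval' = '1s'
+);
+```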
    + +特性 +---------------- + +### Key 处理 + +Elasticsearch sink 可以根据是否定义了主键来确定是在 upsert 模式还是 append 模式下工作。 +如果定义了主键,Elasticsearch sink 将以 upsert 模式工作,该模式可以消费包含 UPDATE/DELETE 消息的查询。 +如果未定义主键,Elasticsearch sink 将以 append 模式工作,该模式只能消费包含 INSERT 消息的查询。 + +在 Elasticsearch 连接器中,主键用于计算 Elasticsearch 的文档 id,文档 id 为最多 512 字节且不包含空格的字符串。 +Elasticsearch 连接器通过使用 `document-id.key-delimiter` 指定的键分隔符按照 DDL 中定义的顺序连接所有主键字段,为每一行记录生成一个文档 ID 字符串。 +某些类型不允许作为主键字段,因为它们没有对应的字符串表示形式,例如,`BYTES`,`ROW`,`ARRAY`,`MAP` 等。 +如果未指定主键,Elasticsearch 将自动生成文档 id。 + +有关 PRIMARY KEY 语法的更多详细信息,请参见 [CREATE TABLE DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table)。 + +### 动态索引 + +Elasticsearch sink 同时支持静态索引和动态索引。 + +如果你想使用静态索引,则 `index` 选项值应为纯字符串,例如 `'myusers'`,所有记录都将被写入到 "myusers" 索引中。 + +如果你想使用动态索引,你可以使用 `{field_name}` 来引用记录中的字段值来动态生成目标索引。 +你也可以使用 `'{field_name|date_format_string}'` 将 `TIMESTAMP/DATE/TIME` 类型的字段值转换为 `date_format_string` 指定的格式。 +`date_format_string` 与 Java 的 [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/index.html) 兼容。 +例如,如果选项值设置为 `'myusers-{log_ts|yyyy-MM-dd}'`,则 `log_ts` 字段值为 `2020-03-27 12:25:55` 的记录将被写入到 "myusers-2020-03-27" 索引中。 + + +数据类型映射 +---------------- + +Elasticsearch 将文档存储在 JSON 字符串中。因此数据类型映射介于 Flink 数据类型和 JSON 数据类型之间。 +Flink 为 Elasticsearch 连接器使用内置的 `'json'` 格式。更多类型映射的详细信息,请参阅 [JSON Format]({{< ref "docs/connectors/table/formats/json" >}}) 页面。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/filesystem.md b/docs/content.zh/docs/connectors/table/filesystem.md new file mode 100644 index 0000000000000..de9438f8fa347 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/filesystem.md @@ -0,0 +1,483 @@ +--- +title: 文件系统 +weight: 8 +type: docs +aliases: + - /zh/dev/table/connectors/filesystem.html +--- + + +# 文件系统 SQL 连接器 + +该连接器提供了对 [Flink 文件系统抽象]({{< ref "docs/deployment/filesystems/overview" >}}) 支持的文件系统中的分区文件的访问. + +文件系统连接器本身就被包括在 Flink 中,不需要任何额外的依赖。当从文件系统中读取或向文件系统写入记录时,需要指定相应的记录格式。 + +文件系统连接器支持对本地文件系统或分布式文件系统的读取和写入。 可以通过如下方式定义文件系统表: + +```sql +CREATE TABLE MyUserTable ( + column_name1 INT, + column_name2 STRING, + ... + part_name1 INT, + part_name2 STRING +) PARTITIONED BY (part_name1, part_name2) WITH ( + 'connector' = 'filesystem', -- 必选: 指定连接器类型 + 'path' = 'file:///path/to/whatever', -- 必选: 指向目录的路径 + 'format' = '...', -- 必选: 文件系统连接器需要指定格式,请查阅 表格式 部分以获取更多细节 + 'partition.default-name' = '...', -- 可选: 动态分区模式下分区字段值是 null 或空字符串时,默认的分区名。 + 'sink.shuffle-by-partition.enable' = '...', -- 可选: 该选项开启了在 sink 阶段通过动态分区字段来 shuffle 数据,该功能可以大大减少文件系统 sink 的文件数,但可能会导致数据倾斜,默认值是 false. + ... +) +``` + +{{< hint info >}} +需要确保包含以下依赖 [Flink File System specific dependencies]({{< ref "docs/deployment/filesystems/overview" >}}). +{{< /hint >}} + +{{< hint info >}} +针对流的文件系统 sources 目前还在开发中。 将来,社区会不断添加对常见的流处理场景的支持, 比如对分区和目录的检测等。 +{{< /hint >}} + +{{< hint warning >}} +新版的文件系统连接器和旧版的文件系统连接器有很大不同:path 参数指定的是一个目录而不是一个文件,该目录下文件的格式也不是肉眼可读的。 +{{< /hint >}} + +## 分区文件 + +Flink 的文件系统连接器在对分区的支持上,使用了标准的 hive 格式。 不过,它不需要预先注册分区,而是基于目录结构自动做了分区发现。比如,以下目录结构的表, 会被自动推导为包含 `datetime` 和 `hour` 分区的分区表。 + +``` +path +└── datetime=2019-08-25 + └── hour=11 + ├── part-0.parquet + ├── part-1.parquet + └── hour=12 + ├── part-0.parquet +└── datetime=2019-08-26 + └── hour=6 + ├── part-0.parquet +``` + +文件系统连接器支持分区新增插入和分区覆盖插入。 参见 [INSERT Statement]({{< ref "docs/dev/table/sql/insert" >}}). 当对分区表进行分区覆盖插入时,只有相应的分区会被覆盖,而不是整个表。 + +## 文件格式 + +文件系统连接器支持多种格式: + + - CSV: [RFC-4180](https://tools.ietf.org/html/rfc4180). 
非压缩格式。 + - JSON: 注意文件系统连接器中的 JSON 不是传统的标准的 JSON 格式,而是非压缩的 [newline delimited JSON](http://jsonlines.org/). + - Avro: [Apache Avro](http://avro.apache.org). 可以通过配置 `avro.codec` 支持压缩. + - Parquet: [Apache Parquet](http://parquet.apache.org). 与 Hive 兼容. + - Orc: [Apache Orc](http://orc.apache.org). 与 Hive 兼容. + - Debezium-JSON: [debezium-json]({{< ref "docs/connectors/table/formats/debezium" >}}). + - Canal-JSON: [canal-json]({{< ref "docs/connectors/table/formats/canal" >}}). + - Raw: [raw]({{< ref "docs/connectors/table/formats/raw" >}}). + +## 流式 Sink + +文件系统连接器支持流式的写, 它基于 Flink 的 [Streaming File Sink]({{< ref "docs/connectors/datastream/streamfile_sink" >}}) +将记录写入文件。按行编码的格式支持 csv 和 json。 按块编码的格式支持 parquet, orc 和 avro。 + +你可以直接编写 SQL,把流数据插入到非分区表。 +如果是分区表,可以配置分区操作相关的参数,参见 [分区提交](#分区提交) 以查阅更多细节. + +### 滚动策略 + +分区目录下的数据被分割到分区文件中。每个分区对应的sink的每个接受到了数据的子任务都至少会为该分区生成一个分区文件。 +根据可配置的滚动策略,当前正在写入的分区文件会被关闭,新的分区文件也会被生成。 +该策略基于大小,和指定的文件可被打开的最大 timeout 时长,来滚动分区文件。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| sink.rolling-policy.file-size | 128MB | MemorySize | 滚动前,分区文件最大大小。 |
+| sink.rolling-policy.rollover-interval | 30 min | Duration | 滚动前,分区文件处于打开状态的最大时长(默认值是 30 分钟,以避免产生大量小文件)。检查该选项的频率由参数 'sink.rolling-policy.check-interval' 控制。 |
+| sink.rolling-policy.check-interval | 1 min | Duration | 基于时间的滚动策略的检查间隔。该参数控制了基于参数 'sink.rolling-policy.rollover-interval' 检查分区文件是否该被滚动的检查频率。 |
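+
+下面是一个调整滚动策略的建表示意(表名、字段、路径和取值均为示例假设),把文件大小和打开时长的上限调小,可以让行格式的文件更快地对下游可见:
+
+```sql
+CREATE TABLE fs_row_table (
+  user_id STRING,
+  order_amount DOUBLE
+) WITH (
+  'connector' = 'filesystem',
+  'path' = 'file:///path/to/output',
+  'format' = 'json',
+  'sink.rolling-policy.file-size' = '32MB',
+  'sink.rolling-policy.rollover-interval' = '5 min',
+  'sink.rolling-policy.check-interval' = '1 min'
+);
+```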
    + +**注意:** 对于 bulk 格式 (parquet, orc, avro), 滚动策略和检查点间隔控制了分区文件的大小和个数 (未完成的文件会在下个检查点完成). + +**注意:** 对于行格式 (csv, json), 如果想使得分区文件更快地在文件系统中可见,可以设置连接器参数 `sink.rolling-policy.file-size` 或 `sink.rolling-policy.rollover-interval` ,以及 flink-conf.yaml 中的 `execution.checkpointing.interval` 。 +对于其他格式 (avro, orc), 可以只设置 flink-conf.yaml 中的 `execution.checkpointing.interval` 。 + +### 文件合并 + +file sink 支持文件合并,以允许应用程序可以使用较小的检查点间隔而不产生大量文件。 + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| auto-compaction | false | Boolean | 在流式 sink 中是否开启自动合并功能。数据首先会被写入到临时文件,在检查点完成后,该检查点产生的临时文件会被合并。这些临时文件在合并前不可见。 |
+| compaction.file-size | (none) | MemorySize | 合并目标文件大小,默认值是滚动文件大小。 |
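+
+下面是一个开启文件合并的建表示意(表名、字段和路径均为示例假设):
+
+```sql
+CREATE TABLE fs_compact_table (
+  user_id STRING,
+  order_amount DOUBLE
+) WITH (
+  'connector' = 'filesystem',
+  'path' = 'file:///path/to/output',
+  'format' = 'parquet',
+  'auto-compaction' = 'true',
+  'compaction.file-size' = '128MB'
+);
+```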
    + +启用该参数后,文件合并功能会根据设定的目标文件大小,合并多个小文件到大文件。 +当在生产环境使用文件合并功能时,需要注意: +- 只有检查点内部的文件才会被合并,也就是说,至少会生成跟检查点个数一样多的文件。 +- 合并前文件是可见的,所以文件的可见性是:检查点间隔 + 合并时长。 +- 如果合并花费的时间很长,会对作业产生反压,延长检查点所需时间。 + +### 分区提交 + + +分区数据写完毕后,经常需要通知下游应用。比如,在 Hive metastore 中新增分区或者在目录下新增 `_SUCCESS` 文件。 分区提交策略是可定制的,具体的分区提交行为是基于 `triggers` 和 `policies` 的组合. + +- Trigger: 分区提交的时机,可以基于从分区中提取的时间对应的水印,或者基于处理时间。 +- Policy: 分区提交策略,内置的策略包括提交 `_SUCCESS` 文件和 hive metastore, 也可以自己定制提交策略, 比如触发 hive 生成统计信息,合并小文件等。 + +**注意:** 分区提交只有在动态分区插入模式下才有效。 + +#### 分区提交触发器 + +通过配置分区提交的触发策略,来配置何时提交分区: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| sink.partition-commit.trigger | process-time | String | 分区提交触发器类型。'process-time':基于机器时间,既不需要分区时间提取器也不需要水印生成器,一旦 "当前系统时间" 超过了 "分区创建系统时间" 和 'sink.partition-commit.delay' 之和,就提交分区;'partition-time':基于从分区字段提取的时间,需要水印生成器,一旦 "水印" 超过了 "从分区字段提取的时间" 和 'sink.partition-commit.delay' 之和,就提交分区。 |
+| sink.partition-commit.delay | 0 s | Duration | 该延迟时间之前分区不会被提交。如果是按天的分区,应配置为 '1 d',如果是按小时的分区,应配置为 '1 h'。 |
+| sink.partition-commit.watermark-time-zone | UTC | String | 解析 LONG 类型的水印到 TIMESTAMP 类型时所采用的时区,解析得到的水印的 TIMESTAMP 会被用来跟分区时间进行比较以判断分区是否该被提交。该参数只有在参数 'sink.partition-commit.trigger' 被设置为 'partition-time' 时才生效。如果该参数设置的不正确,比如在 TIMESTAMP_LTZ 列上定义了 source rowtime,但没有设置该参数,则用户可能在若干个小时后才看到分区的提交。该参数的默认值是 'UTC',代表水印是定义在 TIMESTAMP 列上或没有定义水印。如果水印定义在 TIMESTAMP_LTZ 列上,则水印的时区是会话的时区。该参数的可选值要么是完整的时区名比如 'America/Los_Angeles',要么是自定义的时区 id 比如 'GMT-08:00'。 |
    + +有两种类型的触发器: +- 第一种是根据分区的处理时间。 该触发器不需要分区时间提取,也不需要生成水印。通过分区创建时间和当前系统时间来触发分区提交。该触发器更通用但不是很精确。比如,数据的延迟或故障转移都会导致分区的提前提交。 +- 第二种是根据从分区字段提取的时间以及水印。这需要你的作业支持生成水印,分区是根据时间来切割的,比如按小时或按天分区。 + +如果想让下游系统尽快感知到分区,而不管分区数据是否完整: +- 'sink.partition-commit.trigger'='process-time' (默认值) +- 'sink.partition-commit.delay'='0s' (默认值) +一旦分区中有数据,分区立马就会被提交。注意:分区可能会被提交多次。 + +如果想让下游系统只有在分区数据完整时才感知到分区,且你的作业有水印生成的逻辑,也能从分区字段的值中提取到时间: +- 'sink.partition-commit.trigger'='partition-time' +- 'sink.partition-commit.delay'='1h' (根据分区类型指定,如果是按小时的分区可配置为 '1h') +该方式是最精确的提交分区的方式,该方式尽力确保提交的分区包含尽量完整的数据。 + +如果想让下游系统只有在数据完整时才感知到分区,但是没有水印,或者无法从分区字段的值中提取时间: +- 'sink.partition-commit.trigger'='process-time' (默认值) +- 'sink.partition-commit.delay'='1h' (根据分区类型指定,如果是按小时的分区可配置为 '1h') +该方式尽量精确地提交分区,但是数据延迟或故障转移会导致分区的提前提交。 + +延迟数据的处理:延迟的记录会被写入到已经提交的对应分区中,且会再次触发该分区的提交。 + +#### 分区时间提取器 + +时间提取器定义了如何从分区字段值中提取时间. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| partition.time-extractor.kind | default | String | 从分区字段提取时间的时间提取器。支持默认值和定制。对于默认值,可以配置时间戳模式。对于定制,应指定提取器类。 |
+| partition.time-extractor.class | (none) | String | 实现了接口 PartitionTimeExtractor 的提取器类。 |
+| partition.time-extractor.timestamp-pattern | (none) | String | 'default' 时间提取器允许用户从分区字段中提取合法的时间戳模式。默认支持从第一个字段按 'yyyy-mm-dd hh:mm:ss' 时间戳模式提取。如果需要从一个分区字段比如 'dt' 提取时间戳,可以配置为:'$dt';如果需要从多个分区字段,比如 'year'、'month'、'day' 和 'hour' 提取时间戳,可以配置为:'$year-$month-$day $hour:00:00';如果需要从两个分区字段,比如 'dt' 和 'hour' 提取时间戳,可以配置为:'$dt $hour:00:00'。 |
    + +默认的提取器是基于由分区字段组合而成的时间戳模式。你也可以指定一个实现了 `PartitionTimeExtractor` 接口的自定义的提取器。 + +```java + +public class HourPartTimeExtractor implements PartitionTimeExtractor { + @Override + public LocalDateTime extract(List keys, List values) { + String dt = values.get(0); + String hour = values.get(1); + return Timestamp.valueOf(dt + " " + hour + ":00:00").toLocalDateTime(); + } +} + +``` + +#### 分区提交策略 + +分区提交策略指定了提交分区时的具体操作. + +- 第一种是 metastore, 只有 hive 表支持该策略, 该策略下文件系统通过目录层次结构来管理分区. +- 第二种是 success 文件, 该策略下会在分区对应的目录下写入一个名为 `_SUCCESS` 的空文件. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| sink.partition-commit.policy.kind | (none) | String | 分区提交策略用来通知下游应用系统某个分区已经写完毕可以被读取了。metastore:向 metastore 中增加分区,只有 hive 支持 metastore 策略,文件系统通过目录结构管理分区;success-file:向目录下增加 '_success' 文件;custom:使用指定的类来创建提交策略。支持同时指定多个提交策略,如:'metastore,success-file'。 |
+| sink.partition-commit.policy.class | (none) | String | 实现了 PartitionCommitPolicy 接口的分区提交策略。只有在 custom 提交策略下适用。 |
+| sink.partition-commit.success-file.name | _SUCCESS | String | 使用 success-file 分区提交策略时的文件名,默认值是 '_SUCCESS'。 |
    + +你也可以实现自己的提交策略,如: + +```java + +public class AnalysisCommitPolicy implements PartitionCommitPolicy { + private HiveShell hiveShell; + + @Override + public void commit(Context context) throws Exception { + if (hiveShell == null) { + hiveShell = createHiveShell(context.catalogName()); + } + + hiveShell.execute(String.format( + "ALTER TABLE %s ADD IF NOT EXISTS PARTITION (%s = '%s') location '%s'", + context.tableName(), + context.partitionKeys().get(0), + context.partitionValues().get(0), + context.partitionPath())); + hiveShell.execute(String.format( + "ANALYZE TABLE %s PARTITION (%s = '%s') COMPUTE STATISTICS FOR COLUMNS", + context.tableName(), + context.partitionKeys().get(0), + context.partitionValues().get(0))); + } +} + +``` + +## Sink 并行度 + +向外部文件系统(包括 hive) 写文件时的并行度,在流处理模式和批处理模式下,都可以通过对应的 table 选项指定。默认情况下,该并行度跟上一个上游的 chained operator 的并行度一样。当配置了跟上一个上游的 chained operator 不一样的并行度时,写文件的算子和合并文件的算子(如果使用了的话)会使用指定的并行度。 + + + + + + + + + + + + + + + + + + + + +
+| Key | Default | Type | Description |
+| --- | ------- | ---- | ----------- |
+| sink.parallelism | (none) | Integer | 向外部文件系统写文件时的并行度。必须大于 0,否则会抛出异常。 |
    + +**注意:** 当前,只有在上游的 changelog 模式是 **INSERT-ONLY** 时,才支持设置 sink 的并行度。否则的话,会抛出异常。 + +## 完整示例 + +如下示例演示了如何使用文件系统连接器编写流查询语句查询 kafka 中的数据并写入到文件系统中,以及通过批查询把结果数据读取出来. + +```sql + +CREATE TABLE kafka_table ( + user_id STRING, + order_amount DOUBLE, + log_ts TIMESTAMP(3), + WATERMARK FOR log_ts AS log_ts - INTERVAL '5' SECOND -- 在 TIMESTAMP 列上定义水印 +) WITH (...); + +CREATE TABLE fs_table ( + user_id STRING, + order_amount DOUBLE, + dt STRING, + `hour` STRING +) PARTITIONED BY (dt, `hour`) WITH ( + 'connector'='filesystem', + 'path'='...', + 'format'='parquet', + 'sink.partition-commit.delay'='1 h', + 'sink.partition-commit.policy.kind'='success-file' +); + +-- streaming sql, 插入数据到文件系统表中 +INSERT INTO fs_table +SELECT + user_id, + order_amount, + DATE_FORMAT(log_ts, 'yyyy-MM-dd'), + DATE_FORMAT(log_ts, 'HH') +FROM kafka_table; + +-- batch sql, 分区裁剪查询 +SELECT * FROM fs_table WHERE dt='2020-05-20' and `hour`='12'; +``` + +如果水印是定义在 TIMESTAMP_LTZ 列上,且使用了 `partition-time` 来提交分区, 则参数 `sink.partition-commit.watermark-time-zone` 需要被设置为会话的时区,否则分区会在若干小时后才会被提交。 +```sql + +CREATE TABLE kafka_table ( + user_id STRING, + order_amount DOUBLE, + ts BIGINT, -- epoch 毫秒时间 + ts_ltz AS TO_TIMESTAMP_LTZ(ts, 3), + WATERMARK FOR ts_ltz AS ts_ltz - INTERVAL '5' SECOND -- 在 TIMESTAMP_LTZ 列上定义水印 +) WITH (...); + +CREATE TABLE fs_table ( + user_id STRING, + order_amount DOUBLE, + dt STRING, + `hour` STRING +) PARTITIONED BY (dt, `hour`) WITH ( + 'connector'='filesystem', + 'path'='...', + 'format'='parquet', + 'partition.time-extractor.timestamp-pattern'='$dt $hour:00:00', + 'sink.partition-commit.delay'='1 h', + 'sink.partition-commit.trigger'='partition-time', + 'sink.partition-commit.watermark-time-zone'='Asia/Shanghai', -- 假定用户配置的时区是 'Asia/Shanghai' + 'sink.partition-commit.policy.kind'='success-file' +); + +-- streaming sql, 插入数据到文件系统表中 +INSERT INTO fs_table +SELECT + user_id, + order_amount, + DATE_FORMAT(ts_ltz, 'yyyy-MM-dd'), + DATE_FORMAT(ts_ltz, 'HH') +FROM kafka_table; + +-- batch sql, 分区裁剪查询 +SELECT * FROM fs_table WHERE dt='2020-05-20' and `hour`='12'; +``` + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/formats/_index.md b/docs/content.zh/docs/connectors/table/formats/_index.md new file mode 100644 index 0000000000000..8faac1684ff96 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/_index.md @@ -0,0 +1,23 @@ +--- +title: Formats +bookCollapseSection: true +weight: 2 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/connectors/table/formats/avro-confluent.md b/docs/content.zh/docs/connectors/table/formats/avro-confluent.md new file mode 100644 index 0000000000000..845ca1f90488e --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/avro-confluent.md @@ -0,0 +1,222 @@ +--- +title: Confluent Avro +weight: 4 +type: docs +aliases: + - /zh/dev/table/connectors/formats/avro-confluent.html +--- + + +# Confluent Avro Format + +Format: Serialization Schema +Format: Deserialization Schema + + + +Avro Schema Registry (``avro-confluent``) 格式能让你读取被 ``io.confluent.kafka.serializers.KafkaAvroSerializer`` 序列化的记录,以及可以写入成能被 ``io.confluent.kafka.serializers.KafkaAvroDeserializer`` 反序列化的记录。 + +当以这种格式读取(反序列化)记录时,将根据记录中编码的 schema 版本 id 从配置的 Confluent Schema Registry 中获取 Avro writer schema ,而从 table schema 中推断出 reader schema。 + +当以这种格式写入(序列化)记录时,Avro schema 是从 table schema 中推断出来的,并会用来检索要与数据一起编码的 schema id。我们会在配置的 Confluent Schema Registry 中配置的 [subject](https://docs.confluent.io/current/schema-registry/index.html#schemas-subjects-and-topics) 下,检索 schema id。subject 通过 
`avro-confluent.schema-registry.subject` 参数来制定。 + +Avro Schema Registry 格式只能与 [Apache Kafka SQL 连接器]({{< ref "docs/connectors/table/kafka" >}})或 [Upsert Kafka SQL 连接器]({{< ref "docs/connectors/table/upsert-kafka" >}})一起使用。 + +依赖 +------------ + +{{< sql_download_table "avro-confluent" >}} + +如何创建使用 Avro-Confluent 格式的表 +---------------- + +以下是一个使用 Kafka 连接器和 Confluent Avro 格式创建表的示例。 + +{{< tabs "3df131fd-0e20-4635-a8f9-3574a764db7a" >}} +{{< tab "SQL" >}} + +使用原始的 UTF-8 字符串作为 Kafka 的 key,Schema Registry 中注册的 Avro 记录作为 Kafka 的 values 的表的示例: + +```sql +CREATE TABLE user_created ( + + -- 该列映射到 Kafka 原始的 UTF-8 key + the_kafka_key STRING, + + -- 映射到 Kafka value 中的 Avro 字段的一些列 + id STRING, + name STRING, + email STRING + +) WITH ( + + 'connector' = 'kafka', + 'topic' = 'user_events_example1', + 'properties.bootstrap.servers' = 'localhost:9092', + + -- UTF-8 字符串作为 Kafka 的 keys,使用表中的 'the_kafka_key' 列 + 'key.format' = 'raw', + 'key.fields' = 'the_kafka_key', + + 'value.format' = 'avro-confluent', + 'value.avro-confluent.schema-registry.url' = 'http://localhost:8082', + 'value.fields-include' = 'EXCEPT_KEY' +) +``` + +我们可以像下面这样将数据写入到 kafka 表中: + +```sql +INSERT INTO user_created +SELECT + -- 将 user id 复制至映射到 kafka key 的列中 + id as the_kafka_key, + + -- 所有的 values + id, name, email +FROM some_table +``` + +--- + +Kafka 的 key 和 value 在 Schema Registry 中都注册为 Avro 记录的表的示例: + +```sql +CREATE TABLE user_created ( + + -- 该列映射到 Kafka key 中的 Avro 字段 'id' + kafka_key_id STRING, + + -- 映射到 Kafka value 中的 Avro 字段的一些列 + id STRING, + name STRING, + email STRING + +) WITH ( + + 'connector' = 'kafka', + 'topic' = 'user_events_example2', + 'properties.bootstrap.servers' = 'localhost:9092', + + -- 注意:由于哈希分区,在 Kafka key 的上下文中,schema 升级几乎从不向后也不向前兼容。 + 'key.format' = 'avro-confluent', + 'key.avro-confluent.schema-registry.url' = 'http://localhost:8082', + 'key.fields' = 'kafka_key_id', + + -- 在本例中,我们希望 Kafka 的 key 和 value 的 Avro 类型都包含 'id' 字段 + -- => 给表中与 Kafka key 字段关联的列添加一个前缀来避免冲突 + 'key.fields-prefix' = 'kafka_key_', + + 'value.format' = 'avro-confluent', + 'value.avro-confluent.schema-registry.url' = 'http://localhost:8082', + 'value.fields-include' = 'EXCEPT_KEY', + + -- 自 Flink 1.13 起,subjects 具有一个默认值, 但是可以被覆盖: + 'key.avro-confluent.schema-registry.subject' = 'user_events_example2-key2', + 'value.avro-confluent.schema-registry.subject' = 'user_events_example2-value2' +) +``` + +--- +使用 upsert-kafka 连接器,Kafka 的 value 在 Schema Registry 中注册为 Avro 记录的表的示例: + +```sql +CREATE TABLE user_created ( + + -- 该列映射到 Kafka 原始的 UTF-8 key + kafka_key_id STRING, + + -- 映射到 Kafka value 中的 Avro 字段的一些列 + id STRING, + name STRING, + email STRING, + + -- upsert-kafka 连接器需要一个主键来定义 upsert 行为 + PRIMARY KEY (kafka_key_id) NOT ENFORCED + +) WITH ( + + 'connector' = 'upsert-kafka', + 'topic' = 'user_events_example3', + 'properties.bootstrap.servers' = 'localhost:9092', + + -- UTF-8 字符串作为 Kafka 的 keys + -- 在本例中我们不指定 'key.fields',因为它由表的主键决定 + 'key.format' = 'raw', + + -- 在本例中,我们希望 Kafka 的 key 和 value 的 Avro 类型都包含 'id' 字段 + -- => 给表中与 Kafka key 字段关联的列添加一个前缀来避免冲突 + 'key.fields-prefix' = 'kafka_key_', + + 'value.format' = 'avro-confluent', + 'value.avro-confluent.schema-registry.url' = 'http://localhost:8082', + 'value.fields-include' = 'EXCEPT_KEY' +) +``` +{{< /tab >}} +{{< /tabs >}} + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 参数 | 是否必选 | 默认值 | 类型 | 描述 |
+| ---- | -------- | ------ | ---- | ---- |
+| format | 必选 | (none) | String | 指定要使用的格式,这里应该是 'avro-confluent'。 |
+| avro-confluent.schema-registry.url | 必选 | (none) | String | 用于获取/注册 schemas 的 Confluent Schema Registry 的 URL。 |
+| avro-confluent.schema-registry.subject | 可选 | (none) | String | Confluent Schema Registry 主题,用于在序列化期间注册此格式使用的 schema。默认 kafka 和 upsert-kafka 连接器会使用 `<topic_name>-value` 或者 `<topic_name>-key` 作为 subject 名字。但是对于其他连接器(如 filesystem)则在当做 sink 使用时需要显式指定 subject 名字。 |
    + +数据类型映射 +---------------- + +目前 Apache Flink 都是从 table schema 去推断反序列化期间的 Avro reader schema 和序列化期间的 Avro writer schema。显式地定义 Avro schema 暂不支持。 +[Apache Avro Format]({{< ref "docs/connectors/table/formats/avro" >}}#data-type-mapping)中描述了 Flink 数据类型和 Avro 类型的对应关系。 + +除了此处列出的类型之外,Flink 还支持读取/写入可为空(nullable)的类型。 Flink 将可为空的类型映射到 Avro `union(something, null)`, 其中 `something` 是从 Flink 类型转换的 Avro 类型。 + +您可以参考 [Avro Specification](https://avro.apache.org/docs/current/spec.html) 以获取有关 Avro 类型的更多信息。 diff --git a/docs/content.zh/docs/connectors/table/formats/avro.md b/docs/content.zh/docs/connectors/table/formats/avro.md new file mode 100644 index 0000000000000..7edb8bf3873e3 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/avro.md @@ -0,0 +1,198 @@ +--- +title: Avro +weight: 4 +type: docs +aliases: + - /zh/dev/table/connectors/formats/avro.html +--- + + +# Avro Format + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[Apache Avro](https://avro.apache.org/) format 允许基于 Avro schema 读取和写入 Avro 数据。目前,Avro schema 从 table schema 推导而来。 + +依赖 +------------ + +{{< sql_download_table "avro" >}} + + +如何使用 Avro format 创建表 +---------------- + +这是使用 Kafka 连接器和 Avro format 创建表的示例。 + +```sql +CREATE TABLE user_behavior ( + user_id BIGINT, + item_id BIGINT, + category_id BIGINT, + behavior STRING, + ts TIMESTAMP(3) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'avro' +) +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 参数 | 是否必选 | 默认值 | 类型 | 描述 |
+| ---- | -------- | ------ | ---- | ---- |
+| format | 必要 | (none) | String | 指定使用什么 format,这里应该是 'avro'。 |
+| avro.codec | 可选 | (none) | String | 仅用于 [filesystem]({{< ref "docs/connectors/table/filesystem" >}}),avro 压缩编解码器。默认不压缩。目前支持:deflate、snappy、bzip2、xz。 |
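+
+例如,用 filesystem 连接器写出 Avro 文件并启用 snappy 压缩时,可以这样建表(表名、字段和路径均为示例假设):
+
+```sql
+CREATE TABLE avro_fs_table (
+  user_id BIGINT,
+  behavior STRING
+) WITH (
+  'connector' = 'filesystem',
+  'path' = 'file:///path/to/avro-output',
+  'format' = 'avro',
+  'avro.codec' = 'snappy'
+);
+```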
    + +数据类型映射 +---------------- + +目前,Avro schema 通常是从 table schema 中推导而来。尚不支持显式定义 Avro schema。因此,下表列出了从 Flink 类型到 Avro 类型的类型映射。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Flink SQL 类型 | Avro 类型 | Avro 逻辑类型 |
+| -------------- | --------- | ------------- |
+| CHAR / VARCHAR / STRING | string |  |
+| BOOLEAN | boolean |  |
+| BINARY / VARBINARY | bytes |  |
+| DECIMAL | fixed | decimal |
+| TINYINT | int |  |
+| SMALLINT | int |  |
+| INT | int |  |
+| BIGINT | long |  |
+| FLOAT | float |  |
+| DOUBLE | double |  |
+| DATE | int | date |
+| TIME | int | time-millis |
+| TIMESTAMP | long | timestamp-millis |
+| ARRAY | array |  |
+| MAP(key 必须是 string/char/varchar 类型) | map |  |
+| MULTISET(元素必须是 string/char/varchar 类型) | map |  |
+| ROW | record |  |
    + +除了上面列出的类型,Flink 支持读取/写入 nullable 的类型。Flink 将 nullable 的类型映射到 Avro `union(something, null)`,其中 `something` 是从 Flink 类型转换的 Avro 类型。 + +您可以参考 [Avro 规范](https://avro.apache.org/docs/current/spec.html) 获取更多有关 Avro 类型的信息。 diff --git a/docs/content.zh/docs/connectors/table/formats/canal.md b/docs/content.zh/docs/connectors/table/formats/canal.md new file mode 100644 index 0000000000000..d7630f94f4203 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/canal.md @@ -0,0 +1,305 @@ +--- +title: Canal +weight: 6 +type: docs +aliases: + - /zh/dev/table/connectors/formats/canal.html +--- + + +# Canal Format + +{{< label "Changelog-Data-Capture Format" >}} +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[Canal](https://github.com/alibaba/canal/wiki) 是一个 CDC(ChangeLog Data Capture,变更日志数据捕获)工具,可以实时地将 MySQL 变更传输到其他系统。Canal 为变更日志提供了统一的数据格式,并支持使用 JSON 或 [protobuf](https://developers.google.com/protocol-buffers) 序列化消息(Canal 默认使用 protobuf)。 + +Flink 支持将 Canal 的 JSON 消息解析为 INSERT / UPDATE / DELETE 消息到 Flink SQL 系统中。在很多情况下,利用这个特性非常的有用,例如 + - 将增量数据从数据库同步到其他系统 + - 日志审计 + - 数据库的实时物化视图 + - 关联维度数据库的变更历史,等等。 + +Flink 还支持将 Flink SQL 中的 INSERT / UPDATE / DELETE 消息编码为 Canal 格式的 JSON 消息,输出到 Kafka 等存储中。 +但需要注意的是,目前 Flink 还不支持将 UPDATE_BEFORE 和 UPDATE_AFTER 合并为一条 UPDATE 消息。因此,Flink 将 UPDATE_BEFORE 和 UPDATE_AFTER 分别编码为 DELETE 和 INSERT 类型的 Canal 消息。 + +*注意:未来会支持 Canal protobuf 类型消息的解析以及输出 Canal 格式的消息。* + +依赖 +------------ + +{{< sql_download_table "canal" >}} + +*注意:有关如何部署 Canal 以将变更日志同步到消息队列,请参阅 [Canal 文档](https://github.com/alibaba/canal/wiki)。* + + +如何使用 Canal Format +---------------- + +Canal 为变更日志提供了统一的格式,下面是一个从 MySQL 库 `products` 表中捕获更新操作的简单示例: + +```json +{ + "data": [ + { + "id": "111", + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": "5.18" + } + ], + "database": "inventory", + "es": 1589373560000, + "id": 9, + "isDdl": false, + "mysqlType": { + "id": "INTEGER", + "name": "VARCHAR(255)", + "description": "VARCHAR(512)", + "weight": "FLOAT" + }, + "old": [ + { + "weight": "5.15" + } + ], + "pkNames": [ + "id" + ], + "sql": "", + "sqlType": { + "id": 4, + "name": 12, + "description": 12, + "weight": 7 + }, + "table": "products", + "ts": 1589373560798, + "type": "UPDATE" +} +``` + +*注意:有关各个字段的含义,请参阅 [Canal 文档](https://github.com/alibaba/canal/wiki)* + +MySQL `products` 表有4列(`id`,`name`,`description` 和 `weight`)。上面的 JSON 消息是 `products` 表上的一个更新事件,表示 `id = 111` 的行数据上 `weight` 字段值从`5.15`变更成为 `5.18`。假设消息已经同步到了一个 Kafka 主题:`products_binlog`,那么就可以使用以下DDL来从这个主题消费消息并解析变更事件。 + +```sql +CREATE TABLE topic_products ( + -- 元数据与 MySQL "products" 表完全相同 + id BIGINT, + name STRING, + description STRING, + weight DECIMAL(10, 2) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'products_binlog', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'canal-json' -- 使用 canal-json 格式 +) +``` + +将 Kafka 主题注册成 Flink 表之后,就可以将 Canal 消息用作变更日志源。 + +```sql +-- 关于MySQL "products" 表的实时物化视图 +-- 计算相同产品的最新平均重量 +SELECT name, AVG(weight) FROM topic_products GROUP BY name; + +-- 将 MySQL "products" 表的所有数据和增量更改同步到 +-- Elasticsearch "products" 索引以供将来搜索 +INSERT INTO elasticsearch_products +SELECT * FROM topic_products; +``` + +Available Metadata +------------------ + +The following format metadata can be exposed as read-only (`VIRTUAL`) columns in a table definition. + +Attention Format metadata fields are only available if the +corresponding connector forwards format metadata. 
Currently, only the Kafka connector is able to expose +metadata fields for its value format. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Data Type | Description |
+| --- | --------- | ----------- |
+| database | `STRING NULL` | The originating database. Corresponds to the `database` field in the Canal record if available. |
+| table | `STRING NULL` | The originating database table. Corresponds to the `table` field in the Canal record if available. |
+| sql-type | `MAP<STRING, INT> NULL` | Map of various sql types. Corresponds to the `sqlType` field in the Canal record if available. |
+| pk-names | `ARRAY<STRING> NULL` | Array of primary key names. Corresponds to the `pkNames` field in the Canal record if available. |
+| ingestion-timestamp | `TIMESTAMP_LTZ(3) NULL` | The timestamp at which the connector processed the event. Corresponds to the `ts` field in the Canal record. |
    + +The following example shows how to access Canal metadata fields in Kafka: + +```sql +CREATE TABLE KafkaTable ( + origin_database STRING METADATA FROM 'value.database' VIRTUAL, + origin_table STRING METADATA FROM 'value.table' VIRTUAL, + origin_sql_type MAP METADATA FROM 'value.sql-type' VIRTUAL, + origin_pk_names ARRAY METADATA FROM 'value.pk-names' VIRTUAL, + origin_ts TIMESTAMP(3) METADATA FROM 'value.ingestion-timestamp' VIRTUAL, + user_id BIGINT, + item_id BIGINT, + behavior STRING +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'scan.startup.mode' = 'earliest-offset', + 'value.format' = 'canal-json' +); +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 选项 | 要求 | 默认 | 类型 | 描述 |
+| ---- | ---- | ---- | ---- | ---- |
+| format | 必填 | (none) | String | 指定要使用的格式,此处应为 'canal-json'。 |
+| canal-json.ignore-parse-errors | 选填 | false | Boolean | 当解析异常时,是跳过当前字段或行,还是抛出错误失败(默认为 false,即抛出错误失败)。如果忽略字段的解析异常,则会将该字段值设置为 null。 |
+| canal-json.timestamp-format.standard | 选填 | 'SQL' | String | 指定输入和输出时间戳格式。当前支持的值是 'SQL' 和 'ISO-8601':选项 'SQL' 将解析 "yyyy-MM-dd HH:mm:ss.s{precision}" 格式的输入时间戳,例如 '2020-12-30 12:13:14.123',并以相同格式输出时间戳;选项 'ISO-8601' 将解析 "yyyy-MM-ddTHH:mm:ss.s{precision}" 格式的输入时间戳,例如 '2020-12-30T12:13:14.123',并以相同的格式输出时间戳。 |
+| canal-json.map-null-key.mode | 选填 | 'FAIL' | String | 指定处理 Map 中 key 值为空的方法。当前支持的值有 'FAIL'、'DROP' 和 'LITERAL':选项 'FAIL' 将在遇到 Map 中 key 值为空的数据时抛出异常;选项 'DROP' 将丢弃 Map 中 key 值为空的数据项;选项 'LITERAL' 将使用字符串常量来替换 Map 中的空 key 值,字符串常量的值由 'canal-json.map-null-key.literal' 定义。 |
+| canal-json.map-null-key.literal | 选填 | 'null' | String | 当 'canal-json.map-null-key.mode' 是 LITERAL 的时候,指定字符串常量替换 Map 中的空 key 值。 |
+| canal-json.encode.decimal-as-plain-number | 选填 | false | Boolean | 将所有 DECIMAL 类型的数据保持原状,不使用科学计数法表示。例:0.000000027 默认会表示为 2.7E-8。当此选项设为 true 时,则会表示为 0.000000027。 |
+| canal-json.database.include | 选填 | (none) | String | 一个可选的正则表达式,通过正则匹配 Canal 记录中的 "database" 元字段,仅读取指定数据库的 changelog 记录。正则字符串与 Java 的 Pattern 兼容。 |
+| canal-json.table.include | 选填 | (none) | String | 一个可选的正则表达式,通过正则匹配 Canal 记录中的 "table" 元字段,仅读取指定表的 changelog 记录。正则字符串与 Java 的 Pattern 兼容。 |
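+
+例如,只消费 `inventory` 库下 `products` 表的 changelog 时,可以在前文 `topic_products` 表的基础上追加过滤参数,示意如下:
+
+```sql
+CREATE TABLE filtered_products (
+  id BIGINT,
+  name STRING,
+  description STRING,
+  weight DECIMAL(10, 2)
+) WITH (
+  'connector' = 'kafka',
+  'topic' = 'products_binlog',
+  'properties.bootstrap.servers' = 'localhost:9092',
+  'properties.group.id' = 'testGroup',
+  'format' = 'canal-json',
+  'canal-json.ignore-parse-errors' = 'true',
+  'canal-json.database.include' = 'inventory',
+  'canal-json.table.include' = 'products'
+);
+```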
    + +注意事项 +---------------- + +### 重复的变更事件 + +在正常的操作环境下,Canal 应用能以 **exactly-once** 的语义投递每条变更事件。在这种情况下,Flink 消费 Canal 产生的变更事件能够工作得很好。 +然而,当有故障发生时,Canal 应用只能保证 **at-least-once** 的投递语义。 +这也意味着,在非正常情况下,Canal 可能会投递重复的变更事件到消息队列中,当 Flink 从消息队列中消费的时候就会得到重复的事件。 +这可能会导致 Flink query 的运行得到错误的结果或者非预期的异常。因此,建议在这种情况下,建议在这种情况下,将作业参数 [`table.exec.source.cdc-events-duplicate`]({{< ref "docs/dev/table/config" >}}#table-exec-source-cdc-events-duplicate) 设置成 `true`,并在该 source 上定义 PRIMARY KEY。 +框架会生成一个额外的有状态算子,使用该 primary key 来对变更事件去重并生成一个规范化的 changelog 流。 + +数据类型映射 +---------------- + +目前,Canal Format 使用 JSON Format 进行序列化和反序列化。 有关数据类型映射的更多详细信息,请参阅 [JSON Format 文档]({{< ref "docs/connectors/table/formats/json" >}}#data-type-mapping)。 + diff --git a/docs/content.zh/docs/connectors/table/formats/csv.md b/docs/content.zh/docs/connectors/table/formats/csv.md new file mode 100644 index 0000000000000..417af014a479c --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/csv.md @@ -0,0 +1,228 @@ +--- +title: CSV +weight: 2 +type: docs +aliases: + - /zh/dev/table/connectors/formats/csv.html +--- + + +# CSV Format + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[CSV](https://zh.wikipedia.org/wiki/%E9%80%97%E5%8F%B7%E5%88%86%E9%9A%94%E5%80%BC) Format 允许我们基于 CSV schema 进行解析和生成 CSV 数据。 目前 CSV schema 是基于 table schema 推断而来的。 + +依赖 +------------ + +{{< sql_download_table "csv" >}} + +如何创建使用 CSV 格式的表 +---------------- + + +以下是一个使用 Kafka 连接器和 CSV 格式创建表的示例。 + +```sql +CREATE TABLE user_behavior ( + user_id BIGINT, + item_id BIGINT, + category_id BIGINT, + behavior STRING, + ts TIMESTAMP(3) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'csv', + 'csv.ignore-parse-errors' = 'true', + 'csv.allow-comments' = 'true' +) +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值类型描述
    format
    必选(none)String指定要使用的格式,这里应该是 'csv'
    csv.field-delimiter
    可选,String字段分隔符 (默认','),必须为单字符。你可以使用反斜杠字符指定一些特殊字符,例如 '\t' 代表制表符。 + 你也可以通过 unicode 编码在纯 SQL 文本中指定一些特殊字符,例如 'csv.field-delimiter' = U&'\0001' 代表 0x01 字符。 +
    csv.disable-quote-character
    可选falseBoolean是否禁止对引用的值使用引号 (默认是 false). 如果禁止,选项 'csv.quote-character' 不能设置。
    csv.quote-character
    可选"String用于围住字段值的引号字符 (默认").
    csv.allow-comments
    可选falseBoolean是否允许忽略注释行(默认不允许),注释行以 '#' 作为起始字符。 + 如果允许注释行,请确保 csv.ignore-parse-errors 也开启了从而允许空行。 +
    csv.ignore-parse-errors
    可选falseBoolean当解析异常时,是跳过当前字段或行,还是抛出错误失败(默认为 false,即抛出错误失败)。如果忽略字段的解析异常,则会将该字段值设置为null
    csv.array-element-delimiter
    可选;String分隔数组和行元素的字符串(默认';').
    csv.escape-character
    可选(none)String转义字符(默认关闭).
    csv.null-literal
    可选(none)String是否将 "null" 字符串转化为 null 值。
    + +数据类型映射 +---------------- + +目前 CSV 的 schema 都是从 table schema 推断而来的。显式地定义 CSV schema 暂不支持。 +Flink 的 CSV Format 数据使用 [jackson databind API](https://github.com/FasterXML/jackson-databind) 去解析 CSV 字符串。 + +下面的表格列出了flink数据和CSV数据的对应关系。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink SQL 类型CSV 类型
    CHAR / VARCHAR / STRINGstring
    BOOLEANboolean
    BINARY / VARBINARYstring with encoding: base64
    DECIMALnumber
    TINYINTnumber
    SMALLINTnumber
    INTnumber
    BIGINTnumber
    FLOATnumber
    DOUBLEnumber
    DATEstring with format: date
    TIMEstring with format: time
    TIMESTAMPstring with format: date-time
    INTERVALnumber
    ARRAYarray
    ROWobject
    diff --git a/docs/content.zh/docs/connectors/table/formats/debezium.md b/docs/content.zh/docs/connectors/table/formats/debezium.md new file mode 100644 index 0000000000000..d0283934ba69f --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/debezium.md @@ -0,0 +1,385 @@ +--- +title: Debezium +weight: 5 +type: docs +aliases: + - /zh/dev/table/connectors/formats/debezium.html +--- + + +# Debezium Format + +{{< label "Changelog-Data-Capture Format" >}} +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[Debezium](https://debezium.io/) 是一个 CDC(Changelog Data Capture,变更数据捕获)的工具,可以把来自 MySQL、PostgreSQL、Oracle、Microsoft SQL Server 和许多其他数据库的更改实时流式传输到 Kafka 中。 Debezium 为变更日志提供了统一的格式结构,并支持使用 JSON 和 Apache Avro 序列化消息。 + +Flink 支持将 Debezium JSON 和 Avro 消息解析为 INSERT / UPDATE / DELETE 消息到 Flink SQL 系统中。在很多情况下,利用这个特性非常的有用,例如 + - 将增量数据从数据库同步到其他系统 + - 日志审计 + - 数据库的实时物化视图 + - 关联维度数据库的变更历史,等等。 + +Flink 还支持将 Flink SQL 中的 INSERT / UPDATE / DELETE 消息编码为 Debezium 格式的 JSON 或 Avro 消息,输出到 Kafka 等存储中。 +但需要注意的是,目前 Flink 还不支持将 UPDATE_BEFORE 和 UPDATE_AFTER 合并为一条 UPDATE 消息。因此,Flink 将 UPDATE_BEFORE 和 UPDATE_AFTER 分别编码为 DELETE 和 INSERT 类型的 Debezium 消息。 + +依赖 +------------ + +#### Debezium Avro + +{{< sql_download_table "debezium-avro-confluent" >}} + +#### Debezium Json + +{{< sql_download_table "debezium-json" >}} + +*注意: 请参考 [Debezium 文档](https://debezium.io/documentation/reference/1.3/index.html),了解如何设置 Debezium Kafka Connect 用来将变更日志同步到 Kafka 主题。* + + +如何使用 Debezium Format +---------------- + + +Debezium 为变更日志提供了统一的格式,这是一个 JSON 格式的从 MySQL product 表捕获的更新操作的简单示例: + +```json +{ + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "source": {...}, + "op": "u", + "ts_ms": 1589362330904, + "transaction": null +} +``` + +*注意: 请参考 [Debezium 文档](https://debezium.io/documentation/reference/1.3/connectors/mysql.html#mysql-connector-events_debezium),了解每个字段的含义。* + +MySQL 产品表有4列(`id`、`name`、`description`、`weight`)。上面的 JSON 消息是 `products` 表上的一条更新事件,其中 `id = 111` 的行的 `weight` 值从 `5.18` 更改为 `5.15`。假设此消息已同步到 Kafka 主题 `products_binlog`,则可以使用以下 DDL 来使用此主题并解析更改事件。 + +{{< tabs "0b6703c1-021e-4506-a579-b72b8408c0cf" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE topic_products ( + -- schema 与 MySQL 的 products 表完全相同 + id BIGINT, + name STRING, + description STRING, + weight DECIMAL(10, 2) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'products_binlog', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + -- 使用 'debezium-json' format 来解析 Debezium 的 JSON 消息 + -- 如果 Debezium 用 Avro 编码消息,请使用 'debezium-avro-confluent' + 'format' = 'debezium-json' -- 如果 Debezium 用 Avro 编码消息,请使用 'debezium-avro-confluent' +) +``` +{{< /tab >}} +{{< /tabs >}} + +在某些情况下,用户在设置 Debezium Kafka Connect 时,可能会开启 Kafka 的配置 `'value.converter.schemas.enable'`,用来在消息体中包含 schema 信息。然后,Debezium JSON 消息可能如下所示: + +```json +{ + "schema": {...}, + "payload": { + "before": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.18 + }, + "after": { + "id": 111, + "name": "scooter", + "description": "Big 2-wheel scooter", + "weight": 5.15 + }, + "source": {...}, + "op": "u", + "ts_ms": 1589362330904, + "transaction": null + } +} +``` + +为了解析这一类信息,你需要在上述 DDL WITH 子句中添加选项 `'debezium-json.schema-include' = 'true'`(默认为 false)。通常情况下,建议不要包含 schema 的描述,因为这样会使消息变得非常冗长,并降低解析性能。 + +在将主题注册为 Flink 
表之后,可以将 Debezium 消息用作变更日志源。 + +{{< tabs "6a84a0e8-2e56-49db-9089-e836290f8239" >}} +{{< tab "SQL" >}} +```sql +-- MySQL "products" 的实时物化视图 +-- 计算相同产品的最新平均重量 +SELECT name, AVG(weight) FROM topic_products GROUP BY name; + +-- 将 MySQL "products" 表的所有数据和增量更改同步到 +-- Elasticsearch "products" 索引,供将来查找 +INSERT INTO elasticsearch_products +SELECT * FROM topic_products; +``` +{{< /tab >}} +{{< /tabs >}} + +Available Metadata +------------------ + +The following format metadata can be exposed as read-only (`VIRTUAL`) columns in a table definition. + +Attention Format metadata fields are only available if the +corresponding connector forwards format metadata. Currently, only the Kafka connector is able to expose +metadata fields for its value format. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyData TypeDescription
    schemaSTRING NULLJSON string describing the schema of the payload. Null if the schema is not included in + the Debezium record.
    ingestion-timestampTIMESTAMP_LTZ(3) NULLThe timestamp at which the connector processed the event. Corresponds to the ts_ms + field in the Debezium record.
    source.timestampTIMESTAMP_LTZ(3) NULLThe timestamp at which the source system created the event. Corresponds to the source.ts_ms + field in the Debezium record.
    source.databaseSTRING NULLThe originating database. Corresponds to the source.db field in the + Debezium record if available.
    source.schemaSTRING NULLThe originating database schema. Corresponds to the source.schema field in the + Debezium record if available.
    source.tableSTRING NULLThe originating database table. Corresponds to the source.table or source.collection + field in the Debezium record if available.
    source.propertiesMAP<STRING, STRING> NULLMap of various source properties. Corresponds to the source field in the Debezium record.
    + +The following example shows how to access Debezium metadata fields in Kafka: + +```sql +CREATE TABLE KafkaTable ( + origin_ts TIMESTAMP(3) METADATA FROM 'value.ingestion-timestamp' VIRTUAL, + event_time TIMESTAMP(3) METADATA FROM 'value.source.timestamp' VIRTUAL, + origin_database STRING METADATA FROM 'value.source.database' VIRTUAL, + origin_schema STRING METADATA FROM 'value.source.schema' VIRTUAL, + origin_table STRING METADATA FROM 'value.source.table' VIRTUAL, + origin_properties MAP METADATA FROM 'value.source.properties' VIRTUAL, + user_id BIGINT, + item_id BIGINT, + behavior STRING +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'scan.startup.mode' = 'earliest-offset', + 'value.format' = 'debezium-json' +); +``` + +Format 参数 +---------------- + +Flink 提供了 `debezium-avro-confluent` 和 `debezium-json` 两种 format 来解析 Debezium 生成的 JSON 格式和 Avro 格式的消息。 +请使用 `debezium-avro-confluent` 来解析 Debezium 的 Avro 消息,使用 `debezium-json` 来解析 Debezium 的 JSON 消息。 + +#### Debezium Avro + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值类型描述
    format
    必选(none)String指定要使用的格式,此处应为 'debezium-avro-confluent'
    debezium-avro-confluent.schema-registry.url
    必选(none)String用于获取/注册 schemas 的 Confluent Schema Registry 的 URL。
    debezium-avro-confluent.schema-registry.subject
    可选(none)StringConfluent Schema Registry主题,用于在序列化期间注册此格式使用的 schema。默认 kafka 连接器会使用 "<topic_name>-value" 作为默认的 subject 名字,但是对于其他连接器(如 filesystem)则在当做 sink 使用时需要显式指定 subject 名字。
    + +#### Debezium Json + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值类型描述
    format
    必选(none)String指定要使用的格式,此处应为 'debezium-json'
    debezium-json.schema-include
    可选falseBoolean设置 Debezium Kafka Connect 时,用户可以启用 Kafka 配置 'value.converter.schemas.enable' 以在消息中包含 schema。此选项表明 Debezium JSON 消息是否包含 schema。
    debezium-json.ignore-parse-errors
    可选falseBoolean当解析异常时,是跳过当前字段或行,还是抛出错误失败(默认为 false,即抛出错误失败)。如果忽略字段的解析异常,则会将该字段值设置为null
    debezium-json.timestamp-format.standard
    可选'SQL'String声明输入和输出的时间戳格式。当前支持的格式为'SQL' 以及 'ISO-8601': +
      +
    • 可选参数 'SQL' 将会以 "yyyy-MM-dd HH:mm:ss.s{precision}" 的格式解析时间戳, 例如 '2020-12-30 12:13:14.123',且会以相同的格式输出。
    • +
    • 可选参数 'ISO-8601' 将会以 "yyyy-MM-ddTHH:mm:ss.s{precision}" 的格式解析输入时间戳, 例如 '2020-12-30T12:13:14.123' ,且会以相同的格式输出。
    • +
    +
    debezium-json.map-null-key.mode
    选填'FAIL'String指定处理 Map 中 key 值为空的方法. 当前支持的值有 'FAIL', 'DROP''LITERAL': +
      +
• Option 'FAIL' 在遇到 Map 中 key 值为空的数据时,将抛出异常。
    • +
    • Option 'DROP' 将丢弃 Map 中 key 值为空的数据项。
    • +
    • Option 'LITERAL' 将使用字符串常量来替换 Map 中的空 key 值。字符串常量的值由 'debezium-json.map-null-key.literal' 定义。
    • +
    +
    debezium-json.map-null-key.literal
    选填'null'String'debezium-json.map-null-key.mode' 是 LITERAL 的时候,指定字符串常量替换 Map 中的空 key 值。
    debezium-json.encode.decimal-as-plain-number
    选填falseBoolean将所有 DECIMAL 类型的数据保持原状,不使用科学计数法表示。例:0.000000027 默认会表示为 2.7E-8。当此选项设为 true 时,则会表示为 0.000000027
    + + +注意事项 +---------------- + +### 重复的变更事件 + +在正常的操作环境下,Debezium 应用能以 **exactly-once** 的语义投递每条变更事件。在这种情况下,Flink 消费 Debezium 产生的变更事件能够工作得很好。 +然而,当有故障发生时,Debezium 应用只能保证 **at-least-once** 的投递语义。可以查看 [Debezium 官方文档](https://debezium.io/documentation/faq/#what_happens_when_an_application_stops_or_crashes) 了解更多关于 Debezium 的消息投递语义。 +这也意味着,在非正常情况下,Debezium 可能会投递重复的变更事件到 Kafka 中,当 Flink 从 Kafka 中消费的时候就会得到重复的事件。 +这可能会导致 Flink query 的运行得到错误的结果或者非预期的异常。因此,建议在这种情况下,将作业参数 [`table.exec.source.cdc-events-duplicate`]({{< ref "docs/dev/table/config" >}}#table-exec-source-cdc-events-duplicate) 设置成 `true`,并在该 source 上定义 PRIMARY KEY。 +框架会生成一个额外的有状态算子,使用该 primary key 来对变更事件去重并生成一个规范化的 changelog 流。 + +### 消费 Debezium Postgres Connector 产生的数据 + +如果你正在使用 [Debezium PostgreSQL Connector](https://debezium.io/documentation/reference/1.2/connectors/postgresql.html) 捕获变更到 Kafka,请确保被监控表的 [REPLICA IDENTITY](https://www.postgresql.org/docs/current/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY) 已经被配置成 `FULL` 了,默认值是 `DEFAULT`。 +否则,Flink SQL 将无法正确解析 Debezium 数据。 + +当配置为 `FULL` 时,更新和删除事件将完整包含所有列的之前的值。当为其他配置时,更新和删除事件的 "before" 字段将只包含 primary key 字段的值,或者为 null(没有 primary key)。 +你可以通过运行 `ALTER TABLE REPLICA IDENTITY FULL` 来更改 `REPLICA IDENTITY` 的配置。 +请阅读 [Debezium 关于 PostgreSQL REPLICA IDENTITY 的文档](https://debezium.io/documentation/reference/1.2/connectors/postgresql.html#postgresql-replica-identity) 了解更多。 + +数据类型映射 +---------------- + +目前,Debezium Format 使用 JSON Format 进行序列化和反序列化。有关数据类型映射的更多详细信息,请参考 [JSON Format 文档]({{< ref "docs/connectors/table/formats/json" >}}#data-type-mapping) 和 [Confluent Avro Format 文档]({{< ref "docs/connectors/table/formats/avro-confluent" >}}#data-type-mapping)。 + diff --git a/docs/content.zh/docs/connectors/table/formats/json.md b/docs/content.zh/docs/connectors/table/formats/json.md new file mode 100644 index 0000000000000..f7e3040de1fcc --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/json.md @@ -0,0 +1,231 @@ +--- +title: JSON +weight: 3 +type: docs +aliases: + - /zh/dev/table/connectors/formats/json.html +--- + + +# JSON Format + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[JSON](https://www.json.org/json-en.html) Format 能读写 JSON 格式的数据。当前,JSON schema 是从 table schema 中自动推导而得的。 + +依赖 +------------ + +{{< sql_download_table "json" >}} + +如何创建一张基于 JSON Format 的表 +---------------- + +以下是一个利用 Kafka 以及 JSON Format 构建表的例子。 + +```sql +CREATE TABLE user_behavior ( + user_id BIGINT, + item_id BIGINT, + category_id BIGINT, + behavior STRING, + ts TIMESTAMP(3) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'json', + 'json.fail-on-missing-field' = 'false', + 'json.ignore-parse-errors' = 'true' +) +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必须默认值类型描述
    format
    必选(none)String声明使用的格式,这里应为'json'
    json.fail-on-missing-field
    可选falseBoolean当解析字段缺失时,是跳过当前字段或行,还是抛出错误失败(默认为 false,即抛出错误失败)。
    json.ignore-parse-errors
    可选falseBoolean当解析异常时,是跳过当前字段或行,还是抛出错误失败(默认为 false,即抛出错误失败)。如果忽略字段的解析异常,则会将该字段值设置为null
    json.timestamp-format.standard
    可选'SQL'String声明输入和输出的 TIMESTAMPTIMESTAMP_LTZ 的格式。当前支持的格式为'SQL' 以及 'ISO-8601': +
      +
    • 可选参数 'SQL' 将会以 "yyyy-MM-dd HH:mm:ss.s{precision}" 的格式解析 TIMESTAMP, 例如 "2020-12-30 12:13:14.123", + 以 "yyyy-MM-dd HH:mm:ss.s{precision}'Z'" 的格式解析 TIMESTAMP_LTZ, 例如 "2020-12-30 12:13:14.123Z" 且会以相同的格式输出。
    • +
    • 可选参数 'ISO-8601' 将会以 "yyyy-MM-ddTHH:mm:ss.s{precision}" 的格式解析输入 TIMESTAMP, 例如 "2020-12-30T12:13:14.123" , + 以 "yyyy-MM-ddTHH:mm:ss.s{precision}'Z'" 的格式解析 TIMESTAMP_LTZ, 例如 "2020-12-30T12:13:14.123Z" 且会以相同的格式输出。
    • +
    +
    json.map-null-key.mode
    选填'FAIL'String指定处理 Map 中 key 值为空的方法. 当前支持的值有 'FAIL', 'DROP''LITERAL': +
      +
• Option 'FAIL' 在遇到 Map 中 key 值为空的数据时,将抛出异常。
    • +
    • Option 'DROP' 将丢弃 Map 中 key 值为空的数据项。
    • +
    • Option 'LITERAL' 将使用字符串常量来替换 Map 中的空 key 值。字符串常量的值由 'json.map-null-key.literal' 定义。
    • +
    +
    json.map-null-key.literal
    选填'null'String'json.map-null-key.mode' 是 LITERAL 的时候,指定字符串常量替换 Map 中的空 key 值。
    json.encode.decimal-as-plain-number
    选填falseBoolean将所有 DECIMAL 类型的数据保持原状,不使用科学计数法表示。例:0.000000027 默认会表示为 2.7E-8。当此选项设为 true 时,则会表示为 0.000000027
    + +数据类型映射关系 +---------------- + +当前,JSON schema 将会自动从 table schema 之中自动推导得到。不支持显式地定义 JSON schema。 + +在 Flink 中,JSON Format 使用 [jackson databind API](https://github.com/FasterXML/jackson-databind) 去解析和生成 JSON。 + +下表列出了 Flink 中的数据类型与 JSON 中的数据类型的映射关系。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink SQL 类型JSON 类型
    CHAR / VARCHAR / STRINGstring
    BOOLEANboolean
    BINARY / VARBINARYstring with encoding: base64
    DECIMALnumber
    TINYINTnumber
    SMALLINTnumber
    INTnumber
    BIGINTnumber
    FLOATnumber
    DOUBLEnumber
    DATEstring with format: date
    TIMEstring with format: time
    TIMESTAMPstring with format: date-time
    TIMESTAMP_WITH_LOCAL_TIME_ZONEstring with format: date-time (with UTC time zone)
    INTERVALnumber
    ARRAYarray
    MAP / MULTISETobject
    ROWobject
    diff --git a/docs/content.zh/docs/connectors/table/formats/maxwell.md b/docs/content.zh/docs/connectors/table/formats/maxwell.md new file mode 100644 index 0000000000000..205b1259296f2 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/maxwell.md @@ -0,0 +1,204 @@ +--- +title: Maxwell +weight: 7 +type: docs +aliases: + - /zh/dev/table/connectors/formats/maxwell.html +--- + + +# Maxwell Format + +{{< label "Changelog-Data-Capture Format" >}} +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[Maxwell](https://maxwells-daemon.io/) is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into Kafka, Kinesis and other streaming connectors. Maxwell provides a unified format schema for changelog and supports to serialize messages using JSON. + +Flink supports to interpret Maxwell JSON messages as INSERT/UPDATE/DELETE messages into Flink SQL system. This is useful in many cases to leverage this feature, such as + - synchronizing incremental data from databases to other systems + - auditing logs + - real-time materialized views on databases + - temporal join changing history of a database table and so on. + +Flink also supports to encode the INSERT/UPDATE/DELETE messages in Flink SQL as Maxwell JSON messages, and emit to external systems like Kafka. +However, currently Flink can't combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, Flink encodes UPDATE_BEFORE and UDPATE_AFTER as DELETE and INSERT Maxwell messages. + +Dependencies +------------ + +{{< sql_download_table "maxwell" >}} + +*Note: please refer to [Maxwell documentation](http://maxwells-daemon.io/quickstart/) about how to synchronize changelog to Kafka topics with Maxwell JSON.* + + +How to use Maxwell format +---------------- + +Maxwell provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL `products` table in JSON format: + +```json +{ + "database":"test", + "table":"e", + "type":"insert", + "ts":1477053217, + "xid":23396, + "commit":true, + "position":"master.000006:800911", + "server_id":23042, + "thread_id":108, + "primary_key": [1, "2016-10-21 05:33:37.523000"], + "primary_key_columns": ["id", "c"], + "data":{ + "id":111, + "name":"scooter", + "description":"Big 2-wheel scooter", + "weight":5.15 + }, + "old":{ + "weight":5.18, + } +} +``` + +*Note: please refer to [Maxwell documentation](http://maxwells-daemon.io/dataformat/) about the meaning of each fields.* + +The MySQL `products` table has 4 columns (`id`, `name`, `description` and `weight`). The above JSON message is an update change event on the `products` table where the `weight` value of the row with `id = 111` is changed from `5.18` to `5.15`. +Assuming this messages is synchronized to Kafka topic `products_binlog`, then we can use the following DDL to consume this topic and interpret the change events. + +```sql +CREATE TABLE topic_products ( + -- schema is totally the same to the MySQL "products" table + id BIGINT, + name STRING, + description STRING, + weight DECIMAL(10, 2) +) WITH ( + 'connector' = 'kafka', + 'topic' = 'products_binlog', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'maxwell-json' +) +``` + +After registering the topic as a Flink table, then you can consume the Maxwell messages as a changelog source. 
+ +```sql +-- a real-time materialized view on the MySQL "products" +-- which calculate the latest average of weight for the same products +SELECT name, AVG(weight) FROM topic_products GROUP BY name; + +-- synchronize all the data and incremental changes of MySQL "products" table to +-- Elasticsearch "products" index for future searching +INSERT INTO elasticsearch_products +SELECT * FROM topic_products; +``` + +Format Options +---------------- + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    format
    required(none)StringSpecify what format to use, here should be 'maxwell-json'.
    maxwell-json.ignore-parse-errors
    optionalfalseBooleanSkip fields and rows with parse errors instead of failing. + Fields are set to null in case of errors.
    maxwell-json.timestamp-format.standard
    optional'SQL'StringSpecify the input and output timestamp format. Currently supported values are 'SQL' and 'ISO-8601': +
      +
• Option 'SQL' will parse input timestamps in "yyyy-MM-dd HH:mm:ss.s{precision}" format, e.g. '2020-12-30 12:13:14.123', and output timestamps in the same format.
    • +
• Option 'ISO-8601' will parse input timestamps in "yyyy-MM-ddTHH:mm:ss.s{precision}" format, e.g. '2020-12-30T12:13:14.123', and output timestamps in the same format.
    • +
    +
    maxwell-json.map-null-key.mode
    optional'FAIL'StringSpecify the handling mode when serializing null keys for map data. Currently supported values are 'FAIL', 'DROP' and 'LITERAL': +
      +
• Option 'FAIL' will throw an exception when encountering a map with a null key.
    • +
    • Option 'DROP' will drop null key entries for map data.
    • +
    • Option 'LITERAL' will replace null key with string literal. The string literal is defined by maxwell-json.map-null-key.literal option.
    • +
    +
    maxwell-json.map-null-key.literal
    optional'null'StringSpecify string literal to replace null key when 'maxwell-json.map-null-key.mode' is LITERAL.
    maxwell-json.encode.decimal-as-plain-number
optionalfalseBooleanEncode all decimals as plain numbers instead of scientific notation. By default, decimals may be written using scientific notation. For example, 0.000000027 is encoded as 2.7E-8 by default, and will be written as 0.000000027 if this option is set to true.
    + +
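As a quick illustration of how these options fit into a table definition, the sketch below extends the earlier `topic_products` DDL (the new table name and the `'n/a'` literal are purely illustrative) so that unparsable records are skipped on read and null map keys are replaced on write:

```sql
CREATE TABLE topic_products_lenient (
  id BIGINT,
  name STRING,
  description STRING,
  weight DECIMAL(10, 2)
) WITH (
  'connector' = 'kafka',
  'topic' = 'products_binlog',
  'properties.bootstrap.servers' = 'localhost:9092',
  'properties.group.id' = 'testGroup',
  'format' = 'maxwell-json',
  -- skip records that cannot be parsed instead of failing the job
  'maxwell-json.ignore-parse-errors' = 'true',
  -- when serializing map data, replace null keys with the literal 'n/a'
  'maxwell-json.map-null-key.mode' = 'LITERAL',
  'maxwell-json.map-null-key.literal' = 'n/a',
  -- emit decimals such as 0.000000027 as plain numbers rather than 2.7E-8
  'maxwell-json.encode.decimal-as-plain-number' = 'true'
)
```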
    + +Caveats +---------------- + +### Duplicate change events + +The Maxwell application allows to deliver every change event **exactly-once**. Flink works pretty well when consuming Maxwell produced events in this situation. +If Maxwell application works in **at-least-once** delivery, it may deliver duplicate change events to Kafka and Flink will get the duplicate events. +This may cause Flink query to get wrong results or unexpected exceptions. Thus, it is recommended to set job configuration [`table.exec.source.cdc-events-duplicate`]({{< ref "docs/dev/table/config" >}}#table-exec-source-cdc-events-duplicate) to `true` and define PRIMARY KEY on the source in this situation. +Framework will generate an additional stateful operator, and use the primary key to deduplicate the change events and produce a normalized changelog stream. + +Data Type Mapping +---------------- + +Currently, the Maxwell format uses JSON for serialization and deserialization. Please refer to [JSON Format documentation]({{< ref "docs/connectors/table/formats/json" >}}#data-type-mapping) for more details about the data type mapping. diff --git a/docs/content.zh/docs/connectors/table/formats/orc.md b/docs/content.zh/docs/connectors/table/formats/orc.md new file mode 100644 index 0000000000000..57877f0609777 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/orc.md @@ -0,0 +1,173 @@ +--- +title: Orc +weight: 9 +type: docs +aliases: + - /zh/dev/table/connectors/formats/orc.html +--- + + +# Orc Format + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + + +[Apache Orc](https://orc.apache.org/) Format 允许读写 ORC 数据。 + +依赖 +------------ + +{{< sql_download_table "orc" >}} + + +如何用 Orc 格式创建一个表格 +---------------- + +下面是一个用 Filesystem connector 和 Orc format 创建表格的例子 + +```sql +CREATE TABLE user_behavior ( + user_id BIGINT, + item_id BIGINT, + category_id BIGINT, + behavior STRING, + ts TIMESTAMP(3), + dt STRING +) PARTITIONED BY (dt) WITH ( + 'connector' = 'filesystem', + 'path' = '/tmp/user_behavior', + 'format' = 'orc' +) +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值类型描述
    format
    必选(none)String指定要使用的格式,这里应该是 'orc'。
    + +Orc 格式也支持来源于 [Table properties](https://orc.apache.org/docs/hive-config.html#table-properties) 的表属性。 举个例子,你可以设置 `orc.compress=SNAPPY` 来允许spappy压缩。 + +数据类型映射 +---------------- + +Orc 格式类型的映射和 Apache Hive 是兼容的。下面的表格列出了 Flink 类型的数据和 Orc 类型的数据的映射关系。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink 数据类型Orc 物理类型Orc 逻辑类型
    CHARbytesCHAR
    VARCHARbytesVARCHAR
    STRINGbytesSTRING
    BOOLEANlongBOOLEAN
    BYTESbytesBINARY
    DECIMALdecimalDECIMAL
    TINYINTlongBYTE
    SMALLINTlongSHORT
    INTlongINT
    BIGINTlongLONG
    FLOATdoubleFLOAT
    DOUBLEdoubleDOUBLE
    DATElongDATE
    TIMESTAMPtimestampTIMESTAMP
    + +注意 复合数据类型: 数组、 映射和行类型暂不支持。 diff --git a/docs/content.zh/docs/connectors/table/formats/overview.md b/docs/content.zh/docs/connectors/table/formats/overview.md new file mode 100644 index 0000000000000..ebb1da703b56f --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/overview.md @@ -0,0 +1,99 @@ +--- +title: "Formats" +weight: 1 +type: docs +aliases: + - /dev/table/connectors/formats/ +--- + + +# Formats + +Flink 提供了一套与表连接器(table connector)一起使用的表格式(table format)。表格式是一种存储格式,定义了如何把二进制数据映射到表的列上。 + +Flink 支持以下格式: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FormatsSupported Connectors
    }}">CSV}}">Apache Kafka, + }}">Upsert Kafka, + }}">Amazon Kinesis Data Streams, + }}">Filesystem
    }}">JSON}}">Apache Kafka, + }}">Upsert Kafka, + }}">Amazon Kinesis Data Streams, + }}">Filesystem, + }}">Elasticsearch
    }}">Apache Avro}}">Apache Kafka, + }}">Upsert Kafka, + }}">Amazon Kinesis Data Streams, + }}">Filesystem
    }}">Confluent Avro}}">Apache Kafka, + }}">Upsert Kafka
    }}">Debezium CDC}}">Apache Kafka, + }}">Filesystem
    }}">Canal CDC}}">Apache Kafka, + }}">Filesystem
    }}">Maxwell CDC}}">Apache Kafka, + }}">Filesystem
    }}">Apache Parquet}}">Filesystem
    }}">Apache ORC}}">Filesystem
    }}">Raw}}">Apache Kafka, + }}">Upsert Kafka, + }}">Amazon Kinesis Data Streams, + }}">Filesystem
    diff --git a/docs/content.zh/docs/connectors/table/formats/parquet.md b/docs/content.zh/docs/connectors/table/formats/parquet.md new file mode 100644 index 0000000000000..a736770ad058f --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/parquet.md @@ -0,0 +1,180 @@ +--- +title: Parquet +weight: 8 +type: docs +aliases: + - /zh/dev/table/connectors/formats/parquet.html +--- + + +# Parquet 格式 + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +[Apache Parquet](https://parquet.apache.org/) 格式允许读写 Parquet 数据. + +依赖 +------------ + +{{< sql_download_table "parquet" >}} + +如何创建基于 Parquet 格式的表 +---------------- + +以下为用 Filesystem 连接器和 Parquet 格式创建表的示例, + +```sql +CREATE TABLE user_behavior ( + user_id BIGINT, + item_id BIGINT, + category_id BIGINT, + behavior STRING, + ts TIMESTAMP(3), + dt STRING +) PARTITIONED BY (dt) WITH ( + 'connector' = 'filesystem', + 'path' = '/tmp/user_behavior', + 'format' = 'parquet' +) +``` + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必须默认值类型描述
    format
    必选(none)String指定使用的格式,此处应为"parquet"。
    parquet.utc-timezone
    可选falseBoolean使用 UTC 时区或本地时区在纪元时间和 LocalDateTime 之间进行转换。Hive 0.x/1.x/2.x 使用本地时区,但 Hive 3.x 使用 UTC 时区。
    + +Parquet 格式也支持 [ParquetOutputFormat](https://www.javadoc.io/doc/org.apache.parquet/parquet-hadoop/1.10.0/org/apache/parquet/hadoop/ParquetOutputFormat.html) 的配置。 +例如, 可以配置 `parquet.compression=GZIP` 来开启 gzip 压缩。 + +数据类型映射 +---------------- + +目前,Parquet 格式类型映射与 Apache Hive 兼容,但与 Apache Spark 有所不同: + +- Timestamp:不论精度,映射 timestamp 类型至 int96。 +- Decimal:根据精度,映射 decimal 类型至固定长度字节的数组。 + +下表列举了 Flink 中的数据类型与 JSON 中的数据类型的映射关系。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink 数据类型Parquet 类型Parquet 逻辑类型
    CHAR / VARCHAR / STRINGBINARYUTF8
    BOOLEANBOOLEAN
    BINARY / VARBINARYBINARY
    DECIMALFIXED_LEN_BYTE_ARRAYDECIMAL
    TINYINTINT32INT_8
    SMALLINTINT32INT_16
    INTINT32
    BIGINTINT64
    FLOATFLOAT
    DOUBLEDOUBLE
    DATEINT32DATE
    TIMEINT32TIME_MILLIS
    TIMESTAMPINT96
    + +注意 暂不支持复合数据类型(Array、Map 与 Row)。 diff --git a/docs/content.zh/docs/connectors/table/formats/raw.md b/docs/content.zh/docs/connectors/table/formats/raw.md new file mode 100644 index 0000000000000..5bd721c1498e7 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/formats/raw.md @@ -0,0 +1,172 @@ +--- +title: Raw +weight: 10 +type: docs +aliases: + - /zh/dev/table/connectors/formats/raw.html +--- + + +# Raw Format + +{{< label "Format: Serialization Schema" >}} +{{< label "Format: Deserialization Schema" >}} + +Raw format 允许读写原始(基于字节)值作为单个列。 + +注意: 这种格式将 `null` 值编码成 `byte[]` 类型的 `null`。这样在 `upsert-kafka` 中使用时可能会有限制,因为 `upsert-kafka` 将 `null` 值视为 墓碑消息(在键上删除)。因此,如果该字段可能具有 `null` 值,我们建议避免使用 `upsert-kafka` 连接器和 `raw` format 作为 `value.format`。 + +Raw format 连接器是内置的。 + +示例 +---------------- + +例如,你可能在 Kafka 中具有原始日志数据,并希望使用 Flink SQL 读取和分析此类数据。 + +``` +47.29.201.179 - - [28/Feb/2019:13:17:10 +0000] "GET /?p=1 HTTP/2.0" 200 5316 "https://domain.com/?p=1" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36" "2.75" +``` + +下面的代码创建了一张表,使用 `raw` format 以 UTF-8 编码的形式从中读取(也可以写入)底层的 Kafka topic 作为匿名字符串值: + +```sql +CREATE TABLE nginx_log ( + log STRING +) WITH ( + 'connector' = 'kafka', + 'topic' = 'nginx_log', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'format' = 'raw' +) +``` + +然后,你可以将原始数据读取为纯字符串,之后使用用户自定义函数将其分为多个字段进行进一步分析。例如 示例中的 `my_split`。 + +```sql +SELECT t.hostname, t.datetime, t.url, t.browser, ... +FROM( + SELECT my_split(log) as t FROM nginx_log +); +``` + +相对应的,你也可以将一个 STRING 类型的列以 UTF-8 编码的匿名字符串值写入 Kafka topic。 + +Format 参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值类型描述
    format
    必选(none)String指定要使用的格式, 这里应该是 'raw'。
    raw.charset
    可选UTF-8String指定字符集来编码文本字符串。
    raw.endianness
    可选big-endianString指定字节序来编码数字值的字节。有效值为'big-endian'和'little-endian'。 + 更多细节可查阅 字节序
    + +数据类型映射 +---------------- + +下表详细说明了这种格式支持的 SQL 类型,包括用于编码和解码的序列化类和反序列化类的详细信息。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink SQL 类型
    CHAR / VARCHAR / STRINGUTF-8(默认)编码的文本字符串。
    + 编码字符集可以通过 'raw.charset' 进行配置。
    BINARY / VARBINARY / BYTES字节序列本身。
    BOOLEAN表示布尔值的单个字节,0表示 false, 1 表示 true。
    TINYINT有符号数字值的单个字节。
SMALLINT采用 big-endian (默认)编码的两个字节。
    + 字节序可以通过 'raw.endianness' 配置。
    INT采用 big-endian (默认)编码的四个字节。
    + 字节序可以通过 'raw.endianness' 配置。
    BIGINT采用 big-endian (默认)编码的八个字节。
    + 字节序可以通过 'raw.endianness' 配置。
    FLOAT采用 IEEE 754 格式和 big-endian (默认)编码的四个字节。
    + 字节序可以通过 'raw.endianness' 配置。
    DOUBLE采用 IEEE 754 格式和 big-endian (默认)编码的八个字节。
    + 字节序可以通过 'raw.endianness' 配置。
    RAW通过 RAW 类型的底层 TypeSerializer 序列化的字节序列。
    + diff --git a/docs/content.zh/docs/connectors/table/hbase.md b/docs/content.zh/docs/connectors/table/hbase.md new file mode 100644 index 0000000000000..930c134619b36 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hbase.md @@ -0,0 +1,336 @@ +--- +title: HBase +weight: 9 +type: docs +aliases: + - /zh/dev/table/connectors/hbase.html +--- + + +# HBase SQL 连接器 + +{{< label "Scan Source: Bounded" >}} +{{< label "Lookup Source: Sync Mode" >}} +{{< label "Sink: Batch" >}} +{{< label "Sink: Streaming Upsert Mode" >}} + +HBase 连接器支持读取和写入 HBase 集群。本文档介绍如何使用 HBase 连接器基于 HBase 进行 SQL 查询。 + +HBase 连接器在 upsert 模式下运行,可以使用 DDL 中定义的主键与外部系统交换更新操作消息。但是主键只能基于 HBase 的 rowkey 字段定义。如果没有声明主键,HBase 连接器默认取 rowkey 作为主键。 + +依赖 +------------ + +{{< sql_download_table "hbase" >}} + + +如何使用 HBase 表 +---------------- + +所有 HBase 表的列簇必须定义为 ROW 类型,字段名对应列簇名(column family),嵌套的字段名对应列限定符名(column qualifier)。用户只需在表结构中声明查询中使用的的列簇和列限定符。除了 ROW 类型的列,剩下的原子数据类型字段(比如,STRING, BIGINT)将被识别为 HBase 的 rowkey,一张表中只能声明一个 rowkey。rowkey 字段的名字可以是任意的,如果是保留关键字,需要用反引号。 + +```sql +-- 在 Flink SQL 中注册 HBase 表 "mytable" +CREATE TABLE hTable ( + rowkey INT, + family1 ROW, + family2 ROW, + family3 ROW, + PRIMARY KEY (rowkey) NOT ENFORCED +) WITH ( + 'connector' = 'hbase-1.4', + 'table-name' = 'mytable', + 'zookeeper.quorum' = 'localhost:2181' +); + +-- 用 ROW(...) 构造函数构造列簇,并往 HBase 表写数据。 +-- 假设 "T" 的表结构是 [rowkey, f1q1, f2q2, f2q3, f3q4, f3q5, f3q6] +INSERT INTO hTable +SELECT rowkey, ROW(f1q1), ROW(f2q2, f2q3), ROW(f3q4, f3q5, f3q6) FROM T; + +-- 从 HBase 表扫描数据 +SELECT rowkey, family1, family3.q4, family3.q6 FROM hTable; + +-- temporal join HBase 表,将 HBase 表作为维表 +SELECT * FROM myTopic +LEFT JOIN hTable FOR SYSTEM_TIME AS OF myTopic.proctime +ON myTopic.key = hTable.rowkey; +``` + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值数据类型描述
    connector
    必选(none)String指定使用的连接器, 支持的值如下 : +
      +
    • hbase-1.4: 连接 HBase 1.4.x 集群
    • +
    • hbase-2.2: 连接 HBase 2.2.x 集群
    • +
    +
    table-name
    必选(none)String连接的 HBase 表名。
    zookeeper.quorum
    必选(none)StringHBase Zookeeper quorum 信息。
    zookeeper.znode.parent
    可选/hbaseStringHBase 集群的 Zookeeper 根目录。
    null-string-literal
可选nullString当字符串值为 null 时的存储形式,默认存成 "null" 字符串。HBase 的 source 和 sink 编解码器会将所有数据类型(除字符串外)的 null 值以空字节来存储。
    sink.buffer-flush.max-size
    可选2mbMemorySize写入的参数选项。每次写入请求缓存行的最大大小。它能提升写入 HBase 数据库的性能,但是也可能增加延迟。设置为 "0" 关闭此选项。 +
    sink.buffer-flush.max-rows
    可选1000Integer写入的参数选项。 每次写入请求缓存的最大行数。它能提升写入 HBase 数据库的性能,但是也可能增加延迟。设置为 "0" 关闭此选项。 +
    sink.buffer-flush.interval
可选1sDuration写入的参数选项。刷写缓存行的间隔。它能提升写入 HBase 数据库的性能,但是也可能增加延迟。设置为 "0" 关闭此选项。注意:当 "sink.buffer-flush.max-size" 和 "sink.buffer-flush.max-rows" 同时设置为 "0" 时,可以仅依靠刷写间隔对缓存的写入操作进行完全异步的处理。 +
    sink.parallelism
    可选(none)Integer为 HBase sink operator 定义并行度。默认情况下,并行度由框架决定,和链在一起的上游 operator 一样。
    lookup.async
    可选falseBoolean是否启用异步查找。如果为真,查找将是异步的。注意:异步方式只支持 hbase-2.2 连接器
    lookup.cache.max-rows
    可选(无)Integer查找缓存的最大行数,超过这个值,最旧的行将过期。注意:"lookup.cache.max-rows" 和 "lookup.cache.ttl" 必须同时被设置。默认情况下,查找缓存是禁用的。
    lookup.cache.ttl
    可选(无)Duration查找缓存中每一行的最大生存时间,在这段时间内,最老的行将过期。注意:"lookup.cache.max-rows" 和 "lookup.cache.ttl" 必须同时被设置。默认情况下,查找缓存是禁用的。
    lookup.max-retries
    可选3Integer查找数据库失败时的最大重试次数。
    properties.*
    可选(无)String + 可以设置任意 HBase 的配置项。后缀名必须匹配在 HBase 配置文档 中定义的配置键。Flink 将移除 "properties." 配置键前缀并将变换后的配置键和值传入底层的 HBase 客户端。 + 例如您可以设置 'properties.hbase.security.authentication' = 'kerberos' 等kerberos认证参数。 +
    + + + +数据类型映射表 +---------------- + +HBase 以字节数组存储所有数据。在读和写过程中要序列化和反序列化数据。 + +Flink 的 HBase 连接器利用 HBase(Hadoop) 的工具类 `org.apache.hadoop.hbase.util.Bytes` 进行字节数组和 Flink 数据类型转换。 + +Flink 的 HBase 连接器将所有数据类型(除字符串外)`null` 值编码成空字节。对于字符串类型,`null` 值的字面值由`null-string-literal`选项值决定。 + +数据类型映射表如下: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink 数据类型HBase 转换
    CHAR / VARCHAR / STRING +{{< highlight "java" >}} +byte[] toBytes(String s) +String toString(byte[] b) +{{< /highlight >}} +
    BOOLEAN +{{< highlight "java" >}} +byte[] toBytes(boolean b) +boolean toBoolean(byte[] b) +{{< /highlight >}} +
    BINARY / VARBINARY返回 byte[]
    DECIMAL +{{< highlight "java" >}} +byte[] toBytes(BigDecimal v) +BigDecimal toBigDecimal(byte[] b) +{{< /highlight >}} +
    TINYINT +{{< highlight "java" >}} +new byte[] { val } +bytes[0] // returns first and only byte from bytes +{{< /highlight >}} +
    SMALLINT +{{< highlight "java" >}} +byte[] toBytes(short val) +short toShort(byte[] bytes) +{{< /highlight >}} +
    INT +{{< highlight "java" >}} +byte[] toBytes(int val) +int toInt(byte[] bytes) +{{< /highlight >}} +
    BIGINT +{{< highlight "java" >}} +byte[] toBytes(long val) +long toLong(byte[] bytes) +{{< /highlight >}} +
    FLOAT +{{< highlight "java" >}} +byte[] toBytes(float val) +float toFloat(byte[] bytes) +{{< /highlight >}} +
    DOUBLE +{{< highlight "java" >}} +byte[] toBytes(double val) +double toDouble(byte[] bytes) +{{< /highlight >}} +
    DATE从 1970-01-01 00:00:00 UTC 开始的天数,int 值。
TIME一天中的毫秒数(自当天 00:00:00 起),int 值。
    TIMESTAMP从 1970-01-01 00:00:00 UTC 开始的毫秒数,long 值。
    ARRAY不支持
    MAP / MULTISET不支持
    ROW不支持
    + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/hive/_index.md b/docs/content.zh/docs/connectors/table/hive/_index.md new file mode 100644 index 0000000000000..615740adb5005 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/_index.md @@ -0,0 +1,23 @@ +--- +title: Hive +bookCollapseSection: true +weight: 16 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/connectors/table/hive/hive_catalog.md b/docs/content.zh/docs/connectors/table/hive/hive_catalog.md new file mode 100644 index 0000000000000..0353a1be80bfc --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/hive_catalog.md @@ -0,0 +1,397 @@ +--- +title: "Hive Catalog" +weight: 2 +type: docs +aliases: + - /zh/dev/table/connectors/hive/hive_catalog.html +--- + + +# Hive Catalog + +Hive Metastore has evolved into the de facto metadata hub over the years in Hadoop ecosystem. Many companies have a single +Hive Metastore service instance in their production to manage all of their metadata, either Hive metadata or non-Hive metadata, + as the source of truth. + +For users who have both Hive and Flink deployments, `HiveCatalog` enables them to use Hive Metastore to manage Flink's metadata. + +For users who have just Flink deployment, `HiveCatalog` is the only persistent catalog provided out-of-box by Flink. +Without a persistent catalog, users using [Flink SQL CREATE DDL]({{< ref "docs/dev/table/sql/create" >}}) have to repeatedly +create meta-objects like a Kafka table in each session, which wastes a lot of time. `HiveCatalog` fills this gap by empowering +users to create tables and other meta-objects only once, and reference and manage them with convenience later on across sessions. + + +## Set up HiveCatalog + +### Dependencies + +Setting up a `HiveCatalog` in Flink requires the same [dependencies]({{< ref "docs/connectors/table/hive/overview" >}}#dependencies) +as those of an overall Flink-Hive integration. + +### Configuration + +Setting up a `HiveCatalog` in Flink requires the same [configuration]({{< ref "docs/connectors/table/hive/overview" >}}#connecting-to-hive) +as those of an overall Flink-Hive integration. + + +## How to use HiveCatalog + +Once configured properly, `HiveCatalog` should just work out of box. Users can create Flink meta-objects with DDL, and should +see them immediately afterwards. + +`HiveCatalog` can be used to handle two kinds of tables: Hive-compatible tables and generic tables. Hive-compatible tables +are those stored in a Hive-compatible way, in terms of both metadata and data in the storage layer. Therefore, Hive-compatible tables +created via Flink can be queried from Hive side. + +Generic tables, on the other hand, are specific to Flink. When creating generic tables with `HiveCatalog`, we're just using +HMS to persist the metadata. While these tables are visible to Hive, it's unlikely Hive is able to understand +the metadata. And therefore using such tables in Hive leads to undefined behavior. + +Flink uses the property '*is_generic*' to tell whether a table is Hive-compatible or generic. When creating a table with +`HiveCatalog`, it's by default considered generic. If you'd like to create a Hive-compatible table, make sure to set +`is_generic` to false in your table properties. + +As stated above, generic tables shouldn't be used from Hive. In Hive CLI, you can call `DESCRIBE FORMATTED` for a table and +decide whether it's generic or not by checking the `is_generic` property. Generic tables will have `is_generic=true`. 
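For example, a minimal sketch of a Hive-compatible table created from Flink SQL (the table name and columns below are made up for illustration, and the session's current catalog is assumed to be a `HiveCatalog`) only differs from a generic table by the explicit property:

```sql
-- stored in a Hive-compatible way, so the table can also be queried from Hive
CREATE TABLE hive_compatible_orders (
  order_id BIGINT,
  amount DECIMAL(10, 2)
) WITH (
  'is_generic' = 'false'
);
```

Omitting the property keeps the table generic, which is the appropriate choice when the table is only ever read and written from Flink.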
+ +### Example + +We will walk through a simple example here. + +#### step 1: set up a Hive Metastore + +Have a Hive Metastore running. + +Here, we set up a local Hive Metastore and our `hive-site.xml` file in local path `/opt/hive-conf/hive-site.xml`. +We have some configs like the following: + +```xml + + + + javax.jdo.option.ConnectionURL + jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true + metadata is stored in a MySQL server + + + + javax.jdo.option.ConnectionDriverName + com.mysql.jdbc.Driver + MySQL JDBC driver class + + + + javax.jdo.option.ConnectionUserName + ... + user name for connecting to mysql server + + + + javax.jdo.option.ConnectionPassword + ... + password for connecting to mysql server + + + + hive.metastore.uris + thrift://localhost:9083 + IP address (or fully-qualified domain name) and port of the metastore host + + + + hive.metastore.schema.verification + true + + + +``` + + +Test connection to the HMS with Hive Cli. Running some commands, we can see we have a database named `default` and there's no table in it. + + +```bash + +hive> show databases; +OK +default +Time taken: 0.032 seconds, Fetched: 1 row(s) + +hive> show tables; +OK +Time taken: 0.028 seconds, Fetched: 0 row(s) +``` + + +#### step 2: configure Flink cluster and SQL CLI + +Add all Hive dependencies to `/lib` dir in Flink distribution, and modify SQL CLI's yaml config file `sql-cli-defaults.yaml` as following: + +```yaml + +execution: + planner: blink + type: streaming + ... + current-catalog: myhive # set the HiveCatalog as the current catalog of the session + current-database: mydatabase + +catalogs: + - name: myhive + type: hive + hive-conf-dir: /opt/hive-conf # contains hive-site.xml +``` + + +#### step 3: set up a Kafka cluster + +Bootstrap a local Kafka 2.3.0 cluster with a topic named "test", and produce some simple data to the topic as tuple of name and age. + +```bash + +localhost$ bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test +>tom,15 +>john,21 + +``` + + +These message can be seen by starting a Kafka console consumer. + +```bash +localhost$ bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic test --from-beginning + +tom,15 +john,21 + +``` + + +#### step 4: start SQL Client, and create a Kafka table with Flink SQL DDL + +Start Flink SQL Client, create a simple Kafka 2.3.0 table via DDL, and verify its schema. + +```bash + +Flink SQL> CREATE TABLE mykafka (name String, age Int) WITH ( + 'connector.type' = 'kafka', + 'connector.version' = 'universal', + 'connector.topic' = 'test', + 'connector.properties.bootstrap.servers' = 'localhost:9092', + 'format.type' = 'csv', + 'update-mode' = 'append' +); +[INFO] Table has been created. + +Flink SQL> DESCRIBE mykafka; +root + |-- name: STRING + |-- age: INT + +``` + +Verify the table is also visible to Hive via Hive Cli, and note that the table has property `is_generic=true`: + +```bash +hive> show tables; +OK +mykafka +Time taken: 0.038 seconds, Fetched: 1 row(s) + +hive> describe formatted mykafka; +OK +# col_name data_type comment + + +# Detailed Table Information +Database: default +Owner: null +CreateTime: ...... +LastAccessTime: UNKNOWN +Retention: 0 +Location: ...... 
+Table Type: MANAGED_TABLE +Table Parameters: + flink.connector.properties.bootstrap.servers localhost:9092 + flink.connector.topic test + flink.connector.type kafka + flink.connector.version universal + flink.format.type csv + flink.generic.table.schema.0.data-type VARCHAR(2147483647) + flink.generic.table.schema.0.name name + flink.generic.table.schema.1.data-type INT + flink.generic.table.schema.1.name age + flink.update-mode append + is_generic true + transient_lastDdlTime ...... + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe +InputFormat: org.apache.hadoop.mapred.TextInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +Time taken: 0.158 seconds, Fetched: 36 row(s) + +``` + + +#### step 5: run Flink SQL to query the Kafka table + +Run a simple select query from Flink SQL Client in a Flink cluster, either standalone or yarn-session. + +```bash +Flink SQL> select * from mykafka; + +``` + + +Produce some more messages in the Kafka topic + +```bash +localhost$ bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic test --from-beginning + +tom,15 +john,21 +kitty,30 +amy,24 +kaiky,18 + +``` + + +You should see results produced by Flink in SQL Client now, as: + + +```bash + SQL Query Result (Table) + Refresh: 1 s Page: Last of 1 + + name age + tom 15 + john 21 + kitty 30 + amy 24 + kaiky 18 + +``` + +## Supported Types + +`HiveCatalog` supports all Flink types for generic tables. + +For Hive-compatible tables, `HiveCatalog` needs to map Flink data types to corresponding Hive types as described in +the following table: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Flink Data TypeHive Data Type
    CHAR(p)CHAR(p)
    VARCHAR(p)VARCHAR(p)
    STRINGSTRING
    BOOLEANBOOLEAN
    TINYINTTINYINT
    SMALLINTSMALLINT
    INTINT
    BIGINTLONG
    FLOATFLOAT
    DOUBLEDOUBLE
    DECIMAL(p, s)DECIMAL(p, s)
    DATEDATE
    TIMESTAMP(9)TIMESTAMP
    BYTESBINARY
    ARRAY<T>LIST<T>
    MAPMAP
    ROWSTRUCT
    + +Something to note about the type mapping: +* Hive's `CHAR(p)` has a maximum length of 255 +* Hive's `VARCHAR(p)` has a maximum length of 65535 +* Hive's `MAP` only supports primitive key types while Flink's `MAP` can be any data type +* Hive's `UNION` type is not supported +* Hive's `TIMESTAMP` always has precision 9 and doesn't support other precisions. Hive UDFs, on the other hand, can process `TIMESTAMP` values with a precision <= 9. +* Hive doesn't support Flink's `TIMESTAMP_WITH_TIME_ZONE`, `TIMESTAMP_WITH_LOCAL_TIME_ZONE`, and `MULTISET` +* Flink's `INTERVAL` type cannot be mapped to Hive `INTERVAL` type yet + +## Scala Shell + +NOTE: since blink planner is not well supported in Scala Shell at the moment, it's **NOT** recommended to use Hive connector in Scala Shell. diff --git a/docs/content.zh/docs/connectors/table/hive/hive_dialect.md b/docs/content.zh/docs/connectors/table/hive/hive_dialect.md new file mode 100644 index 0000000000000..9840494a4c365 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/hive_dialect.md @@ -0,0 +1,419 @@ +--- +title: "Hive 方言" +weight: 3 +type: docs +aliases: + - /zh/dev/table/connectors/hive/hive_dialect.html +--- + + +# Hive 方言 + +从 1.11.0 开始,在使用 Hive 方言时,Flink 允许用户用 Hive 语法来编写 SQL 语句。通过提供与 Hive 语法的兼容性,我们旨在改善与 Hive 的互操作性,并减少用户需要在 Flink 和 Hive 之间切换来执行不同语句的情况。 + +## 使用 Hive 方言 + +Flink 目前支持两种 SQL 方言: `default` 和 `hive`。你需要先切换到 Hive 方言,然后才能使用 Hive 语法编写。下面介绍如何使用 SQL 客户端和 Table API 设置方言。 +还要注意,你可以为执行的每个语句动态切换方言。无需重新启动会话即可使用其他方言。 + +### SQL 客户端 + +SQL 方言可以通过 `table.sql-dialect` 属性指定。因此你可以通过 SQL 客户端 yaml 文件中的 `configuration` 部分来设置初始方言。 + +```yaml + +execution: + planner: blink + type: batch + result-mode: table + +configuration: + table.sql-dialect: hive + +``` + +你同样可以在 SQL 客户端启动后设置方言。 + +```bash + +Flink SQL> set table.sql-dialect=hive; -- to use hive dialect +[INFO] Session property has been set. + +Flink SQL> set table.sql-dialect=default; -- to use default dialect +[INFO] Session property has been set. 
+ +``` + +### Table API + +你可以使用 Table API 为 TableEnvironment 设置方言。 + +{{< tabs "82a7968d-df12-4db2-83ab-16f09b263935" >}} +{{< tab "Java" >}} +```java + +EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner()...build(); +TableEnvironment tableEnv = TableEnvironment.create(settings); +// to use hive dialect +tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE); +// to use default dialect +tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT); + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.table import * + +settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build() +t_env = TableEnvironment.create(settings) + +# to use hive dialect +t_env.get_config().set_sql_dialect(SqlDialect.HIVE) +# to use default dialect +t_env.get_config().set_sql_dialect(SqlDialect.DEFAULT) + +``` +{{< /tab >}} +{{< /tabs >}} + +## DDL + +本章节列出了 Hive 方言支持的 DDL 语句。我们主要关注语法。你可以参考 [Hive 文档](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL) +了解每个 DDL 语句的语义。 + +### CATALOG + +#### Show + +```sql +SHOW CURRENT CATALOG; +``` + +### DATABASE + +#### Show + +```sql +SHOW DATABASES; +``` + +#### Create + +```sql +CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name + [COMMENT database_comment] + [LOCATION fs_path] + [WITH DBPROPERTIES (property_name=property_value, ...)]; +``` + +#### Alter + +##### Update Properties + +```sql +ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...); +``` + +##### Update Owner + +```sql +ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role; +``` + +##### Update Location + +```sql +ALTER (DATABASE|SCHEMA) database_name SET LOCATION fs_path; +``` + +#### Drop + +```sql +DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE]; +``` + +#### Use + +```sql +USE database_name; +``` + +### TABLE + +#### Show + +```sql +SHOW TABLES; +``` + +#### Create + +```sql +CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name + [(col_name data_type [column_constraint] [COMMENT col_comment], ... [table_constraint])] + [COMMENT table_comment] + [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] + [ + [ROW FORMAT row_format] + [STORED AS file_format] + ] + [LOCATION fs_path] + [TBLPROPERTIES (property_name=property_value, ...)] + +row_format: + : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char] + [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char] + [NULL DEFINED AS char] + | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, ...)] + +file_format: + : SEQUENCEFILE + | TEXTFILE + | RCFILE + | ORC + | PARQUET + | AVRO + | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname + +column_constraint: + : NOT NULL [[ENABLE|DISABLE] [VALIDATE|NOVALIDATE] [RELY|NORELY]] + +table_constraint: + : [CONSTRAINT constraint_name] PRIMARY KEY (col_name, ...) [[ENABLE|DISABLE] [VALIDATE|NOVALIDATE] [RELY|NORELY]] +``` + +#### Alter + +##### Rename + +```sql +ALTER TABLE table_name RENAME TO new_table_name; +``` + +##### Update Properties + +```sql +ALTER TABLE table_name SET TBLPROPERTIES (property_name = property_value, property_name = property_value, ... 
); +``` + +##### Update Location + +```sql +ALTER TABLE table_name [PARTITION partition_spec] SET LOCATION fs_path; +``` + +如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。 + +##### Update File Format + +```sql +ALTER TABLE table_name [PARTITION partition_spec] SET FILEFORMAT file_format; +``` + +如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。 + +##### Update SerDe Properties + +```sql +ALTER TABLE table_name [PARTITION partition_spec] SET SERDE serde_class_name [WITH SERDEPROPERTIES serde_properties]; + +ALTER TABLE table_name [PARTITION partition_spec] SET SERDEPROPERTIES serde_properties; + +serde_properties: + : (property_name = property_value, property_name = property_value, ... ) +``` + +如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。 + +##### Add Partitions + +```sql +ALTER TABLE table_name ADD [IF NOT EXISTS] (PARTITION partition_spec [LOCATION fs_path])+; +``` + +##### Drop Partitions + +```sql +ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...]; +``` + +##### Add/Replace Columns + +```sql +ALTER TABLE table_name + ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...) + [CASCADE|RESTRICT] +``` + +##### Change Column + +```sql +ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type + [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT]; +``` + +#### Drop + +```sql +DROP TABLE [IF EXISTS] table_name; +``` + +### VIEW + +#### Create + +```sql +CREATE VIEW [IF NOT EXISTS] view_name [(column_name, ...) ] + [COMMENT view_comment] + [TBLPROPERTIES (property_name = property_value, ...)] + AS SELECT ...; +``` + +#### Alter + +**注意**: 变更视图只在 Table API 中有效,SQL 客户端不支持。 + +##### Rename + +```sql +ALTER VIEW view_name RENAME TO new_view_name; +``` + +##### Update Properties + +```sql +ALTER VIEW view_name SET TBLPROPERTIES (property_name = property_value, ... 
); +``` + +##### Update As Select + +```sql +ALTER VIEW view_name AS select_statement; +``` + +#### Drop + +```sql +DROP VIEW [IF EXISTS] view_name; +``` + +### FUNCTION + +#### Show + +```sql +SHOW FUNCTIONS; +``` + +#### Create + +```sql +CREATE FUNCTION function_name AS class_name; +``` + +#### Drop + +```sql +DROP FUNCTION [IF EXISTS] function_name; +``` + +## DML & DQL _`Beta`_ + +Hive 方言支持常用的 Hive [DML](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML) +和 [DQL](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select) 。 下表列出了一些 Hive 方言支持的语法。 + +- [SORT/CLUSTER/DISTRIBUTE BY](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+SortBy) +- [Group By](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+GroupBy) +- [Join](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Joins) +- [Union](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Union) +- [LATERAL VIEW](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LateralView) +- [Window Functions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics) +- [SubQueries](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+SubQueries) +- [CTE](https://cwiki.apache.org/confluence/display/Hive/Common+Table+Expression) +- [INSERT INTO dest schema](https://issues.apache.org/jira/browse/HIVE-9481) +- [Implicit type conversions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-AllowedImplicitConversions) + +为了实现更好的语法和语义的兼容,强烈建议使用 [HiveModule]({{< ref "docs/connectors/table/hive/hive_functions" >}}#use-hive-built-in-functions-via-hivemodule) +并将其放在 Module 列表的首位,以便在函数解析时优先使用 Hive 内置函数。 + +Hive 方言不再支持 [Flink SQL 语法]({{< ref "docs/dev/table/sql/queries" >}}) 。 若需使用 Flink 语法,请切换到 `default` 方言。 + +以下是一个使用 Hive 方言的示例。 + +```bash +Flink SQL> create catalog myhive with ('type' = 'hive', 'hive-conf-dir' = '/opt/hive-conf'); +[INFO] Execute statement succeed. + +Flink SQL> use catalog myhive; +[INFO] Execute statement succeed. + +Flink SQL> load module hive; +[INFO] Execute statement succeed. + +Flink SQL> use modules hive,core; +[INFO] Execute statement succeed. + +Flink SQL> set table.sql-dialect=hive; +[INFO] Session property has been set. + +Flink SQL> select explode(array(1,2,3)); -- call hive udtf ++-----+ +| col | ++-----+ +| 1 | +| 2 | +| 3 | ++-----+ +3 rows in set + +Flink SQL> create table tbl (key int,value string); +[INFO] Execute statement succeed. + +Flink SQL> insert overwrite table tbl values (5,'e'),(1,'a'),(1,'a'),(3,'c'),(2,'b'),(3,'c'),(3,'c'),(4,'d'); +[INFO] Submitting SQL update statement to the cluster... 
+[INFO] SQL update statement has been successfully submitted to the cluster: + +Flink SQL> select * from tbl cluster by key; -- run cluster by +2021-04-22 16:13:57,005 INFO org.apache.hadoop.mapred.FileInputFormat [] - Total input paths to process : 1 ++-----+-------+ +| key | value | ++-----+-------+ +| 1 | a | +| 1 | a | +| 5 | e | +| 2 | b | +| 3 | c | +| 3 | c | +| 3 | c | +| 4 | d | ++-----+-------+ +8 rows in set +``` + +## 注意 + +以下是使用 Hive 方言的一些注意事项。 + +- Hive 方言只能用于操作 Hive 对象,并要求当前 Catalog 是一个 [HiveCatalog]({{< ref "docs/connectors/table/hive/hive_catalog" >}}) 。 +- Hive 方言只支持 `db.table` 这种两级的标识符,不支持带有 Catalog 名字的标识符。 +- 虽然所有 Hive 版本支持相同的语法,但是一些特定的功能是否可用仍取决于你使用的[Hive 版本]({{< ref "docs/connectors/table/hive/overview" >}}#支持的hive版本)。例如,更新数据库位置 + 只在 Hive-2.4.0 或更高版本支持。 +- 执行 DML 和 DQL 时应该使用 [HiveModule]({{< ref "docs/connectors/table/hive/hive_functions" >}}#use-hive-built-in-functions-via-hivemodule) 。 diff --git a/docs/content.zh/docs/connectors/table/hive/hive_functions.md b/docs/content.zh/docs/connectors/table/hive/hive_functions.md new file mode 100644 index 0000000000000..4d1d071b290ee --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/hive_functions.md @@ -0,0 +1,211 @@ +--- +title: "Hive Functions" +weight: 5 +type: docs +aliases: + - /zh/dev/table/connectors/hive/hive_functions.html +--- + + +# Hive Functions + +## Use Hive Built-in Functions via HiveModule + +The `HiveModule` provides Hive built-in functions as Flink system (built-in) functions to Flink SQL and Table API users. + +For detailed information, please refer to [HiveModule]({{< ref "docs/dev/table/modules" >}}#hivemodule). + +{{< tabs "2e76857e-17c6-45ee-9da8-0819e132e40c" >}} +{{< tab "Java" >}} +```java + +String name = "myhive"; +String version = "2.3.4"; + +tableEnv.loadModue(name, new HiveModule(version)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val name = "myhive" +val version = "2.3.4" + +tableEnv.loadModue(name, new HiveModule(version)); +``` +{{< /tab >}} +{{< tab "Python" >}} +```Python +from pyflink.table.module import HiveModule + +name = "myhive" +version = "2.3.4" + +t_env.load_module(name, HiveModule(version)) +``` +{{< /tab >}} +{{< tab "YAML" >}} +```yaml +modules: + - name: core + type: core + - name: myhive + type: hive +``` +{{< /tab >}} +{{< /tabs >}} + +{{< hint info >}} +Some Hive built-in functions in older versions have [thread safety issues](https://issues.apache.org/jira/browse/HIVE-16183). +We recommend users patch their own Hive to fix them. +{{< /hint >}} + +## Hive User Defined Functions + +Users can use their existing Hive User Defined Functions in Flink. + +Supported UDF types include: + +- UDF +- GenericUDF +- GenericUDTF +- UDAF +- GenericUDAFResolver2 + +Upon query planning and execution, Hive's UDF and GenericUDF are automatically translated into Flink's ScalarFunction, +Hive's GenericUDTF is automatically translated into Flink's TableFunction, +and Hive's UDAF and GenericUDAFResolver2 are translated into Flink's AggregateFunction. + +To use a Hive User Defined Function, user have to + +- set a HiveCatalog backed by Hive Metastore that contains that function as current catalog of the session +- include a jar that contains that function in Flink's classpath +- use Blink planner. + +## Using Hive User Defined Functions + +Assuming we have the following Hive functions registered in Hive Metastore: + + +```java +/** + * Test simple udf. 
Registered under name 'myudf' + */ +public class TestHiveSimpleUDF extends UDF { + + public IntWritable evaluate(IntWritable i) { + return new IntWritable(i.get()); + } + + public Text evaluate(Text text) { + return new Text(text.toString()); + } +} + +/** + * Test generic udf. Registered under name 'mygenericudf' + */ +public class TestHiveGenericUDF extends GenericUDF { + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + checkArgument(arguments.length == 2); + + checkArgument(arguments[1] instanceof ConstantObjectInspector); + Object constant = ((ConstantObjectInspector) arguments[1]).getWritableConstantValue(); + checkArgument(constant instanceof IntWritable); + checkArgument(((IntWritable) constant).get() == 1); + + if (arguments[0] instanceof IntObjectInspector || + arguments[0] instanceof StringObjectInspector) { + return arguments[0]; + } else { + throw new RuntimeException("Not support argument: " + arguments[0]); + } + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + return arguments[0].get(); + } + + @Override + public String getDisplayString(String[] children) { + return "TestHiveGenericUDF"; + } +} + +/** + * Test split udtf. Registered under name 'mygenericudtf' + */ +public class TestHiveUDTF extends GenericUDTF { + + @Override + public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + checkArgument(argOIs.length == 2); + + // TEST for constant arguments + checkArgument(argOIs[1] instanceof ConstantObjectInspector); + Object constant = ((ConstantObjectInspector) argOIs[1]).getWritableConstantValue(); + checkArgument(constant instanceof IntWritable); + checkArgument(((IntWritable) constant).get() == 1); + + return ObjectInspectorFactory.getStandardStructObjectInspector( + Collections.singletonList("col1"), + Collections.singletonList(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + } + + @Override + public void process(Object[] args) throws HiveException { + String str = (String) args[0]; + for (String s : str.split(",")) { + forward(s); + forward(s); + } + } + + @Override + public void close() { + } +} + +``` + +From Hive CLI, we can see they are registered: + +```bash +hive> show functions; +OK +...... +mygenericudf +myudf +myudtf + +``` + + +Then, users can use them in SQL as: + + +```bash + +Flink SQL> select mygenericudf(myudf(name), 1) as a, mygenericudf(myudf(age), 1) as b, s from mysourcetable, lateral table(myudtf(name, 1)) as T(s); + +``` diff --git a/docs/content.zh/docs/connectors/table/hive/hive_read_write.md b/docs/content.zh/docs/connectors/table/hive/hive_read_write.md new file mode 100644 index 0000000000000..6d45f8dd298be --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/hive_read_write.md @@ -0,0 +1,454 @@ +--- +title: "Hive Read & Write" +weight: 4 +type: docs +aliases: + - /zh/dev/table/connectors/hive/hive_read_write.html + - /zh/dev/table/hive/hive_streaming.html +--- + + +# Hive Read & Write + +Using the `HiveCatalog`, Apache Flink can be used for unified `BATCH` and `STREAM` processing of Apache +Hive Tables. This means Flink can be used as a more performant alternative to Hive’s batch engine, +or to continuously read and write data into and out of Hive tables to power real-time data +warehousing applications. + +## Reading + +Flink supports reading data from Hive in both `BATCH` and `STREAMING` modes. 
When run as a `BATCH` +application, Flink will execute its query over the state of the table at the point in time when the +query is executed. `STREAMING` reads will continuously monitor the table and incrementally fetch +new data as it is made available. Flink will read tables as bounded by default. + +`STREAMING` reads support consuming both partitioned and non-partitioned tables. +For partitioned tables, Flink will monitor the generation of new partitions, and read +them incrementally when available. For non-partitioned tables, Flink will monitor the generation +of new files in the folder and read new files incrementally. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDefaultTypeDescription
    streaming-source.enable
    falseBooleanEnable streaming source or not. NOTES: Please make sure that each partition/file is written atomically, otherwise the reader may get incomplete data.
    streaming-source.partition.include
    allStringOption to set the partitions to read. The supported options are `all` and `latest`: `all` means read all partitions; `latest` means read the latest partition in the order of 'streaming-source.partition-order', and `latest` only works when the streaming hive source table is used as a temporal table. By default the option is `all`. + Flink supports temporal joins against the latest hive partition by enabling 'streaming-source.enable' and setting 'streaming-source.partition.include' to 'latest'; at the same time, users can specify the partition comparison order and the data update interval by configuring the following partition-related options. +
    streaming-source.monitor-interval
    NoneDurationTime interval for continuously monitoring partitions/files. + Notes: The default interval for hive streaming reading is '1 m', while the default interval for the hive streaming temporal join is '60 m'. This is because of a framework limitation: in the current hive streaming temporal join implementation every TM visits the Hive metaStore, which may put pressure on the metaStore. This will be improved in the future.
    streaming-source.partition-order
    partition-nameStringThe partition order of the streaming source; supported values are create-time, partition-time and partition-name. create-time compares the partition/file creation time; this is not the partition create time in the Hive metaStore, but the folder/file modification time in the filesystem, so if the partition folder somehow gets updated, e.g. a new file is added into the folder, it can affect how the data is consumed. partition-time compares the time extracted from the partition name. partition-name compares the partition names in alphabetical order. For a non-partitioned table, this value should always be 'create-time'. By default the value is partition-name. This option is equivalent to the deprecated option 'streaming-source.consume-order'.
    streaming-source.consume-start-offset
    NoneStringStart offset for streaming consuming. How to parse and compare offsets depends on your order. For create-time and partition-time, it should be a timestamp string (yyyy-[m]m-[d]d [hh:mm:ss]). For partition-time, the partition time extractor is used to extract the time from the partition. + For partition-name, it is the partition name string (e.g. pt_year=2020/pt_mon=10/pt_day=01).
    + +[SQL Hints]({{< ref "docs/dev/table/sql/queries/hints" >}}) can be used to apply configurations to a Hive table +without changing its definition in the Hive metastore. + +```sql + +SELECT * +FROM hive_table +/*+ OPTIONS('streaming-source.enable'='true', 'streaming-source.consume-start-offset'='2020-05-20') */; + +``` + +**Notes** + +- Monitor strategy is to scan all directories/files currently in the location path. Many partitions may cause performance degradation. +- Streaming reads for non-partitioned tables requires that each file be written atomically into the target directory. +- Streaming reading for partitioned tables requires that each partition should be added atomically in the view of hive metastore. If not, new data added to an existing partition will be consumed. +- Streaming reads do not support watermark grammar in Flink DDL. These tables cannot be used for window operators. + +### Reading Hive Views + +Flink is able to read from Hive defined views, but some limitations apply: + +1) The Hive catalog must be set as the current catalog before you can query the view. +This can be done by either `tableEnv.useCatalog(...)` in Table API or `USE CATALOG ...` in SQL Client. + +2) Hive and Flink SQL have different syntax, e.g. different reserved keywords and literals. +Make sure the view’s query is compatible with Flink grammar. + +### Vectorized Optimization upon Read + +Flink will automatically used vectorized reads of Hive tables when the following conditions are met: + +- Format: ORC or Parquet. +- Columns without complex data type, like hive types: List, Map, Struct, Union. + +This feature is enabled by default. +It may be disabled with the following configuration. + +```bash +table.exec.hive.fallback-mapred-reader=true +``` + +### Source Parallelism Inference + +By default, Flink will infer the optimal parallelism for its Hive readers +based on the number of files, and number of blocks in each file. + +Flink allows you to flexibly configure the policy of parallelism inference. You can configure the +following parameters in `TableConfig` (note that these parameters affect all sources of the job): + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyDefaultTypeDescription
    table.exec.hive.infer-source-parallelism
    trueBooleanIf it is true, the source parallelism is inferred according to the number of splits. If it is false, the parallelism of the source is set by config.
    table.exec.hive.infer-source-parallelism.max
    1000IntegerSets the max inferred parallelism for the source operator.
    + +## Temporal Table Join + +You can use a Hive table as a temporal table, and then a stream can correlate the Hive table by temporal join. +Please see [temporal join]({{< ref "docs/dev/table/sql/queries/joins" >}}#temporal-joins) for more information about the temporal join. + +Flink supports processing-time temporal join Hive Table, the processing-time temporal join always joins the latest version of temporal table. +Flink supports temporal join both partitioned table and Hive non-partitioned table, for partitioned table, Flink supports tracking the latest partition of Hive table automatically. + +**NOTE**: Flink does not support event-time temporal join Hive table yet. + +### Temporal Join The Latest Partition + +For a partitioned table which is changing over time, we can read it out as an unbounded stream, the partition can be acted as a version of the temporal table if every partition contains complete data of a version, +the version of temporal table keeps the data of the partition. + +Flink support tracking the latest partition(version) of temporal table automatically in processing time temporal join, the latest partition(version) is defined by 'streaming-source.partition-order' option, +This is the most common user cases that use Hive table as dimension table in a Flink stream application job. + +**NOTE:** This feature is only support in Flink `STREAMING` Mode. + +The following demo shows a classical business pipeline, the dimension table comes from Hive and it's updated once every day by a batch pipeline job or a Flink job, the kafka stream comes from real time online business data or log and need to join with the dimension table to enrich stream. + +```sql +-- Assume the data in hive table is updated per day, every day contains the latest and complete dimension data +SET table.sql-dialect=hive; +CREATE TABLE dimension_table ( + product_id STRING, + product_name STRING, + unit_price DECIMAL(10, 4), + pv_count BIGINT, + like_count BIGINT, + comment_count BIGINT, + update_time TIMESTAMP(3), + update_user STRING, + ... +) PARTITIONED BY (pt_year STRING, pt_month STRING, pt_day STRING) TBLPROPERTIES ( + -- using default partition-name order to load the latest partition every 12h (the most recommended and convenient way) + 'streaming-source.enable' = 'true', + 'streaming-source.partition.include' = 'latest', + 'streaming-source.monitor-interval' = '12 h', + 'streaming-source.partition-order' = 'partition-name', -- option with default value, can be ignored. + + -- using partition file create-time order to load the latest partition every 12h + 'streaming-source.enable' = 'true', + 'streaming-source.partition.include' = 'latest', + 'streaming-source.partition-order' = 'create-time', + 'streaming-source.monitor-interval' = '12 h' + + -- using partition-time order to load the latest partition every 12h + 'streaming-source.enable' = 'true', + 'streaming-source.partition.include' = 'latest', + 'streaming-source.monitor-interval' = '12 h', + 'streaming-source.partition-order' = 'partition-time', + 'partition.time-extractor.kind' = 'default', + 'partition.time-extractor.timestamp-pattern' = '$pt_year-$pt_month-$pt_day 00:00:00' +); + +SET table.sql-dialect=default; +CREATE TABLE orders_table ( + order_id STRING, + order_amount DOUBLE, + product_id STRING, + log_ts TIMESTAMP(3), + proctime as PROCTIME() +) WITH (...); + + +-- streaming sql, kafka temporal join a hive dimension table. 
Flink will automatically reload data from the +-- configured latest partition in the interval of 'streaming-source.monitor-interval'. + +SELECT * FROM orders_table AS o +JOIN dimension_table FOR SYSTEM_TIME AS OF o.proctime AS dim +ON o.product_id = dim.product_id; + +``` + +### Temporal Join The Latest Table + +For a Hive table, we can read it out as a bounded stream. In this case, the Hive table can only track its latest version at the time when we query. +The latest version of table keep all data of the Hive table. + +When performing the temporal join the latest Hive table, the Hive table will be cached in Slot memory and each record from the stream is joined against the table by key to decide whether a match is found. +Using the latest Hive table as a temporal table does not require any additional configuration. Optionally, you can configure the TTL of the Hive table cache with the following property. After the cache expires, the Hive table will be scanned again to load the latest data. + + + + + + + + + + + + + + + + + + +
    KeyDefaultTypeDescription
    lookup.join.cache.ttl
    60 minDurationThe cache TTL (e.g. 10min) for the build table in a lookup join. By default the TTL is 60 minutes. NOTES: The option only works when looking up a bounded hive table source; if you're using a streaming hive source as the temporal table, please use 'streaming-source.monitor-interval' to configure the interval of data updates. +
    + +The following demo shows load all data of hive table as a temporal table. + +```sql +-- Assume the data in hive table is overwrite by batch pipeline. +SET table.sql-dialect=hive; +CREATE TABLE dimension_table ( + product_id STRING, + product_name STRING, + unit_price DECIMAL(10, 4), + pv_count BIGINT, + like_count BIGINT, + comment_count BIGINT, + update_time TIMESTAMP(3), + update_user STRING, + ... +) TBLPROPERTIES ( + 'streaming-source.enable' = 'false', -- option with default value, can be ignored. + 'streaming-source.partition.include' = 'all', -- option with default value, can be ignored. + 'lookup.join.cache.ttl' = '12 h' +); + +SET table.sql-dialect=default; +CREATE TABLE orders_table ( + order_id STRING, + order_amount DOUBLE, + product_id STRING, + log_ts TIMESTAMP(3), + proctime as PROCTIME() +) WITH (...); + + +-- streaming sql, kafka join a hive dimension table. Flink will reload all data from dimension_table after cache ttl is expired. + +SELECT * FROM orders_table AS o +JOIN dimension_table FOR SYSTEM_TIME AS OF o.proctime AS dim +ON o.product_id = dim.product_id; + +``` +Note: + +1. Each joining subtask needs to keep its own cache of the Hive table. Please make sure the Hive table can fit into the memory of a TM task slot. +2. It is encouraged to set a relatively large value both for `streaming-source.monitor-interval`(latest partition as temporal table) or `lookup.join.cache.ttl`(all partitions as temporal table). Otherwise, Jobs are prone to performance issues as the table needs to be updated and reloaded too frequently. +3. Currently we simply load the whole Hive table whenever the cache needs refreshing. There's no way to differentiate +new data from the old. + +## Writing + +Flink supports writing data from Hive in both `BATCH` and `STREAMING` modes. When run as a `BATCH` +application, Flink will write to a Hive table only making those records visible when the Job finishes. +`BATCH` writes support both appending to and overwriting existing tables. + +```sql +# ------ INSERT INTO will append to the table or partition, keeping the existing data intact ------ +Flink SQL> INSERT INTO mytable SELECT 'Tom', 25; + +# ------ INSERT OVERWRITE will overwrite any existing data in the table or partition ------ +Flink SQL> INSERT OVERWRITE mytable SELECT 'Tom', 25; +``` + +Data can also be inserted into particular partitions. + +```sql +# ------ Insert with static partition ------ +Flink SQL> INSERT OVERWRITE myparttable PARTITION (my_type='type_1', my_date='2019-08-08') SELECT 'Tom', 25; + +# ------ Insert with dynamic partition ------ +Flink SQL> INSERT OVERWRITE myparttable SELECT 'Tom', 25, 'type_1', '2019-08-08'; + +# ------ Insert with static(my_type) and dynamic(my_date) partition ------ +Flink SQL> INSERT OVERWRITE myparttable PARTITION (my_type='type_1') SELECT 'Tom', 25, '2019-08-08'; +``` + +`STREAMING` writes continuously adding new data to Hive, committing records - making them +visible - incrementally. Users control when/how to trigger commits with several properties. Insert +overwrite is not supported for streaming write. + +The below examples show how the streaming sink can be used to write a streaming query to write data from Kafka into a Hive table with partition-commit, +and runs a batch query to read that data back out. + +Please see the [streaming sink]({{< ref "docs/connectors/table/filesystem" >}}#streaming-sink) for a full list of available configurations. 
+ +```sql + +SET table.sql-dialect=hive; +CREATE TABLE hive_table ( + user_id STRING, + order_amount DOUBLE +) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES ( + 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00', + 'sink.partition-commit.trigger'='partition-time', + 'sink.partition-commit.delay'='1 h', + 'sink.partition-commit.policy.kind'='metastore,success-file' +); + +SET table.sql-dialect=default; +CREATE TABLE kafka_table ( + user_id STRING, + order_amount DOUBLE, + log_ts TIMESTAMP(3), + WATERMARK FOR log_ts AS log_ts - INTERVAL '5' SECOND -- Define watermark on TIMESTAMP column +) WITH (...); + +-- streaming sql, insert into hive table +INSERT INTO TABLE hive_table +SELECT user_id, order_amount, DATE_FORMAT(log_ts, 'yyyy-MM-dd'), DATE_FORMAT(log_ts, 'HH') +FROM kafka_table; + +-- batch sql, select with partition pruning +SELECT * FROM hive_table WHERE dt='2020-05-20' and hr='12'; + +``` + +If the watermark is defined on TIMESTAMP_LTZ column and used `partition-time` to commit, the `sink.partition-commit.watermark-time-zone` is required to set to the session time zone, otherwise the partition committed may happen after a few hours. +```sql + +SET table.sql-dialect=hive; +CREATE TABLE hive_table ( + user_id STRING, + order_amount DOUBLE +) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES ( + 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00', + 'sink.partition-commit.trigger'='partition-time', + 'sink.partition-commit.delay'='1 h', + 'sink.partition-commit.watermark-time-zone'='Asia/Shanghai', -- Assume user configured time zone is 'Asia/Shanghai' + 'sink.partition-commit.policy.kind'='metastore,success-file' +); + +SET table.sql-dialect=default; +CREATE TABLE kafka_table ( + user_id STRING, + order_amount DOUBLE, + ts BIGINT, -- time in epoch milliseconds + ts_ltz AS TO_TIMESTAMP_LTZ(ts, 3), + WATERMARK FOR ts_ltz AS ts_ltz - INTERVAL '5' SECOND -- Define watermark on TIMESTAMP_LTZ column +) WITH (...); + +-- streaming sql, insert into hive table +INSERT INTO TABLE hive_table +SELECT user_id, order_amount, DATE_FORMAT(ts_ltz, 'yyyy-MM-dd'), DATE_FORMAT(ts_ltz, 'HH') +FROM kafka_table; + +-- batch sql, select with partition pruning +SELECT * FROM hive_table WHERE dt='2020-05-20' and hr='12'; + +``` + +By default, for streaming writes, Flink only supports renaming committers, meaning the S3 filesystem +cannot support exactly-once streaming writes. +Exactly-once writes to S3 can be achieved by configuring the following parameter to false. +This will instruct the sink to use Flink's native writers but only works for +parquet and orc file types. +This configuration is set in the `TableConfig` and will affect all sinks of the job. + + + + + + + + + + + + + + + + + + +
    KeyDefaultTypeDescription
    table.exec.hive.fallback-mapred-writer
    trueBooleanIf it is false, the flink native writer is used to write parquet and orc files; if it is true, the hadoop mapred record writer is used to write parquet and orc files.
    + + +## Formats + +Flink's Hive integration has been tested against the following file formats: + +- Text +- CSV +- SequenceFile +- ORC +- Parquet diff --git a/docs/content.zh/docs/connectors/table/hive/overview.md b/docs/content.zh/docs/connectors/table/hive/overview.md new file mode 100644 index 0000000000000..a32a3ff4d1809 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/hive/overview.md @@ -0,0 +1,460 @@ +--- +title: "Overview" +weight: 1 +type: docs +aliases: + - /zh/dev/table/connectors/hive/ +--- + + +# Hive + +[Apache Hive](https://hive.apache.org/) 已经成为了数据仓库生态系统中的核心。 +它不仅仅是一个用于大数据分析和ETL场景的SQL引擎,同样它也是一个数据管理平台,可用于发现,定义,和演化数据。 + +Flink 与 Hive 的集成包含两个层面。 + +一是利用了 Hive 的 MetaStore 作为持久化的 Catalog,用户可通过`HiveCatalog`将不同会话中的 Flink 元数据存储到 Hive Metastore 中。 +例如,用户可以使用`HiveCatalog`将其 Kafka 表或 Elasticsearch 表存储在 Hive Metastore 中,并后续在 SQL 查询中重新使用它们。 + +二是利用 Flink 来读写 Hive 的表。 + +`HiveCatalog`的设计提供了与 Hive 良好的兼容性,用户可以"开箱即用"的访问其已有的 Hive 数仓。 +您不需要修改现有的 Hive Metastore,也不需要更改表的数据位置或分区。 + +* 我们强烈建议用户使用 [Blink planner]({{< ref "docs/dev/table/overview" >}}#dependency-structure) 与 Hive 集成。 + +## 支持的Hive版本 + +Flink 支持一下的 Hive 版本。 + +- 1.0 + - 1.0.0 + - 1.0.1 +- 1.1 + - 1.1.0 + - 1.1.1 +- 1.2 + - 1.2.0 + - 1.2.1 + - 1.2.2 +- 2.0 + - 2.0.0 + - 2.0.1 +- 2.1 + - 2.1.0 + - 2.1.1 +- 2.2 + - 2.2.0 +- 2.3 + - 2.3.0 + - 2.3.1 + - 2.3.2 + - 2.3.3 + - 2.3.4 + - 2.3.5 + - 2.3.6 +- 3.1 + - 3.1.0 + - 3.1.1 + - 3.1.2 + +请注意,某些功能是否可用取决于您使用的 Hive 版本,这些限制不是由 Flink 所引起的: + +- Hive 内置函数在使用 Hive-1.2.0 及更高版本时支持。 +- 列约束,也就是 PRIMARY KEY 和 NOT NULL,在使用 Hive-3.1.0 及更高版本时支持。 +- 更改表的统计信息,在使用 Hive-1.2.0 及更高版本时支持。 +- `DATE`列统计信息,在使用 Hive-1.2.0 及更高版时支持。 +- 使用 Hive-2.0.x 版本时不支持写入 ORC 表。 + +### 依赖项 + +要与 Hive 集成,您需要在 Flink 下的`/lib/`目录中添加一些额外的依赖包, +以便通过 Table API 或 SQL Client 与 Hive 进行交互。 +或者,您可以将这些依赖项放在专用文件夹中,并分别使用 Table API 程序或 SQL Client 的`-C`或`-l`选项将它们添加到 classpath 中。 + +Apache Hive 是基于 Hadoop 之上构建的, 首先您需要 Hadoop 的依赖,请参考 +Providing Hadoop classes: +``` +export HADOOP_CLASSPATH=`hadoop classpath` +``` + +有两种添加 Hive 依赖项的方法。第一种是使用 Flink 提供的 Hive Jar包。您可以根据使用的 Metastore 的版本来选择对应的 Hive jar。第二个方式是分别添加每个所需的 jar 包。如果您使用的 Hive 版本尚未在此处列出,则第二种方法会更适合。 + +**注意**:建议您优先使用 Flink 提供的 Hive jar 包。仅在 Flink 提供的 Hive jar 不满足您的需求时,再考虑使用分开添加 jar 包的方式。 + +#### 使用 Flink 提供的 Hive jar + +下表列出了所有可用的 Hive jar。您可以选择一个并放在 Flink 发行版的`/lib/` 目录中。 + +| Metastore version | Maven dependency | SQL Client JAR | +| :---------------- | :--------------------------- | :----------------------| +| 1.0.0 - 1.2.2 | `flink-sql-connector-hive-1.2.2` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-1.2.2{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-1.2.2{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} | +| 2.0.0 - 2.2.0 | `flink-sql-connector-hive-2.2.0` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.2.0{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-2.2.0{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} | +| 2.3.0 - 2.3.6 | `flink-sql-connector-hive-2.3.6` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.3.6{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-2.3.6{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} | +| 3.0.0 
- 3.1.2 | `flink-sql-connector-hive-3.1.2` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-3.1.2{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} | + +#### 用户定义的依赖项 + +您可以在下方找到不同Hive主版本所需要的依赖项。 + +{{< tabs "8623cd64-8623-4922-92d2-ee82ec410d96" >}} +{{< tab "Hive 2.3.4" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector.Contains flink-hadoop-compatibility and flink-orc jars + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-exec-2.3.4.jar + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 1.0.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-metastore-1.0.0.jar + hive-exec-1.0.0.jar + libfb303-0.9.0.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately + + // Orc dependencies -- required by the ORC vectorized optimizations + orc-core-1.4.3-nohive.jar + aircompressor-0.8.jar // transitive dependency of orc-core + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 1.1.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-metastore-1.1.0.jar + hive-exec-1.1.0.jar + libfb303-0.9.2.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately + + // Orc dependencies -- required by the ORC vectorized optimizations + orc-core-1.4.3-nohive.jar + aircompressor-0.8.jar // transitive dependency of orc-core + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 1.2.1" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-metastore-1.2.1.jar + hive-exec-1.2.1.jar + libfb303-0.9.2.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately + + // Orc dependencies -- required by the ORC vectorized optimizations + orc-core-1.4.3-nohive.jar + aircompressor-0.8.jar // transitive dependency of orc-core + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 2.0.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-exec-2.0.0.jar + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 2.1.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-exec-2.1.0.jar + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 2.2.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-exec-2.2.0.jar + + // Orc dependencies -- required by the ORC vectorized optimizations + orc-core-1.4.3.jar + 
aircompressor-0.8.jar // transitive dependency of orc-core + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< tab "Hive 3.1.0" >}} +```txt +/flink-{{< version >}} + /lib + + // Flink's Hive connector + flink-connector-hive{{< scala_version >}}-{{< version >}}.jar + + // Hive dependencies + hive-exec-3.1.0.jar + libfb303-0.9.3.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately + + // add antlr-runtime if you need to use hive dialect + antlr-runtime-3.5.2.jar + +``` +{{< /tab >}} +{{< /tabs >}} + +### Maven 依赖 + +如果您在构建自己的应用程序,则需要在 mvn 文件中添加以下依赖项。 +您应该在运行时添加以上的这些依赖项,而不要在已生成的 jar 文件中去包含它们。 + +```xml + + + org.apache.flink + flink-connector-hive{{ site.scala_version_suffix }} + {{site.version}} + provided + + + + org.apache.flink + flink-table-api-java-bridge{{ site.scala_version_suffix }} + {{site.version}} + provided + + + + + org.apache.hive + hive-exec + ${hive.version} + provided + +``` + +## 连接到Hive + +通过 TableEnvironment 或者 YAML 配置,使用 [Catalog 接口]({{< ref "docs/dev/table/catalogs" >}}) 和 [HiveCatalog]({{< ref "docs/connectors/table/hive/hive_catalog" >}})连接到现有的 Hive 集群。 + +请注意,虽然 HiveCatalog 不需要特定的 planner,但读写Hive表仅适用于 Blink planner。因此,强烈建议您在连接到 Hive 仓库时使用 Blink planner。 + +以下是如何连接到 Hive 的示例: + +{{< tabs "2ca7cad8-0b84-45db-92d9-a75abd8808e7" >}} +{{< tab "Java" >}} + +```java + +EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build(); +TableEnvironment tableEnv = TableEnvironment.create(settings); + +String name = "myhive"; +String defaultDatabase = "mydatabase"; +String hiveConfDir = "/opt/hive-conf"; + +HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir); +tableEnv.registerCatalog("myhive", hive); + +// set the HiveCatalog as the current catalog of the session +tableEnv.useCatalog("myhive"); +``` +{{< /tab >}} +{{< tab "Scala" >}} + +```scala + +val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build() +val tableEnv = TableEnvironment.create(settings) + +val name = "myhive" +val defaultDatabase = "mydatabase" +val hiveConfDir = "/opt/hive-conf" + +val hive = new HiveCatalog(name, defaultDatabase, hiveConfDir) +tableEnv.registerCatalog("myhive", hive) + +// set the HiveCatalog as the current catalog of the session +tableEnv.useCatalog("myhive") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.table import * +from pyflink.table.catalog import HiveCatalog + +settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build() +t_env = TableEnvironment.create(settings) + +catalog_name = "myhive" +default_database = "mydatabase" +hive_conf_dir = "/opt/hive-conf" + +hive_catalog = HiveCatalog(catalog_name, default_database, hive_conf_dir) +t_env.register_catalog("myhive", hive_catalog) + +# set the HiveCatalog as the current catalog of the session +tableEnv.use_catalog("myhive") +``` +{{< /tab >}} +{{< tab "YAML" >}} +```yaml + +execution: + planner: blink + ... 
+ current-catalog: myhive # set the HiveCatalog as the current catalog of the session + current-database: mydatabase + +catalogs: + - name: myhive + type: hive + hive-conf-dir: /opt/hive-conf +``` +{{< /tab >}} +{{< tab "SQL" >}} +```sql + +CREATE CATALOG myhive WITH ( + 'type' = 'hive', + 'default-database' = 'mydatabase', + 'hive-conf-dir' = '/opt/hive-conf' +); +-- set the HiveCatalog as the current catalog of the session +USE CATALOG myhive; +``` +{{< /tab >}} +{{< /tabs >}} + +下表列出了通过 YAML 文件或 DDL 定义 `HiveCatalog` 时所支持的参数。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数必选默认值类型描述
    type
    (无)StringCatalog 的类型。 创建 HiveCatalog 时,该参数必须设置为'hive'
    name
    (无)StringCatalog 的名字。仅在使用 YAML file 时需要指定。
    hive-conf-dir
    (无)String指向包含 hive-site.xml 目录的 URI。 该 URI 必须是 Hadoop 文件系统所支持的类型。 如果指定一个相对 URI,即不包含 scheme,则默认为本地文件系统。如果该参数没有指定,我们会在 class path 下查找hive-site.xml。
    default-database
    defaultString当一个 catalog 被设为当前 catalog 时,所使用的默认当前 database。
    hive-version
    (无)StringHiveCatalog 能够自动检测使用的 Hive 版本。我们建议不要手动设置 Hive 版本,除非自动检测机制失败。
    hadoop-conf-dir
    (无)StringHadoop 配置文件目录的路径。目前仅支持本地文件系统路径。我们推荐使用 HADOOP_CONF_DIR 环境变量来指定 Hadoop 配置。因此仅在环境变量不满足您的需求时再考虑使用该参数,例如当您希望为每个 HiveCatalog 单独设置 Hadoop 配置时。
    + + +## DDL + +即将支持在 Flink 中创建 Hive 表,视图,分区和函数的DDL。 + +## DML + +Flink 支持 DML 写入 Hive 表,请参考[读写 Hive 表]({{< ref "docs/connectors/table/hive/hive_read_write" >}}) diff --git a/docs/content.zh/docs/connectors/table/jdbc.md b/docs/content.zh/docs/connectors/table/jdbc.md new file mode 100644 index 0000000000000..d31e40714170d --- /dev/null +++ b/docs/content.zh/docs/connectors/table/jdbc.md @@ -0,0 +1,597 @@ +--- +title: JDBC +weight: 6 +type: docs +aliases: + - /zh/dev/table/connectors/jdbc.html +--- + + +# JDBC SQL 连接器 + +{{< label "Scan Source: Bounded" >}} +{{< label "Lookup Source: Sync Mode" >}} +{{< label "Sink: Batch" >}} +{{< label "Sink: Streaming Append & Upsert Mode" >}} + +JDBC 连接器允许使用 JDBC 驱动向任意类型的关系型数据库读取或者写入数据。本文档描述了针对关系型数据库如何通过建立 JDBC 连接器来执行 SQL 查询。 + +如果在 DDL 中定义了主键,JDBC sink 将以 upsert 模式与外部系统交换 UPDATE/DELETE 消息;否则,它将以 append 模式与外部系统交换消息且不支持消费 UPDATE/DELETE 消息。 + +依赖 +------------ + +{{< sql_download_table "jdbc" >}} + + +在连接到具体数据库时,也需要对应的驱动依赖,目前支持的驱动如下: + +| Driver | Group Id | Artifact Id | JAR | +| :-----------| :------------------| :----------------------| :----------------| +| MySQL | `mysql` | `mysql-connector-java` | [下载](https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/) | +| PostgreSQL | `org.postgresql` | `postgresql` | [下载](https://jdbc.postgresql.org/download.html) | +| Derby | `org.apache.derby` | `derby` | [下载](http://db.apache.org/derby/derby_downloads.html) | + + +当前,JDBC 连接器和驱动不在 Flink 二进制发布包中,请参阅[这里]({{< ref "docs/dev/datastream/project-configuration" >}})了解在集群上执行时何连接它们。 + + +如何创建 JDBC 表 +---------------- + +JDBC table 可以按如下定义: + +```sql +-- 在 Flink SQL 中注册一张 MySQL 表 'users' +CREATE TABLE MyUserTable ( + id BIGINT, + name STRING, + age INT, + status BOOLEAN, + PRIMARY KEY (id) NOT ENFORCED +) WITH ( + 'connector' = 'jdbc', + 'url' = 'jdbc:mysql://localhost:3306/mydatabase', + 'table-name' = 'users' +); + +-- 从另一张表 "T" 将数据写入到 JDBC 表中 +INSERT INTO MyUserTable +SELECT id, name, age, status FROM T; + +-- 查看 JDBC 表中的数据 +SELECT id, name, age, status FROM MyUserTable; + +-- JDBC 表在时态表关联中作为维表 +SELECT * FROM myTopic +LEFT JOIN MyUserTable FOR SYSTEM_TIME AS OF myTopic.proctime +ON myTopic.key = MyUserTable.id; +``` + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必填默认值类型描述
    connector
    必填(none)String指定使用什么类型的连接器,这里应该是'jdbc'
    url
    必填(none)StringJDBC 数据库 url。
    table-name
    必填(none)String连接到 JDBC 表的名称。
    driver
    可选(none)String用于连接到此 URL 的 JDBC 驱动类名,如果不设置,将自动从 URL 中推导。
    username
    可选(none)StringJDBC 用户名。如果指定了 'username''password' 中的任一参数,则两者必须都被指定。
    password
    可选(none)StringJDBC 密码。
    connection.max-retry-timeout
    可选60sDuration最大重试超时时间,以秒为单位且不应该小于 1 秒。
    scan.partition.column
    可选(none)String用于将输入进行分区的列名。请参阅下面的分区扫描部分了解更多详情。
    scan.partition.num
    可选(none)Integer分区数。
    scan.partition.lower-bound
    可选(none)Integer第一个分区的最小值。
    scan.partition.upper-bound
    可选(none)Integer最后一个分区的最大值。
    scan.fetch-size
    可选0Integer每次循环读取时应该从数据库中获取的行数。如果指定的值为 '0',则该配置项会被忽略。
    scan.auto-commit
    可选trueBoolean在 JDBC 驱动程序上设置 auto-commit 标志, + 它决定了每个语句是否在事务中自动提交。有些 JDBC 驱动程序,特别是 + Postgres,可能需要将此设置为 false 以便流化结果。
    lookup.cache.max-rows
    可选(none)Integerlookup cache 的最大行数,若超过该值,则最老的行记录将会过期。 + 默认情况下,lookup cache 是未开启的。请参阅下面的 Lookup Cache 部分了解更多详情。
    lookup.cache.ttl
    可选(none)Durationlookup cache 中每一行记录的最大存活时间,若超过该时间,则最老的行记录将会过期。 + 默认情况下,lookup cache 是未开启的。请参阅下面的 Lookup Cache 部分了解更多详情。
    lookup.max-retries
    可选3Integer查询数据库失败时的最大重试次数。
    sink.buffer-flush.max-rows
    可选100Integerflush 前缓存记录的最大值,可以设置为 '0' 来禁用它。
    sink.buffer-flush.interval
    可选1sDurationflush 间隔时间,超过该时间后异步线程将 flush 数据。可以设置为 '0' 来禁用它。注意, 为了完全异步地处理缓存的 flush 事件,可以将 'sink.buffer-flush.max-rows' 设置为 '0' 并配置适当的 flush 时间间隔。
    sink.max-retries
    可选3Integer写入记录到数据库失败后的最大重试次数。
    sink.parallelism
    可选(none)Integer用于定义 JDBC sink 算子的并行度。默认情况下,并行度是由框架决定:使用与上游链式算子相同的并行度。
    + +特性 +-------- + +### 键处理 + +当写入数据到外部数据库时,Flink 会使用 DDL 中定义的主键。如果定义了主键,则连接器将以 upsert 模式工作,否则连接器将以 append 模式工作。 + +在 upsert 模式下,Flink 将根据主键判断插入新行或者更新已存在的行,这种方式可以确保幂等性。为了确保输出结果是符合预期的,推荐为表定义主键并且确保主键是底层数据库中表的唯一键或主键。在 append 模式下,Flink 会把所有记录解释为 INSERT 消息,如果违反了底层数据库中主键或者唯一约束,INSERT 插入可能会失败。 + +有关 PRIMARY KEY 语法的更多详细信息,请参见 [CREATE TABLE DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table)。 + +### 分区扫描 + +为了在并行 `Source` task 实例中加速读取数据,Flink 为 JDBC table 提供了分区扫描的特性。 + +如果下述分区扫描参数中的任一项被指定,则下述所有的分区扫描参数必须都被指定。这些参数描述了在多个 task 并行读取数据时如何对表进行分区。 +`scan.partition.column` 必须是相关表中的数字、日期或时间戳列。注意,`scan.partition.lower-bound` 和 `scan.partition.upper-bound` 用于决定分区的起始位置和过滤表中的数据。如果是批处理作业,也可以在提交 flink 作业之前获取最大值和最小值。 + +- `scan.partition.column`:输入用于进行分区的列名。 +- `scan.partition.num`:分区数。 +- `scan.partition.lower-bound`:第一个分区的最小值。 +- `scan.partition.upper-bound`:最后一个分区的最大值。 + +### Lookup Cache + +JDBC 连接器可以用在时态表关联中作为一个可 lookup 的 source (又称为维表),当前只支持同步的查找模式。 + +默认情况下,lookup cache 是未启用的,你可以设置 `lookup.cache.max-rows` and `lookup.cache.ttl` 参数来启用。 + +lookup cache 的主要目的是用于提高时态表关联 JDBC 连接器的性能。默认情况下,lookup cache 不开启,所以所有请求都会发送到外部数据库。 +当 lookup cache 被启用时,每个进程(即 TaskManager)将维护一个缓存。Flink 将优先查找缓存,只有当缓存未查找到时才向外部数据库发送请求,并使用返回的数据更新缓存。 +当缓存命中最大缓存行 `lookup.cache.max-rows` 或当行超过最大存活时间 `lookup.cache.ttl` 时,缓存中最老的行将被设置为已过期。 +缓存中的记录可能不是最新的,用户可以将 `lookup.cache.ttl` 设置为一个更小的值以获得更好的刷新数据,但这可能会增加发送到数据库的请求数。所以要做好吞吐量和正确性之间的平衡。 + +### 幂等写入 + +如果在 DDL 中定义了主键,JDBC sink 将使用 upsert 语义而不是普通的 INSERT 语句。upsert 语义指的是如果底层数据库中存在违反唯一性约束,则原子地添加新行或更新现有行,这种方式确保了幂等性。 + +如果出现故障,Flink 作业会从上次成功的 checkpoint 恢复并重新处理,这可能导致在恢复过程中重复处理消息。强烈推荐使用 upsert 模式,因为如果需要重复处理记录,它有助于避免违反数据库主键约束和产生重复数据。 + +除了故障恢复场景外,数据源(kafka topic)也可能随着时间的推移自然地包含多个具有相同主键的记录,这使得 upsert 模式是用户期待的。 + +由于 upsert 没有标准的语法,因此下表描述了不同数据库的 DML 语法: + + + + + + + + + + + + + + + + + + +
    DatabaseUpsert Grammar
    MySQLINSERT .. ON DUPLICATE KEY UPDATE ..
    PostgreSQLINSERT .. ON CONFLICT .. DO UPDATE SET ..
    + +### Postgres 数据库作为 Catalog + +`JdbcCatalog` 允许用户通过 JDBC 协议将 Flink 连接到关系数据库。 + +目前,`PostgresCatalog` 是 JDBC Catalog 的唯一实现,`PostgresCatalog` 只支持有限的 `Catalog` 方法,包括: + +```java +// Postgres Catalog 支持的方法 +PostgresCatalog.databaseExists(String databaseName) +PostgresCatalog.listDatabases() +PostgresCatalog.getDatabase(String databaseName) +PostgresCatalog.listTables(String databaseName) +PostgresCatalog.getTable(ObjectPath tablePath) +PostgresCatalog.tableExists(ObjectPath tablePath) +``` + +其他的 `Catalog` 方法现在还是不支持的。 + +#### PostgresCatalog 的使用 + +请参阅 [Dependencies](#dependencies) 部分了解如何配置 JDBC 连接器和 Postgres 驱动。 + +Postgres catalog 支持以下参数: +- `name`:必填,catalog 的名称。 +- `default-database`:必填,默认要连接的数据库。 +- `username`:必填,Postgres 账户的用户名。 +- `password`:必填,账户的密码。 +- `base-url`:必填,应该符合 `"jdbc:postgresql://:"` 的格式,同时这里不应该包含数据库名。 + +{{< tabs "10bd8bfb-674c-46aa-8a36-385537df5791" >}} +{{< tab "SQL" >}} +```sql +CREATE CATALOG mypg WITH( + 'type' = 'jdbc', + 'default-database' = '...', + 'username' = '...', + 'password' = '...', + 'base-url' = '...' +); + +USE CATALOG mypg; +``` +{{< /tab >}} +{{< tab "Java" >}} +```java + +EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build(); +TableEnvironment tableEnv = TableEnvironment.create(settings); + +String name = "mypg"; +String defaultDatabase = "mydb"; +String username = "..."; +String password = "..."; +String baseUrl = "..." + +JdbcCatalog catalog = new JdbcCatalog(name, defaultDatabase, username, password, baseUrl); +tableEnv.registerCatalog("mypg", catalog); + +// 设置 JdbcCatalog 为会话的当前 catalog +tableEnv.useCatalog("mypg"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val settings = EnvironmentSettings.newInstance().inStreamingMode().build() +val tableEnv = TableEnvironment.create(settings) + +val name = "mypg" +val defaultDatabase = "mydb" +val username = "..." +val password = "..." +val baseUrl = "..." + +val catalog = new JdbcCatalog(name, defaultDatabase, username, password, baseUrl) +tableEnv.registerCatalog("mypg", catalog) + +// 设置 JdbcCatalog 为会话的当前 catalog +tableEnv.useCatalog("mypg") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.table.catalog import JdbcCatalog + +environment_settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build() +t_env = TableEnvironment.create(environment_settings) + +name = "mypg" +default_database = "mydb" +username = "..." +password = "..." +base_url = "..." + +catalog = JdbcCatalog(name, default_database, username, password, base_url) +t_env.register_catalog("mypg", catalog) + +# 设置 JdbcCatalog 为会话的当前 catalog +t_env.use_catalog("mypg") +``` +{{< /tab >}} +{{< tab "YAML" >}} +```yaml + +execution: + planner: blink + ... + current-catalog: mypg # 设置 JdbcCatalog 为会话的当前 catalog + current-database: mydb + +catalogs: + - name: mypg + type: jdbc + default-database: mydb + username: ... + password: ... + base-url: ... 
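+    # 注意:base-url 中不应该包含数据库名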
+``` +{{< /tab >}} +{{< /tabs >}} + +#### PostgresSQL 元空间映射 + +除了数据库之外,postgresSQL 还有一个额外的命名空间 `schema`。一个 Postgres 实例可以拥有多个数据库,每个数据库可以拥有多个 schema,其中一个 schema 默认名为 “public”,每个 schema 可以包含多张表。 +在 Flink 中,当查询由 Postgres catalog 注册的表时,用户可以使用 `schema_name.table_name` 或只有 `table_name`,其中 `schema_name` 是可选的,默认值为 “public”。 + +因此,Flink Catalog 和 Postgres 之间的元空间映射如下: + +| Flink Catalog Metaspace Structure | Postgres Metaspace Structure | +| :------------------------------------| :-----------------------------------| +| catalog name (defined in Flink only) | N/A | +| database name | database name | +| table name | [schema_name.]table_name | + +Flink 中的 Postgres 表的完整路径应该是 ``"..``"``。如果指定了 schema,请注意需要转义 ``。 + +这里提供了一些访问 Postgres 表的例子: + +```sql +-- 扫描 'public' schema(即默认 schema)中的 'test_table' 表,schema 名称可以省略 +SELECT * FROM mypg.mydb.test_table; +SELECT * FROM mydb.test_table; +SELECT * FROM test_table; + +-- 扫描 'custom_schema' schema 中的 'test_table2' 表, +-- 自定义 schema 不能省略,并且必须与表一起转义。 +SELECT * FROM mypg.mydb.`custom_schema.test_table2` +SELECT * FROM mydb.`custom_schema.test_table2`; +SELECT * FROM `custom_schema.test_table2`; +``` + +数据类型映射 +---------------- +Flink 支持连接到多个使用方言(dialect)的数据库,如 MySQL、PostgresSQL、Derby 等。其中,Derby 通常是用于测试目的。下表列出了从关系数据库数据类型到 Flink SQL 数据类型的类型映射,映射表可以使得在 Flink 中定义 JDBC 表更加简单。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    MySQL typePostgreSQL typeFlink SQL type}}">
    TINYINTTINYINT
    + SMALLINT
    + TINYINT UNSIGNED
    + SMALLINT
    + INT2
    + SMALLSERIAL
    + SERIAL2
    SMALLINT
    + INT
    + MEDIUMINT
    + SMALLINT UNSIGNED
    + INTEGER
    + SERIAL
    INT
    + BIGINT
    + INT UNSIGNED
    + BIGINT
    + BIGSERIAL
    BIGINT
    BIGINT UNSIGNEDDECIMAL(20, 0)
    BIGINTBIGINTBIGINT
    FLOAT + REAL
    + FLOAT4
    FLOAT
    + DOUBLE
    + DOUBLE PRECISION
    + FLOAT8
    + DOUBLE PRECISION
    DOUBLE
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    + NUMERIC(p, s)
    + DECIMAL(p, s)
    DECIMAL(p, s)
    + BOOLEAN
    + TINYINT(1)
    BOOLEANBOOLEAN
    DATEDATEDATE
    TIME [(p)]TIME [(p)] [WITHOUT TIMEZONE]TIME [(p)] [WITHOUT TIMEZONE]
    DATETIME [(p)]TIMESTAMP [(p)] [WITHOUT TIMEZONE]TIMESTAMP [(p)] [WITHOUT TIMEZONE]
    + CHAR(n)
    + VARCHAR(n)
    + TEXT
    + CHAR(n)
    + CHARACTER(n)
    + VARCHAR(n)
    + CHARACTER VARYING(n)
    + TEXT
    STRING
    + BINARY
    + VARBINARY
    + BLOB
    BYTEABYTES
    ARRAYARRAY
    + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/kafka.md b/docs/content.zh/docs/connectors/table/kafka.md new file mode 100644 index 0000000000000..709e616274fa3 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/kafka.md @@ -0,0 +1,540 @@ +--- +title: Kafka +weight: 3 +type: docs +aliases: + - /zh/dev/table/connectors/kafka.html +--- + + +# Apache Kafka SQL 连接器 + +{{< label "Scan Source: Unbounded" >}} +{{< label "Sink: Streaming Append Mode" >}} + +Kafka 连接器提供从 Kafka topic 中消费和写入数据的能力。 + +依赖 +------------ + +{{< sql_download_table "kafka" >}} + +Kafka 连接器目前并不包含在 Flink 的二进制发行版中,请查阅 [这里]({{< ref "docs/dev/datastream/project-configuration" >}}) 了解如何在集群运行中引用 Kafka 连接器。 + +如何创建 Kafka 表 +---------------- + +以下示例展示了如何创建 Kafka 表: + +```sql +CREATE TABLE KafkaTable ( + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING, + `ts` TIMESTAMP(3) METADATA FROM 'timestamp' +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'scan.startup.mode' = 'earliest-offset', + 'format' = 'csv' +) +``` + +可用的元数据 +------------------ + +以下的连接器元数据可以在表定义中通过元数据列的形式获取。 + +`R/W` 列定义了一个元数据是可读的(`R`)还是可写的(`W`)。 +只读列必须声明为 `VIRTUAL` 以在 `INSERT INTO` 操作中排除它们。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    数据类型描述R/W
    topicSTRING NOT NULLKafka 记录的 Topic 名。R
    partitionINT NOT NULLKafka 记录的 partition ID。R
    headersMAP NOT NULL二进制 Map 类型的 Kafka 记录头(Header)。R/W
    leader-epochINT NULLKafka 记录的 Leader epoch(如果可用)。R
    offsetBIGINT NOT NULLKafka 记录在 partition 中的 offset。R
    timestampTIMESTAMP_LTZ(3) NOT NULLKafka 记录的时间戳。R/W
    timestamp-typeSTRING NOT NULLKafka 记录的时间戳类型。可能的类型有 "NoTimestampType", + "CreateTime"(会在写入元数据时设置),或 "LogAppendTime"。R
    + +以下扩展的 `CREATE TABLE` 示例展示了使用这些元数据字段的语法: + +```sql +CREATE TABLE KafkaTable ( + `event_time` TIMESTAMP(3) METADATA FROM 'timestamp', + `partition` BIGINT METADATA VIRTUAL, + `offset` BIGINT METADATA VIRTUAL, + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'scan.startup.mode' = 'earliest-offset', + 'format' = 'csv' +); +``` + +**格式元信息** + +连接器可以读出消息格式的元数据。格式元数据的配置键以 `'value.'` 作为前缀。 + +以下示例展示了如何获取 Kafka 和 Debezium 的元数据字段: + +```sql +CREATE TABLE KafkaTable ( + `event_time` TIMESTAMP(3) METADATA FROM 'value.source.timestamp' VIRTUAL, -- from Debezium format + `origin_table` STRING METADATA FROM 'value.source.table' VIRTUAL, -- from Debezium format + `partition_id` BIGINT METADATA FROM 'partition' VIRTUAL, -- from Kafka connector + `offset` BIGINT METADATA VIRTUAL, -- from Kafka connector + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING +) WITH ( + 'connector' = 'kafka', + 'topic' = 'user_behavior', + 'properties.bootstrap.servers' = 'localhost:9092', + 'properties.group.id' = 'testGroup', + 'scan.startup.mode' = 'earliest-offset', + 'value.format' = 'debezium-json' +); +``` + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值数据类型描述
    connector
    必选(无)String指定使用的连接器,Kafka 连接器使用 'kafka'
    topic
    required for sink(无)String当表用作 source 时读取数据的 topic 名。亦支持用分号间隔的 topic 列表,如 'topic-1;topic-2'。注意,对 source 表而言,'topic' 和 'topic-pattern' 两个选项只能使用其中一个。当表被用作 sink 时,该配置表示写入的 topic 名。注意 sink 表不支持 topic 列表。
    topic-pattern
    可选(无)String匹配读取 topic 名称的正则表达式。在作业开始运行时,所有匹配该正则表达式的 topic 都将被 Kafka consumer 订阅。注意,对 source 表而言,'topic' 和 'topic-pattern' 两个选项只能使用其中一个。
    properties.bootstrap.servers
    必选(无)String逗号分隔的 Kafka broker 列表。
    properties.group.id
    required by source(无)StringKafka source 的 consumer 组 id,对于 Kafka sink 可选填。
    properties.*
    可选(无)String + 可以设置和传递任意 Kafka 的配置项。后缀名必须匹配在 Kafka 配置文档 中定义的配置键。Flink 将移除 "properties." 配置键前缀并将变换后的配置键和值传入底层的 Kafka 客户端。例如,你可以通过 'properties.allow.auto.create.topics' = 'false' 来禁用 topic 的自动创建。但是某些配置项不支持进行配置,因为 Flink 会覆盖这些配置,例如 'key.deserializer''value.deserializer'。 +
    format
    必选(无)String用来序列化或反序列化 Kafka 消息的格式。 + 请参阅 }}">格式 页面以获取更多关于格式的细节和相关配置项。 + 注意:该配置项和 'value.format' 二者必需其一。 +
    key.format
    可选(无)String用来序列化和反序列化 Kafka 消息键(Key)的格式。 + 请参阅 }}">格式 页面以获取更多关于格式的细节和相关配置项。 + 注意:如果定义了键格式,则配置项 'key.fields' 也是必需的。 + 否则 Kafka 记录将使用空值作为键。 +
    key.fields
    可选[]List<String>表结构中用来配置消息键(Key)格式数据类型的字段列表。默认情况下该列表为空,因此消息键没有定义。 + 列表格式为 'field1;field2'。 +
    key.fields-prefix
    可选(无)String为所有消息键(Key)格式字段指定自定义前缀,以避免与消息体(Value)格式字段重名。默认情况下前缀为空。 + 如果定义了前缀,表结构和配置项 'key.fields' 都需要使用带前缀的名称。 + 当构建消息键格式字段时,前缀会被移除,消息键格式将会使用无前缀的名称。 + 请注意该配置项要求必须将 'value.fields-include' 配置为 'EXCEPT_KEY'。 +
    value.format
    必选(无)String序列化和反序列化 Kafka 消息体时使用的格式。 + 请参阅 }}">格式 页面以获取更多关于格式的细节和相关配置项。 + 注意:该配置项和 'format' 二者必需其一。 +
    value.fields-include
    可选ALL

    枚举类型

    可选值:[ALL, EXCEPT_KEY]
    定义消息体(Value)格式如何处理消息键(Key)字段的策略。 + 默认情况下为 'ALL',即表结构中所有的字段都会包含在消息体格式中,也就是说消息键字段会同时出现在消息键和消息体格式中。 +
    scan.startup.mode
    可选group-offsetsStringKafka consumer 的启动模式。有效值为:'earliest-offset''latest-offset''group-offsets''timestamp''specific-offsets'。 + 请参阅下方 起始消费位点 以获取更多细节。
    scan.startup.specific-offsets
    可选(无)String在使用 'specific-offsets' 启动模式时为每个 partition 指定 offset,例如 'partition:0,offset:42;partition:1,offset:300'。 +
    scan.startup.timestamp-millis
    可选(无)Long在使用 'timestamp' 启动模式时指定启动的时间戳(单位毫秒)。
    scan.topic-partition-discovery.interval
    可选(无)DurationConsumer 定期探测动态创建的 Kafka topic 和 partition 的时间间隔。
    sink.partitioner
    可选'default'StringFlink partition 到 Kafka partition 的分区映射关系,可选值有: +
      +
    • default:使用 Kafka 默认的分区器对消息进行分区。
    • +
    • fixed:每个 Flink partition 最终对应最多一个 Kafka partition。
    • +
    • round-robin:Flink partition 按轮循(round-robin)的模式对应到 Kafka partition。只有当未指定消息的消息键时生效。
    • +
    • 自定义 FlinkKafkaPartitioner 的子类:例如 'org.mycompany.MyPartitioner'
    • +
    + 请参阅下方 Sink 分区 以获取更多细节。 +
    sink.semantic
    可选at-least-onceString定义 Kafka sink 的语义。有效值为 'at-least-once''exactly-once''none'。请参阅 一致性保证 以获取更多细节。
    sink.parallelism
    可选(无)Integer定义 Kafka sink 算子的并行度。默认情况下,并行度由框架定义为与上游串联的算子相同。
    + +特性 +---------------- + +### 消息键(Key)与消息体(Value)的格式 + +Kafka 消息的消息键和消息体部分都可以使用某种 [格式]({{< ref "docs/connectors/table/formats/overview" >}}) 来序列化或反序列化成二进制数据。 + +**消息体格式** + +由于 Kafka 消息中消息键是可选的,以下语句将使用消息体格式读取和写入消息,但不使用消息键格式。 +`'format'` 选项与 `'value.format'` 意义相同。 +所有的格式配置使用格式识别符作为前缀。 + +```sql +CREATE TABLE KafkaTable (, + `ts` TIMESTAMP(3) METADATA FROM 'timestamp', + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING +) WITH ( + 'connector' = 'kafka', + ... + + 'format' = 'json', + 'json.ignore-parse-errors' = 'true' +) +``` + +消息体格式将配置为以下的数据类型: + +```text +ROW<`user_id` BIGINT, `item_id` BIGINT, `behavior` STRING> +``` + +**消息键和消息体格式** + +以下示例展示了如何配置和使用消息键和消息体格式。 +格式配置使用 `'key'` 或 `'value'` 加上格式识别符作为前缀。 + +```sql +CREATE TABLE KafkaTable ( + `ts` TIMESTAMP(3) METADATA FROM 'timestamp', + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING +) WITH ( + 'connector' = 'kafka', + ... + + 'key.format' = 'json', + 'key.json.ignore-parse-errors' = 'true', + 'key.fields' = 'user_id;item_id', + + 'value.format' = 'json', + 'value.json.fail-on-missing-field' = 'false', + 'value.fields-include' = 'ALL' +) +``` + +消息键格式包含了在 `'key.fields'` 中列出的字段(使用 `';'` 分隔)和字段顺序。 +因此将配置为以下的数据类型: + +```text +ROW<`user_id` BIGINT, `item_id` BIGINT> +``` + +由于消息体格式配置为 `'value.fields-include' = 'ALL'`,所以消息键字段也会出现在消息体格式的数据类型中: + +```text +ROW<`user_id` BIGINT, `item_id` BIGINT, `behavior` STRING> +``` + +**重名的格式字段** + +如果消息键字段和消息体字段重名,连接器无法根据表结构信息将这些列区分开。 +`'key.fields-prefix'` 配置项可以在表结构中为消息键字段指定一个唯一名称,并在配置消息键格式的时候保留原名。 + +以下示例展示了在消息键和消息体中同时包含 `version` 字段的情况: + +```sql +CREATE TABLE KafkaTable ( + `k_version` INT, + `k_user_id` BIGINT, + `k_item_id` BIGINT, + `version` INT, + `behavior` STRING +) WITH ( + 'connector' = 'kafka', + ... + + 'key.format' = 'json', + 'key.fields-prefix' = 'k_', + 'key.fields' = 'k_version;k_user_id;k_item_id', + + 'value.format' = 'json', + 'value.fields-include' = 'EXCEPT_KEY' +) +``` + +消息体格式必须配置为 `'EXCEPT_KEY'` 模式。格式将被配置为以下的数据类型: + +```text +消息键格式: +ROW<`version` INT, `user_id` BIGINT, `item_id` BIGINT> + +消息体格式: +ROW<`version` INT, `behavior` STRING> +``` + +### Topic 和 Partition 的探测 + +`topic` 和 `topic-pattern` 配置项决定了 source 消费的 topic 或 topic 的匹配规则。`topic` 配置项可接受使用分号间隔的 topic 列表,例如 `topic-1;topic-2`。 +`topic-pattern` 配置项使用正则表达式来探测匹配的 topic。例如 `topic-pattern` 设置为 `test-topic-[0-9]`,则在作业启动时,所有匹配该正则表达式的 topic(以 `test-topic-` 开头,以一位数字结尾)都将被 consumer 订阅。 + +为允许 consumer 在作业启动之后探测到动态创建的 topic,请将 `scan.topic-partition-discovery.interval` 配置为一个非负值。这将使 consumer 能够探测匹配名称规则的 topic 中新的 partition。 + +请参阅 [Kafka DataStream 连接器文档]({{< ref "docs/connectors/datastream/kafka" >}}#kafka-consumer-topic-和分区发现) 以获取更多关于 topic 和 partition 探测的信息。 + +注意 topic 列表和 topic 匹配规则只适用于 source。对于 sink 端,Flink 目前只支持单一 topic。 + +### 起始消费位点 + +`scan.startup.mode` 配置项决定了 Kafka consumer 的启动模式。有效值为: +
      +
    • `group-offsets`:从 Zookeeper/Kafka 中某个指定的消费组已提交的偏移量开始。
    • +
    • `earliest-offset`:从可能的最早偏移量开始。
    • +
    • `latest-offset`:从最末尾偏移量开始。
    • +
    • `timestamp`:从用户为每个 partition 指定的时间戳开始。
    • +
    • `specific-offsets`:从用户为每个 partition 指定的偏移量开始。
    • +
    + +默认值 `group-offsets` 表示从 Zookeeper/Kafka 中最近一次已提交的偏移量开始消费。 + +如果使用了 `timestamp`,必须使用另外一个配置项 `scan.startup.timestamp-millis` 来指定一个从格林尼治标准时间 1970 年 1 月 1 日 00:00:00.000 开始计算的毫秒单位时间戳作为起始时间。 + +如果使用了 `specific-offsets`,必须使用另外一个配置项 `scan.startup.specific-offsets` 来为每个 partition 指定起始偏移量, +例如,选项值 `partition:0,offset:42;partition:1,offset:300` 表示 partition `0` 从偏移量 `42` 开始,partition `1` 从偏移量 `300` 开始。 + + +### CDC 变更日志(Changelog) Source + +Flink 原生支持使用 Kafka 作为 CDC 变更日志(changelog) source。如果 Kafka topic 中的消息是通过变更数据捕获(CDC)工具从其他数据库捕获的变更事件,则你可以使用 CDC 格式将消息解析为 Flink SQL 系统中的插入(INSERT)、更新(UPDATE)、删除(DELETE)消息。 + +在许多情况下,变更日志(changelog) source 都是非常有用的功能,例如将数据库中的增量数据同步到其他系统,审核日志,数据库的物化视图,时态表关联数据库表的更改历史等。 + +Flink 提供了几种 CDC 格式: + +* [debezium]({{< ref "docs/connectors/table/formats/debezium.md" >}}) +* [canal]({{< ref "docs/connectors/table/formats/canal.md" >}}) +* [maxwell]({{< ref "docs/connectors/table/formats/maxwell.md" >}}) + +### Sink 分区 + +配置项 `sink.partitioner` 指定了从 Flink 分区到 Kafka 分区的映射关系。 +默认情况下,Flink 使用 [Kafka 默认分区器](https://github.com/apache/kafka/blob/trunk/clients/src/main/java/org/apache/kafka/clients/producer/internals/DefaultPartitioner.java) 来对消息分区。默认分区器对没有消息键的消息使用 [粘性分区策略(sticky partition strategy)](https://www.confluent.io/blog/apache-kafka-producer-improvements-sticky-partitioner/) 进行分区,对含有消息键的消息使用 murmur2 哈希算法计算分区。 + +为了控制数据行到分区的路由,也可以提供一个自定义的 sink 分区器。'fixed' 分区器会将同一个 Flink 分区中的消息写入同一个 Kafka 分区,从而减少网络连接的开销。 + +### 一致性保证 + +默认情况下,如果查询在 [启用 checkpoint]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}#enabling-and-configuring-checkpointing) 模式下执行时,Kafka sink 按照至少一次(at-lease-once)语义保证将数据写入到 Kafka topic 中。 + +当 Flink checkpoint 启用时,`kafka` 连接器可以提供精确一次(exactly-once)的语义保证。 + +除了启用 Flink checkpoint,还可以通过传入对应的 `sink.semantic` 选项来选择三种不同的运行模式: + + * `none`:Flink 不保证任何语义。已经写出的记录可能会丢失或重复。 + * `at-least-once` (默认设置):保证没有记录会丢失(但可能会重复)。 + * `exactly-once`:使用 Kafka 事务提供精确一次(exactly-once)语义。当使用事务向 Kafka 写入数据时,请将所有从 Kafka 中消费记录的应用中的 `isolation.level` 配置项设置成实际所需的值(`read_committed` 或 `read_uncommitted`,后者为默认值)。 + +请参阅 [Kafka 文档]({{< ref "docs/connectors/datastream/kafka" >}}#kafka-producers-和容错) 以获取更多关于语义保证的信息。 + +### Source 按分区 Watermark + +Flink 对于 Kafka 支持发送按分区的 watermark。Watermark 在 Kafka consumer 中生成。 +按分区 watermark 的合并方式和在流 shuffle 时合并 Watermark 的方式一致。 +Source 输出的 watermark 由读取的分区中最小的 watermark 决定。 +如果 topic 中的某些分区闲置,watermark 生成器将不会向前推进。 +你可以在表配置中设置 [`'table.exec.source.idle-timeout'`]({{< ref "docs/dev/table/config" >}}#table-exec-source-idle-timeout) 选项来避免上述问题。 + +请参阅 [Kafka watermark 策略]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}#watermark-策略和-kafka-连接器) 以获取更多细节。 + +数据类型映射 +---------------- + +Kafka 将消息键值以二进制进行存储,因此 Kafka 并不存在 schema 或数据类型。Kafka 消息使用格式配置进行序列化和反序列化,例如 csv,json,avro。 +因此,数据类型映射取决于使用的格式。请参阅 [格式]({{< ref "docs/connectors/table/formats/overview" >}}) 页面以获取更多细节。 + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/kinesis.md b/docs/content.zh/docs/connectors/table/kinesis.md new file mode 100644 index 0000000000000..dfbc8d8b00663 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/kinesis.md @@ -0,0 +1,744 @@ +--- +title: Kinesis +weight: 5 +type: docs +aliases: + - /zh/dev/table/connectors/kinesis.html +--- + + +# Amazon Kinesis Data Streams SQL Connector + +{{< label "Scan Source: Unbounded" >}} +{{< label "Sink: Streaming Append Mode" >}} + +The Kinesis connector allows for reading data from and writing data into [Amazon Kinesis Data Streams (KDS)](https://aws.amazon.com/kinesis/data-streams/). 
+ +Dependencies +------------ + +{{< sql_download_table "kinesis" >}} + +How to create a Kinesis data stream table +----------------------------------------- + +Follow the instructions from the [Amazon KDS Developer Guide](https://docs.aws.amazon.com/streams/latest/dev/learning-kinesis-module-one-create-stream.html) to set up a Kinesis stream. +The following example shows how to create a table backed by a Kinesis data stream: + +```sql +CREATE TABLE KinesisTable ( + `user_id` BIGINT, + `item_id` BIGINT, + `category_id` BIGINT, + `behavior` STRING, + `ts` TIMESTAMP(3) +) +PARTITIONED BY (user_id, item_id) +WITH ( + 'connector' = 'kinesis', + 'stream' = 'user_behavior', + 'aws.region' = 'us-east-2', + 'scan.stream.initpos' = 'LATEST', + 'format' = 'csv' +); +``` + +Available Metadata +------------------ + +The following metadata can be exposed as read-only (`VIRTUAL`) columns in a table definition. + + + + + + + + + + + + + + + + + + + + + + + + + + +
    KeyData TypeDescription
    timestampTIMESTAMP_LTZ(3) NOT NULLThe approximate time when the record was inserted into the stream.
    shard-idVARCHAR(128) NOT NULLThe unique identifier of the shard within the stream from which the record was read.
    sequence-numberVARCHAR(128) NOT NULLThe unique identifier of the record within its shard.
    + +The extended `CREATE TABLE` example demonstrates the syntax for exposing these metadata fields: + +```sql +CREATE TABLE KinesisTable ( + `user_id` BIGINT, + `item_id` BIGINT, + `category_id` BIGINT, + `behavior` STRING, + `ts` TIMESTAMP(3), + `arrival_time` TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL, + `shard_id` VARCHAR(128) NOT NULL METADATA FROM 'shard-id' VIRTUAL, + `sequence_number` VARCHAR(128) NOT NULL METADATA FROM 'sequence-number' VIRTUAL +) +PARTITIONED BY (user_id, item_id) +WITH ( + 'connector' = 'kinesis', + 'stream' = 'user_behavior', + 'aws.region' = 'us-east-2', + 'scan.stream.initpos' = 'LATEST', + 'format' = 'csv' +); +``` + + +Connector Options +----------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    OptionRequiredDefaultTypeDescription
    Common Options
    connector
    required(none)StringSpecify what connector to use. For Kinesis use 'kinesis'.
    stream
    required(none)StringName of the Kinesis data stream backing this table.
    format
    required(none)StringThe format used to deserialize and serialize Kinesis data stream records. See Data Type Mapping for details.
    aws.region
    optional(none)StringThe AWS region where the stream is defined. Either this or aws.endpoint are required.
    aws.endpoint
    optional(none)StringThe AWS endpoint for Kinesis (derived from the AWS region setting if not set). Either this or aws.region are required.
    Authentication Options
    aws.credentials.provider
    optionalAUTOStringA credentials provider to use when authenticating against the Kinesis endpoint. See Authentication for details.
    aws.credentials.basic.accesskeyid
    optional(none)StringThe AWS access key ID to use when setting credentials provider type to BASIC.
    aws.credentials.basic.secretkey
    optional(none)StringThe AWS secret key to use when setting credentials provider type to BASIC.
    aws.credentials.profile.path
    optional(none)StringOptional configuration for profile path if credential provider type is set to be PROFILE.
    aws.credentials.profile.name
    optional(none)StringOptional configuration for profile name if credential provider type is set to be PROFILE.
    aws.credentials.role.arn
    optional(none)StringThe role ARN to use when credential provider type is set to ASSUME_ROLE or WEB_IDENTITY_TOKEN.
    aws.credentials.role.sessionName
    optional(none)StringThe role session name to use when credential provider type is set to ASSUME_ROLE or WEB_IDENTITY_TOKEN.
    aws.credentials.role.externalId
    optional(none)StringThe external ID to use when credential provider type is set to ASSUME_ROLE.
    aws.credentials.role.provider
    optional(none)StringThe credentials provider that provides credentials for assuming the role when credential provider type is set to ASSUME_ROLE. Roles can be nested, so this value can again be set to ASSUME_ROLE
    aws.credentials.webIdentityToken.file
    optional(none)StringThe absolute path to the web identity token file that should be used if provider type is set to WEB_IDENTITY_TOKEN.
    Source Options
    scan.stream.initpos
    optionalLATESTStringInitial position to be used when reading from the table. See Start Reading Position for details.
    scan.stream.initpos-timestamp
    optional(none)StringThe initial timestamp to start reading Kinesis stream from (when scan.stream.initpos is AT_TIMESTAMP). See Start Reading Position for details.
    scan.stream.initpos-timestamp-format
    optionalyyyy-MM-dd'T'HH:mm:ss.SSSXXXStringThe date format of initial timestamp to start reading Kinesis stream from (when scan.stream.initpos is AT_TIMESTAMP). See Start Reading Position for details.
    scan.stream.recordpublisher
    optionalPOLLINGStringThe RecordPublisher type to use for sources. See Enhanced Fan-Out for details.
    scan.stream.efo.consumername
    optional(none)StringThe name of the EFO consumer to register with KDS. See Enhanced Fan-Out for details.
    scan.stream.efo.registration
    optionalLAZYStringDetermine how and when consumer de-/registration is performed (LAZY|EAGER|NONE). See Enhanced Fan-Out for details.
    scan.stream.efo.consumerarn
    optional(none)StringThe prefix of consumer ARN for a given stream. See Enhanced Fan-Out for details.
    scan.stream.efo.http-client.max-concurrency
    optional10000IntegerMaximum number of allowed concurrent requests for the EFO client. See Enhanced Fan-Out for details.
    scan.stream.describe.maxretries
    optional50IntegerThe maximum number of describeStream attempts if we get a recoverable exception.
    scan.stream.describe.backoff.base
    optional2000LongThe base backoff time (in milliseconds) between each describeStream attempt (for consuming from DynamoDB streams).
    scan.stream.describe.backoff.max
    optional5000LongThe maximum backoff time (in milliseconds) between each describeStream attempt (for consuming from DynamoDB streams).
    scan.stream.describe.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each describeStream attempt (for consuming from DynamoDB streams).
    scan.list.shards.maxretries
    optional10IntegerThe maximum number of listShards attempts if we get a recoverable exception.
    scan.list.shards.backoff.base
    optional1000LongThe base backoff time (in milliseconds) between each listShards attempt.
    scan.list.shards.backoff.max
    optional5000LongThe maximum backoff time (in milliseconds) between each listShards attempt.
    scan.list.shards.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each listShards attempt.
    scan.stream.describestreamconsumer.maxretries
    optional50IntegerThe maximum number of describeStreamConsumer attempts if we get a recoverable exception.
    scan.stream.describestreamconsumer.backoff.base
    optional2000LongThe base backoff time (in milliseconds) between each describeStreamConsumer attempt.
    scan.stream.describestreamconsumer.backoff.max
    optional5000LongThe maximum backoff time (in milliseconds) between each describeStreamConsumer attempt.
    scan.stream.describestreamconsumer.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each describeStreamConsumer attempt.
    scan.stream.registerstreamconsumer.maxretries
    optional10IntegerThe maximum number of registerStream attempts if we get a recoverable exception.
    scan.stream.registerstreamconsumer.timeout
    optional60IntegerThe maximum time in seconds to wait for a stream consumer to become active before giving up.
    scan.stream.registerstreamconsumer.backoff.base
    optional500LongThe base backoff time (in milliseconds) between each registerStream attempt.
    scan.stream.registerstreamconsumer.backoff.max
    optional2000LongThe maximum backoff time (in milliseconds) between each registerStream attempt.
    scan.stream.registerstreamconsumer.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each registerStream attempt.
    scan.stream.deregisterstreamconsumer.maxretries
    optional10IntegerThe maximum number of deregisterStream attempts if we get a recoverable exception.
    scan.stream.deregisterstreamconsumer.timeout
    optional60IntegerThe maximum time in seconds to wait for a stream consumer to deregister before giving up.
    scan.stream.deregisterstreamconsumer.backoff.base
    optional500LongThe base backoff time (in milliseconds) between each deregisterStream attempt.
    scan.stream.deregisterstreamconsumer.backoff.max
    optional2000LongThe maximum backoff time (in milliseconds) between each deregisterStream attempt.
    scan.stream.deregisterstreamconsumer.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each deregisterStream attempt.
    scan.shard.subscribetoshard.maxretries
    optional10IntegerThe maximum number of subscribeToShard attempts if we get a recoverable exception.
    scan.shard.subscribetoshard.backoff.base
    optional1000LongThe base backoff time (in milliseconds) between each subscribeToShard attempt.
    scan.shard.subscribetoshard.backoff.max
    optional2000LongThe maximum backoff time (in milliseconds) between each subscribeToShard attempt.
    scan.shard.subscribetoshard.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each subscribeToShard attempt.
    scan.shard.getrecords.maxrecordcount
optional10000IntegerThe maximum number of records to try to get each time we fetch records from an AWS Kinesis shard.
    scan.shard.getrecords.maxretries
    optional3IntegerThe maximum number of getRecords attempts if we get a recoverable exception.
    scan.shard.getrecords.backoff.base
    optional300LongThe base backoff time (in milliseconds) between getRecords attempts if we get a ProvisionedThroughputExceededException.
    scan.shard.getrecords.backoff.max
    optional1000LongThe maximum backoff time (in milliseconds) between getRecords attempts if we get a ProvisionedThroughputExceededException.
    scan.shard.getrecords.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each getRecords attempt.
    scan.shard.getrecords.intervalmillis
optional200LongThe interval (in milliseconds) between each getRecords request to an AWS Kinesis shard.
    scan.shard.getiterator.maxretries
    optional3IntegerThe maximum number of getShardIterator attempts if we get ProvisionedThroughputExceededException.
    scan.shard.getiterator.backoff.base
    optional300LongThe base backoff time (in milliseconds) between getShardIterator attempts if we get a ProvisionedThroughputExceededException.
    scan.shard.getiterator.backoff.max
    optional1000LongThe maximum backoff time (in milliseconds) between getShardIterator attempts if we get a ProvisionedThroughputExceededException.
    scan.shard.getiterator.backoff.expconst
    optional1.5DoubleThe power constant for exponential backoff between each getShardIterator attempt.
    scan.shard.discovery.intervalmillis
    optional10000IntegerThe interval between each attempt to discover new shards.
    scan.shard.adaptivereads
    optionalfalseBooleanThe config to turn on adaptive reads from a shard. See the AdaptivePollingRecordPublisher documentation for details.
    scan.shard.idle.interval
    optional-1LongThe interval (in milliseconds) after which to consider a shard idle for purposes of watermark generation. A positive value will allow the watermark to progress even when some shards don't receive new records.
    scan.watermark.sync.interval
    optional30000LongThe interval (in milliseconds) for periodically synchronizing the shared watermark state.
    scan.watermark.lookahead.millis
    optional0LongThe maximum delta (in milliseconds) allowed for the reader to advance ahead of the shared global watermark.
    scan.watermark.sync.queue.capacity
    optional100IntegerThe maximum number of records that will be buffered before suspending consumption of a shard.
    Sink Options
    sink.partitioner
    optionalrandom or row-basedStringOptional output partitioning from Flink's partitions into Kinesis shards. See Sink Partitioning for details.
    sink.partitioner-field-delimiter
    optional|StringOptional field delimiter for a fields-based partitioner derived from a PARTITION BY clause. See Sink Partitioning for details.
    sink.producer.*
optional(none) + Sink options for the KinesisProducer. + Suffix names must match the KinesisProducerConfiguration setters in lower-case hyphenated style (for example, sink.producer.collection-max-count or sink.producer.aggregation-max-count). + The transformed keys, with the sink.producer. prefix removed, are passed to KinesisProducerConfiguration#fromProperties. + Note that some of the defaults are overwritten by KinesisConfigUtil. +
    + +Features +-------- + +### Authorization + +Make sure to [create an appropriate IAM policy](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html) to allow reading from / writing to the Kinesis data streams. + +### Authentication + +Depending on your deployment you would choose a different Credentials Provider to allow access to Kinesis. +By default, the `AUTO` Credentials Provider is used. +If the access key ID and secret key are set in the deployment configuration, this results in using the `BASIC` provider. + +A specific [AWSCredentialsProvider](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/index.html?com/amazonaws/auth/AWSCredentialsProvider.html) can be **optionally** set using the `aws.credentials.provider` setting. +Supported values are: + +* `AUTO` - Use the default AWS Credentials Provider chain that searches for credentials in the following order: `ENV_VARS`, `SYS_PROPS`, `WEB_IDENTITY_TOKEN`, `PROFILE`, and EC2/ECS credentials provider. +* `BASIC` - Use access key ID and secret key supplied as configuration. +* `ENV_VAR` - Use `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` environment variables. +* `SYS_PROP` - Use Java system properties `aws.accessKeyId` and `aws.secretKey`. +* `PROFILE` - Use an AWS credentials profile to create the AWS credentials. +* `ASSUME_ROLE` - Create AWS credentials by assuming a role. The credentials for assuming the role must be supplied. +* `WEB_IDENTITY_TOKEN` - Create AWS credentials by assuming a role using Web Identity Token. + +### Start Reading Position + +You can configure table sources to start reading a table-backing Kinesis data stream from a specific position through the `scan.stream.initpos` option. +Available values are: + +* `LATEST`: read shards starting from the latest record. +* `TRIM_HORIZON`: read shards starting from the earliest record possible (data may be trimmed by Kinesis depending on the current retention settings of the backing stream). +* `AT_TIMESTAMP`: read shards starting from a specified timestamp. The timestamp value should be specified through the `scan.stream.initpos-timestamp` in one of the following formats: + * A non-negative double value representing the number of seconds that has elapsed since the Unix epoch (for example, `1459799926.480`). + * A value conforming to a user-defined `SimpleDateFormat` specified at `scan.stream.initpos-timestamp-format`. + If a user does not define a format, the default pattern will be `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`. + For example, timestamp value is `2016-04-04` and user-defined format is `yyyy-MM-dd`, or timestamp value is `2016-04-04T19:58:46.480-00:00` and a user-defined format is not provided. + +### Sink Partitioning + +Kinesis data streams consist of one or more shards, and the `sink.partitioner` option allows you to control how records written into a multi-shard Kinesis-backed table will be partitioned between its shards. +Valid values are: + +* `fixed`: Kinesis `PartitionKey` values derived from the Flink subtask index, so each Flink partition ends up in at most one Kinesis partition (assuming that no re-sharding takes place at runtime). +* `random`: Kinesis `PartitionKey` values are assigned randomly. This is the default value for tables not defined with a `PARTITION BY` clause. +* Custom `FixedKinesisPartitioner` subclass: e.g. `'org.mycompany.MyPartitioner'`. + +{{< hint info >}} +Records written into tables defining a `PARTITION BY` clause will always be partitioned based on a concatenated projection of the `PARTITION BY` fields. 
+In this case, the `sink.partitioner` field cannot be used to modify this behavior (attempting to do this results in a configuration error). +You can, however, use the `sink.partitioner-field-delimiter` option to set the delimiter of field values in the concatenated [PartitionKey](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#Streams-PutRecord-request-PartitionKey) string (an empty string is also a valid delimiter). +{{< /hint >}} + +### Enhanced Fan-Out + +[Enhanced Fan-Out (EFO)](https://aws.amazon.com/blogs/aws/kds-enhanced-fanout/) increases the maximum number of concurrent consumers per Kinesis data stream. +Without EFO, all concurrent Kinesis consumers share a single read quota per shard. +Using EFO, each consumer gets a distinct dedicated read quota per shard, allowing read throughput to scale with the number of consumers. + +Note Using EFO will [incur additional cost](https://aws.amazon.com/kinesis/data-streams/pricing/). + +You can enable and configure EFO with the following properties: + +* `scan.stream.recordpublisher`: Determines whether to use `EFO` or `POLLING`. +* `scan.stream.efo.consumername`: A name to identify the consumer when the above value is `EFO`. +* `scan.stream.efo.registration`: Strategy for (de-)registration of `EFO` consumers with the name given by the `scan.stream.efo.consumername` value. Valid strategies are: + * `LAZY` (default): Stream consumers are registered when the Flink job starts running. + If the stream consumer already exists, it will be reused. + This is the preferred strategy for the majority of applications. + However, jobs with parallelism greater than 1 will result in tasks competing to register and acquire the stream consumer ARN. + For jobs with very large parallelism this can result in an increased start-up time. + The describe operation has a limit of 20 [transactions per second](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html), + this means application startup time will increase by roughly `parallelism/20 seconds`. + * `EAGER`: Stream consumers are registered in the `FlinkKinesisConsumer` constructor. + If the stream consumer already exists, it will be reused. + This will result in registration occurring when the job is constructed, + either on the Flink Job Manager or client environment submitting the job. + Using this strategy results in a single thread registering and retrieving the stream consumer ARN, + reducing startup time over `LAZY` (with large parallelism). + However, consider that the client environment will require access to the AWS services. + * `NONE`: Stream consumer registration is not performed by `FlinkKinesisConsumer`. + Registration must be performed externally using the [AWS CLI or SDK](https://aws.amazon.com/tools/) + to invoke [RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html). + Stream consumer ARNs should be provided to the job via the consumer configuration. +* `scan.stream.efo.consumerarn.`: ARNs identifying externally registered ARN-consumers (substitute `` with the name of your stream in the parameter name). + Use this if you choose to use `NONE` as a `scan.stream.efo.registration` strategy. + +Note For a given Kinesis data stream, each EFO consumer must have a unique name. +However, consumer names do not have to be unique across data streams. +Reusing a consumer name will result in existing subscriptions being terminated. 
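+
+Putting the options above together, a minimal `CREATE TABLE` sketch that switches the source from the default `POLLING` publisher to EFO might look as follows; the consumer name `my-efo-flink-app` is only an illustrative placeholder, and the schema and stream reuse the example from the beginning of this page:
+
+```sql
+CREATE TABLE KinesisTable (
+  `user_id` BIGINT,
+  `item_id` BIGINT,
+  `behavior` STRING
+) WITH (
+  'connector' = 'kinesis',
+  'stream' = 'user_behavior',
+  'aws.region' = 'us-east-2',
+  'format' = 'csv',
+  -- use Enhanced Fan-Out instead of the default POLLING record publisher
+  'scan.stream.recordpublisher' = 'EFO',
+  -- every EFO consumer of this stream needs a unique name
+  'scan.stream.efo.consumername' = 'my-efo-flink-app',
+  -- LAZY is the default registration strategy; shown here only for clarity
+  'scan.stream.efo.registration' = 'LAZY'
+);
+```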
+ +Note With the `LAZY` and `EAGER` strategies, stream consumers are de-registered when the job is shutdown gracefully. +In the event that a job terminates within executing the shutdown hooks, stream consumers will remain active. +In this situation the stream consumers will be gracefully reused when the application restarts. +With the `NONE` strategy, stream consumer de-registration is not performed by `FlinkKinesisConsumer`. + +Data Type Mapping +---------------- + +Kinesis stores records as Base64-encoded binary data objects, so it doesn't have a notion of internal record structure. +Instead, Kinesis records are deserialized and serialized by formats, e.g. 'avro', 'csv', or 'json'. +To determine the data type of the messages in your Kinesis-backed tables, pick a suitable Flink format with the `format` keyword. +Please refer to the [Formats]({{< ref "docs/connectors/table/formats/overview" >}}) pages for more details. + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/overview.md b/docs/content.zh/docs/connectors/table/overview.md new file mode 100644 index 0000000000000..51fce6ddcfbf6 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/overview.md @@ -0,0 +1,356 @@ +--- +title: "概览" +weight: 1 +type: docs +aliases: + - /zh/dev/table/connectors/ +--- + + +# Table & SQL Connectors + + +Flink's Table API & SQL programs can be connected to other external systems for reading and writing both batch and streaming tables. A table source provides access to data which is stored in external systems (such as a database, key-value store, message queue, or file system). A table sink emits a table to an external storage system. Depending on the type of source and sink, they support different formats such as CSV, Avro, Parquet, or ORC. + +This page describes how to register table sources and table sinks in Flink using the natively supported connectors. After a source or sink has been registered, it can be accessed by Table API & SQL statements. + +If you want to implement your own *custom* table source or sink, have a look at the [user-defined sources & sinks page]({{< ref "docs/dev/table/sourcessinks" >}}). + +Supported Connectors +------------ + +Flink natively support various connectors. The following tables list all available connectors. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameVersionSourceSink
    }}">FilesystemBounded and Unbounded Scan, LookupStreaming Sink, Batch Sink
    }}">Elasticsearch6.x & 7.xNot supportedStreaming Sink, Batch Sink
    }}">Apache Kafka0.10+Unbounded ScanStreaming Sink, Batch Sink
    }}">Amazon Kinesis Data StreamsUnbounded ScanStreaming Sink
    }}">JDBCBounded Scan, LookupStreaming Sink, Batch Sink
    }}">Apache HBase1.4.x & 2.2.xBounded Scan, LookupStreaming Sink, Batch Sink
    }}">Apache Hive}}#supported-hive-versions">Supported VersionsUnbounded Scan, Bounded Scan, LookupStreaming Sink, Batch Sink
    + +{{< top >}} + +How to use connectors +-------- + +Flink supports using SQL `CREATE TABLE` statements to register tables. One can define the table name, +the table schema, and the table options for connecting to an external system. + +See the [SQL section for more information about creating a table]({{< ref "docs/dev/table/sql/create" >}}#create-table). + +The following code shows a full example of how to connect to Kafka for reading and writing JSON records. + +{{< tabs "6d4f00e3-0a94-4ebd-b6b5-c5171851b500" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE MyUserTable ( + -- declare the schema of the table + `user` BIGINT, + `message` STRING, + `rowtime` TIMESTAMP(3) METADATA FROM 'timestamp', -- use a metadata column to access Kafka's record timestamp + `proctime` AS PROCTIME(), -- use a computed column to define a proctime attribute + WATERMARK FOR `rowtime` AS `rowtime` - INTERVAL '5' SECOND -- use a WATERMARK statement to define a rowtime attribute +) WITH ( + -- declare the external system to connect to + 'connector' = 'kafka', + 'topic' = 'topic_name', + 'scan.startup.mode' = 'earliest-offset', + 'properties.bootstrap.servers' = 'localhost:9092', + 'format' = 'json' -- declare a format for this system +) +``` +{{< /tab >}} +{{< /tabs >}} + +The desired connection properties are converted into string-based key-value pairs. [Factories]({{< ref "docs/dev/table/sourcessinks" >}}) +will create configured table sources, table sinks, and corresponding formats from the key-value pairs +based on factory identifiers (`kafka` and `json` in this example). All factories that can be found via +Java's [Service Provider Interfaces (SPI)](https://docs.oracle.com/javase/tutorial/sound/SPI-intro.html) +are taken into account when searching for exactly one matching factory for each component. + +If no factory can be found or multiple factories match for the given properties, an exception will be +thrown with additional information about considered factories and supported properties. + + +Transform table connector/format resources +-------- + +Flink uses Java's [Service Provider Interfaces (SPI)](https://docs.oracle.com/javase/tutorial/sound/SPI-intro.html) to load the table connector/format factories by their identifiers. Since the SPI resource file named `org.apache.flink.table.factories.Factory` for every table connector/format is under the same directory `META-INF/services`, these resource files will override each other when build the uber-jar of the project which uses more than one table connector/format, which will cause Flink to fail to load table connector/format factories. + +In this situation, the recommended way is transforming these resource files under the directory `META-INF/services` by [ServicesResourceTransformer](https://maven.apache.org/plugins/maven-shade-plugin/examples/resource-transformers.html) of maven shade plugin. Given the pom.xml file content of example that contains connector `flink-sql-connector-hive-3.1.2` and format `flink-parquet` in a project. 
+ +```xml + + 4.0.0 + org.example + myProject + 1.0-SNAPSHOT + + + + + org.apache.flink + flink-sql-connector-hive-3.1.2_{{< scala_version >}} + {{< version >}} + + + + org.apache.flink + flink-parquet_{{< scala_version >}}< + {{< version >}} + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + shade + package + + shade + + + + + + + + + + + + + +``` + +After configured the `ServicesResourceTransformer`, the table connector/format resource files under the directory `META-INF/services` would be merged rather than overwritten each other when build the uber-jar of above project. + +{{< top >}} + +Schema Mapping +------------ + +The body clause of a SQL `CREATE TABLE` statement defines the names and types of physical columns, +constraints and watermarks. Flink doesn't hold the data, thus the schema definition only declares how +to map physical columns from an external system to Flink’s representation. The mapping may not be +mapped by names, it depends on the implementation of formats and connectors. For example, a MySQL database +table is mapped by field names (not case sensitive), and a CSV filesystem is mapped by field order +(field names can be arbitrary). This will be explained in every connector. + +The following example shows a simple schema without time attributes and one-to-one field mapping +of input/output to table columns. + +{{< tabs "0c267c40-32ef-4a00-b4eb-fa39bfe3f14d" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE MyTable ( + MyField1 INT, + MyField2 STRING, + MyField3 BOOLEAN +) WITH ( + ... +) +``` +{{< /tab >}} +{{< /tabs >}} + +### Metadata + +Some connectors and formats expose additional metadata fields that can be accessed in metadata columns +next to the physical payload columns. See the [`CREATE TABLE` section]({{< ref "docs/dev/table/sql/create" >}}#columns) +for more information about metadata columns. + +### Primary Key + +Primary key constraints tell that a column or a set of columns of a table are unique and they do not contain nulls. Primary key uniquely identifies a row in a table. + +The primary key of a source table is a metadata information for optimization. The primary key of a sink table is usually used by the sink implementation for upserting. + +SQL standard specifies that a constraint can either be ENFORCED or NOT ENFORCED. This controls if the constraint checks are performed on the incoming/outgoing data. Flink does not own the data the only mode we want to support is the NOT ENFORCED mode. Its up to the user to ensure that the query enforces key integrity. + +{{< tabs "9e32660c-868b-4b6a-9632-3b3ea482fe7d" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE MyTable ( + MyField1 INT, + MyField2 STRING, + MyField3 BOOLEAN, + PRIMARY KEY (MyField1, MyField2) NOT ENFORCED -- defines a primary key on columns +) WITH ( + ... +) +``` +{{< /tab >}} +{{< /tabs >}} + +### Time Attributes + +Time attributes are essential when working with unbounded streaming tables. Therefore both proctime and rowtime attributes can be defined as part of the schema. + +For more information about time handling in Flink and especially event-time, we recommend the general [event-time section]({{< ref "docs/dev/table/concepts/time_attributes" >}}). + +#### Proctime Attributes + +In order to declare a proctime attribute in the schema, you can use [Computed Column syntax]({{< ref "docs/dev/table/sql/create" >}}#create-table) to declare a computed column which is generated from `PROCTIME()` builtin function. 
+The computed column is a virtual column which is not stored in the physical data. + +{{< tabs "5d1f475b-a002-4e85-84f4-00ab0a55a548" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE MyTable ( + MyField1 INT, + MyField2 STRING, + MyField3 BOOLEAN + MyField4 AS PROCTIME() -- declares a proctime attribute +) WITH ( + ... +) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Rowtime Attributes + +In order to control the event-time behavior for tables, Flink provides predefined timestamp extractors and watermark strategies. + +Please refer to [CREATE TABLE statements]({{< ref "docs/dev/table/sql/create" >}}#create-table) for more information about defining time attributes in DDL. + +The following timestamp extractors are supported: + +{{< tabs "b40272ba-b259-4a26-9651-815006b283e7" >}} +{{< tab "DDL" >}} +```sql +-- use the existing TIMESTAMP(3) field in schema as the rowtime attribute +CREATE TABLE MyTable ( + ts_field TIMESTAMP(3), + WATERMARK FOR ts_field AS ... +) WITH ( + ... +) + +-- use system functions or UDFs or expressions to extract the expected TIMESTAMP(3) rowtime field +CREATE TABLE MyTable ( + log_ts STRING, + ts_field AS TO_TIMESTAMP(log_ts), + WATERMARK FOR ts_field AS ... +) WITH ( + ... +) +``` +{{< /tab >}} +{{< /tabs >}} + +The following watermark strategies are supported: + +{{< tabs "e004ebfb-75b1-4d81-80ff-ac5420744b75" >}} +{{< tab "DDL" >}} +```sql +-- Sets a watermark strategy for strictly ascending rowtime attributes. Emits a watermark of the +-- maximum observed timestamp so far. Rows that have a timestamp bigger to the max timestamp +-- are not late. +CREATE TABLE MyTable ( + ts_field TIMESTAMP(3), + WATERMARK FOR ts_field AS ts_field +) WITH ( + ... +) + +-- Sets a watermark strategy for ascending rowtime attributes. Emits a watermark of the maximum +-- observed timestamp so far minus 1. Rows that have a timestamp bigger or equal to the max timestamp +-- are not late. +CREATE TABLE MyTable ( + ts_field TIMESTAMP(3), + WATERMARK FOR ts_field AS ts_field - INTERVAL '0.001' SECOND +) WITH ( + ... +) + +-- Sets a watermark strategy for rowtime attributes which are out-of-order by a bounded time interval. +-- Emits watermarks which are the maximum observed timestamp minus the specified delay, e.g. 2 seconds. +CREATE TABLE MyTable ( + ts_field TIMESTAMP(3), + WATERMARK FOR ts_field AS ts_field - INTERVAL '2' SECOND +) WITH ( + ... +) +``` +{{< /tab >}} +{{< /tabs >}} + +Make sure to always declare both timestamps and watermarks. Watermarks are required for triggering time-based operations. + +### SQL Types + +Please see the [Data Types]({{< ref "docs/dev/table/types" >}}) page about how to declare a type in SQL. + +{{< top >}} diff --git a/docs/content.zh/docs/connectors/table/print.md b/docs/content.zh/docs/connectors/table/print.md new file mode 100644 index 0000000000000..6354389d70705 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/print.md @@ -0,0 +1,146 @@ +--- +title: Print +weight: 14 +type: docs +aliases: + - /zh/dev/table/connectors/print.html +--- + + +# Print SQL 连接器 + +{{< label "Sink" >}} + +Print 连接器允许将每一行写入标准输出流或者标准错误流。 + +设计目的: + +- 简单的流作业测试。 +- 对生产调试带来极大便利。 + +四种 format 选项: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    打印内容条件 1条件 2
    标识符:任务 ID> 输出数据
    需要提供前缀打印标识符parallelism > 1
    标识符> 输出数据
    需要提供前缀打印标识符parallelism == 1
    任务 ID> 输出数据
    不需要提供前缀打印标识符parallelism > 1
    输出数据
    不需要提供前缀打印标识符parallelism == 1
    + +输出字符串格式为 "$row_kind(f0,f1,f2...)",row_kind是一个 `RowKind` 类型的短字符串,例如:"+I(1,1)"。 + +Print 连接器是内置的。 + +注意 在任务运行时使用 Print Sinks 打印记录,你需要注意观察任务日志。 + +如何创建一张基于 Print 的表 +---------------- + +```sql +CREATE TABLE print_table ( + f0 INT, + f1 INT, + f2 STRING, + f3 DOUBLE +) WITH ( + 'connector' = 'print' +) +``` + +或者,也可以通过 [LIKE子句]({{< ref "docs/dev/table/sql/create" >}}#create-table) 基于已有表的结构去创建新表。 + +{{< tabs "0baef2bc-71e4-4507-9152-349bdf2420a4" >}} +{{< tab "SQL" >}} +```sql +CREATE TABLE print_table WITH ('connector' = 'print') +LIKE source_table (EXCLUDING ALL) +``` +{{< /tab >}} +{{< /tabs >}} + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值数据类型描述
    connector
    必选(none)String指定要使用的连接器,此处应为 'print'
    print-identifier
    可选(none)String配置一个标识符作为输出数据的前缀。
    standard-error
    可选falseBoolean如果 format 需要打印为标准错误而不是标准输出,则为 True 。
    sink.parallelism
    可选(none)Integer为 Print sink operator 定义并行度。默认情况下,并行度由框架决定,和链在一起的上游 operator 一致。
    diff --git a/docs/content.zh/docs/connectors/table/upsert-kafka.md b/docs/content.zh/docs/connectors/table/upsert-kafka.md new file mode 100644 index 0000000000000..d269e6717e506 --- /dev/null +++ b/docs/content.zh/docs/connectors/table/upsert-kafka.md @@ -0,0 +1,265 @@ +--- +title: Upsert Kafka +weight: 4 +type: docs +aliases: + - /zh/dev/table/connectors/upsert-kafka.html +--- + + +# Upsert Kafka SQL 连接器 + +{{< label "Scan Source: Unbounded" >}} +{{< label "Sink: Streaming Upsert Mode" >}} + +Upsert Kafka 连接器支持以 upsert 方式从 Kafka topic 中读取数据并将数据写入 Kafka topic。 + +作为 source,upsert-kafka 连接器生产 changelog 流,其中每条数据记录代表一个更新或删除事件。更准确地说,数据记录中的 value 被解释为同一 key 的最后一个 value 的 UPDATE,如果有这个 key(如果不存在相应的 key,则该更新被视为 INSERT)。用表来类比,changelog 流中的数据记录被解释为 UPSERT,也称为 INSERT/UPDATE,因为任何具有相同 key 的现有行都被覆盖。另外,value 为空的消息将会被视作为 DELETE 消息。 + +作为 sink,upsert-kafka 连接器可以消费 changelog 流。它会将 INSERT/UPDATE_AFTER 数据作为正常的 Kafka 消息写入,并将 DELETE 数据以 value 为空的 Kafka 消息写入(表示对应 key 的消息被删除)。Flink 将根据主键列的值对数据进行分区,从而保证主键上的消息有序,因此同一主键上的更新/删除消息将落在同一分区中。 + +依赖 +------------ + +{{< sql_download_table "upsert-kafka" >}} + +完整示例 +---------------- + +下面的示例展示了如何创建和使用 Upsert Kafka 表: + +```sql +CREATE TABLE pageviews_per_region ( + user_region STRING, + pv BIGINT, + uv BIGINT, + PRIMARY KEY (user_region) NOT ENFORCED +) WITH ( + 'connector' = 'upsert-kafka', + 'topic' = 'pageviews_per_region', + 'properties.bootstrap.servers' = '...', + 'key.format' = 'avro', + 'value.format' = 'avro' +); + +CREATE TABLE pageviews ( + user_id BIGINT, + page_id BIGINT, + viewtime TIMESTAMP, + user_region STRING, + WATERMARK FOR viewtime AS viewtime - INTERVAL '2' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'pageviews', + 'properties.bootstrap.servers' = '...', + 'format' = 'json' +); + +-- 计算 pv、uv 并插入到 upsert-kafka sink +INSERT INTO pageviews_per_region +SELECT + user_region, + COUNT(*), + COUNT(DISTINCT user_id) +FROM pageviews +GROUP BY user_region; + +``` + +注意 确保在 DDL 中定义主键。 + +Available Metadata +------------------ + +See the [regular Kafka connector]({{< ref "docs/connectors/datastream/kafka" >}}#available-metadata) for a list +of all available metadata fields. + +连接器参数 +---------------- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    参数是否必选默认值数据类型描述
    connector
    必选(none)String指定要使用的连接器,Upsert Kafka 连接器使用:'upsert-kafka'
    topic
    必选(none)String用于读取和写入的 Kafka topic 名称。
    properties.bootstrap.servers
    必选(none)String以逗号分隔的 Kafka brokers 列表。
    properties.*
    可选(none)String + 该选项可以传递任意的 Kafka 参数。选项的后缀名必须匹配定义在 Kafka 参数文档中的参数名。 + Flink 会自动移除 选项名中的 "properties." 前缀,并将转换后的键名以及值传入 KafkaClient。 例如,你可以通过 'properties.allow.auto.create.topics' = 'false' + 来禁止自动创建 topic。 但是,某些选项,例如'key.deserializer''value.deserializer' 是不允许通过该方式传递参数,因为 Flink 会重写这些参数的值。 +
    key.format
    必选(none)String用于对 Kafka 消息中 key 部分序列化和反序列化的格式。key 字段由 PRIMARY KEY 语法指定。支持的格式包括 'csv''json''avro'。请参考}}">格式页面以获取更多详细信息和格式参数。 +
    key.fields-prefix
    optional(none)StringDefines a custom prefix for all fields of the key format to avoid name clashes with fields + of the value format. By default, the prefix is empty. If a custom prefix is defined, both the + table schema and 'key.fields' will work with prefixed names. When constructing the + data type of the key format, the prefix will be removed and the non-prefixed names will be used + within the key format. Please note that this option requires that 'value.fields-include' + must be set to 'EXCEPT_KEY'. +
    value.format
    必选(none)String用于对 Kafka 消息中 value 部分序列化和反序列化的格式。支持的格式包括 'csv''json''avro'。请参考}}">格式页面以获取更多详细信息和格式参数。 +
    value.fields-include
    必选'ALL'String控制哪些字段应该出现在 value 中。可取值: +
      +
    • ALL:消息的 value 部分将包含 schema 中所有的字段,包括定义为主键的字段。
    • +
    • EXCEPT_KEY:记录的 value 部分包含 schema 的所有字段,定义为主键的字段除外。
    • +
    +
    sink.parallelism
    可选(none)Integer定义 upsert-kafka sink 算子的并行度。默认情况下,由框架确定并行度,与上游链接算子的并行度保持一致。
    sink.buffer-flush.max-rows
    可选0Integer缓存刷新前,最多能缓存多少条记录。当 sink 收到很多同 key 上的更新时,缓存将保留同 key 的最后一条记录,因此 sink 缓存能帮助减少发往 Kafka topic 的数据量,以及避免发送潜在的 tombstone 消息。 + 可以通过设置为 '0' 来禁用它。默认,该选项是未开启的。注意,如果要开启 sink 缓存,需要同时设置 'sink.buffer-flush.max-rows' + 和 'sink.buffer-flush.interval' 两个选项为大于零的值。
    sink.buffer-flush.interval
    可选0Duration缓存刷新的间隔时间,超过该时间后异步线程将刷新缓存数据。当 sink 收到很多同 key 上的更新时,缓存将保留同 key 的最后一条记录,因此 sink 缓存能帮助减少发往 Kafka topic 的数据量,以及避免发送潜在的 tombstone 消息。 + 可以通过设置为 '0' 来禁用它。默认,该选项是未开启的。注意,如果要开启 sink 缓存,需要同时设置 'sink.buffer-flush.max-rows' + 和 'sink.buffer-flush.interval' 两个选项为大于零的值。
    + +特性 +---------------- + +### Key and Value Formats + +See the [regular Kafka connector]({{< ref "docs/connectors/datastream/kafka" >}}#key-and-value-formats) for more +explanation around key and value formats. However, note that this connector requires both a key and +value format where the key fields are derived from the `PRIMARY KEY` constraint. + +The following example shows how to specify and configure key and value formats. The format options are +prefixed with either the `'key'` or `'value'` plus format identifier. + +```sql +CREATE TABLE KafkaTable ( + `ts` TIMESTAMP(3) METADATA FROM 'timestamp', + `user_id` BIGINT, + `item_id` BIGINT, + `behavior` STRING, + PRIMARY KEY (`user_id`) NOT ENFORCED +) WITH ( + 'connector' = 'upsert-kafka', + ... + + 'key.format' = 'json', + 'key.json.ignore-parse-errors' = 'true', + + 'value.format' = 'json', + 'value.json.fail-on-missing-field' = 'false', + 'value.fields-include' = 'EXCEPT_KEY' +) +``` + + +### 主键约束 + +Upsert Kafka 始终以 upsert 方式工作,并且需要在 DDL 中定义主键。在具有相同主键值的消息按序存储在同一个分区的前提下,在 changlog source 定义主键意味着 在物化后的 changelog 上主键具有唯一性。定义的主键将决定哪些字段出现在 Kafka 消息的 key 中。 + +### 一致性保证 + +默认情况下,如果[启用 checkpoint]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}#enabling-and-configuring-checkpointing),Upsert Kafka sink 会保证至少一次将数据插入 Kafka topic。 + +这意味着,Flink 可以将具有相同 key 的重复记录写入 Kafka topic。但由于该连接器以 upsert 的模式工作,该连接器作为 source 读入时,可以确保具有相同主键值下仅最后一条消息会生效。因此,upsert-kafka 连接器可以像 [HBase sink]({{< ref "docs/connectors/table/hbase" >}}) 一样实现幂等写入。 + +### 为每个分区生成相应的 watermark + +Flink 支持根据 Upsert Kafka 的 每个分区的数据特性发送相应的 watermark。当使用这个特性的时候,watermark 是在 Kafka consumer 内部生成的。 合并每个分区 +生成的 watermark 的方式和 stream shuffle 的方式是一致的。 数据源产生的 watermark 是取决于该 consumer 负责的所有分区中当前最小的 watermark。如果该 +consumer 负责的部分分区是 idle 的,那么整体的 watermark 并不会前进。在这种情况下,可以通过设置合适的 [table.exec.source.idle-timeout]({{< ref "docs/dev/table/config" >}}#table-exec-source-idle-timeout) +来缓解这个问题。 + +如想获得更多细节,请查阅 [Kafka watermark strategies]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}#watermark-strategies-and-the-kafka-connector). 
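+
+例如(仅作示意,`SET` 语句的具体语法可能随 Flink 版本略有不同),可以在 SQL Client 中先设置该选项,再提交前文示例中的查询;其中 30 秒的空闲超时只是一个示例值:
+
+```sql
+-- 仅作示意:当 upsert-kafka source 的部分 Kafka 分区空闲超过 30 秒时,
+-- 允许整体 watermark 继续向前推进
+SET 'table.exec.source.idle-timeout' = '30 s';
+
+-- 随后提交的查询(如前文完整示例中的作业)会使用上述配置
+INSERT INTO pageviews_per_region
+SELECT
+  user_region,
+  COUNT(*),
+  COUNT(DISTINCT user_id)
+FROM pageviews
+GROUP BY user_region;
+```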
+ +数据类型映射 +---------------- + +Upsert Kafka 用字节存储消息的 key 和 value,因此没有 schema 或数据类型。消息按格式进行序列化和反序列化,例如:csv、json、avro。因此数据类型映射表由指定的格式确定。请参考[格式]({{< ref "docs/connectors/table/formats/overview" >}})页面以获取更多详细信息。 + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/_index.md b/docs/content.zh/docs/deployment/_index.md new file mode 100644 index 0000000000000..e7d0c95f80d2e --- /dev/null +++ b/docs/content.zh/docs/deployment/_index.md @@ -0,0 +1,25 @@ +--- +title: Deployment +icon: +bold: true +bookCollapseSection: true +weight: 7 +--- + diff --git a/docs/content.zh/docs/deployment/advanced/_index.md b/docs/content.zh/docs/deployment/advanced/_index.md new file mode 100644 index 0000000000000..cf0dc11458325 --- /dev/null +++ b/docs/content.zh/docs/deployment/advanced/_index.md @@ -0,0 +1,23 @@ +--- +title: Advanced +bookCollapseSection: true +weight: 10 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/advanced/external_resources.md b/docs/content.zh/docs/deployment/advanced/external_resources.md new file mode 100644 index 0000000000000..95144c2b2aa82 --- /dev/null +++ b/docs/content.zh/docs/deployment/advanced/external_resources.md @@ -0,0 +1,342 @@ +--- +title: 扩展资源 +weight: 2 +type: docs +aliases: + - /zh/deployment/advanced/external_resources.html + - /zh/ops/external_resources.html +--- + + +# 扩展资源框架 + +许多计算任务需要使用除了 CPU 与内存外的资源,如用深度学习场景需要使用 GPU 来进行加速。为了支持这种扩展资源,Flink 提供了一个扩展资源框架。 +该框架支持从底层资源管理系统(如 Kubernetes)请求各种类型的资源,并向算子提供使用这些资源所需的信息。该框架以插件形式支持不同的资源类型。 +目前 Flink 仅内置了支持 GPU 资源的插件,你可以为你想使用的资源类型实现第三方插件。 + + + + + +# 扩展资源框架做了什么 + +扩展资源(External Resource)框架主要做了以下两件事: + + - 根据你的配置,在 Flink 从底层资源管理系统中申请资源时,设置与扩展资源相关的请求字段 + + - 为算子提供使用这些资源所需要的*信息* + +当 Flink 部署在资源管理系统(Kubernetes、Yarn)上时,扩展资源框架将确保分配的 Pod、Container 包含所需的扩展资源。目前,许多资源管理系统都支持扩展资源。 +例如,Kubernetes 从 v1.10 开始通过 [Device Plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/) 机制支持 GPU、FPGA 等资源调度,Yarn 从 2.10 和 3.1 开始支持 GPU 和 FPGA 的调度。 +目前,扩展资源框架并不支持 Mesos 模式。在 Standalone 模式下,由用户负责确保扩展资源的可用性。 + +扩展资源框架向算子提供扩展资源相关*信息*,这些信息由你配置的扩展资源 *Driver* 生成,包含了使用扩展资源所需要的基本属性。 + + + +# 启用扩展资源框架 + +为了启用扩展资源框架来使用扩展资源,你需要: + + - 为该扩展资源准备扩展资源框架的*插件* + + - 为该扩展资源设置相关的配置 + + - 在你的算子中,从 `RuntimeContext` 来获取扩展资源的*信息*并使用这些资源 + + + +## 准备插件 + +你需要为使用的扩展资源准备插件,并将其放入 Flink 发行版的 `plugins/` 文件夹中, 参看 [Flink Plugins]({{< ref "docs/deployment/filesystems/plugins" >}})。 +Flink 提供了第一方的 [GPU 资源插件](#plugin-for-gpu-resources)。你同样可以为你所使用的扩展资源实现自定义插件[实现自定义插件](#implement-a-plugin-for-your-custom-resource-type)。 + +## 配置项 + +首先,你需要使用分隔符“;”将所有使用的扩展资源类型的资源名称添加到**扩展资源列表(配置键“external-resources”)**中,例如,“external-resources: gpu;fpga”定义了两个扩展资源“gpu”和“fpga”。 +只有此处定义了扩展资源名称(**\**),相应的资源才会在扩展资源框架中生效。 + +对于每个扩展资源,有以下配置选项。下面的所有配置选项中的 **\** 对应于**扩展资源列表**中列出的名称: + + - **数量** (`external..amount`):需要从外部系统请求的扩展资源的数量。 + + - **Yarn 中的扩展资源配置键** (`external-resource..yarn.config-key`):*可选配置*。如果配置该项,扩展资源框架将把这个键添加到 Yarn 的容器请求的资源配置中,该键对应的值将被设置为`external-resource..amount`。 + + - **Kubernetes 中的扩展资源配置键** (`external-resource..kubernetes.config-key`):*可选配置*。 + 如果配置该项,扩展资源框架将添加 `resources.limits.` 和 `resources.requests.` 到 TaskManager 的主容器配置中,对应的值将被设置为 `external-resource..amount`。 + + - **Driver 工厂类** (`external-resource..driver-factory.class`):*可选配置*。定义由 **\** 标识的扩展资源对应的工厂类名。如果配置该项,该工厂类将被用于实例化扩展资源框架中所需要的 *drivers*。 + 如果没有配置,扩展资源依然会在其他配置正确时在存在于 `TaskManager`,只是算子在这种情况下无法从 `RuntimeContext` 中拿到该资源的信息。 + + - **Driver 自定义参数** (`external-resource..param.`):*可选配置*。由 **\** 标识的扩展资源的自定义配置选项的命名模式。只有遵循此模式的配置才会传递到该扩展资源的工厂类。 + +示例配置,该配置定义两个扩展资源: + 
+```bash +external-resources: gpu;fpga # 定义两个扩展资源,“gpu”和“fpga”。 + +external-resource.gpu.driver-factory.class: org.apache.flink.externalresource.gpu.GPUDriverFactory # 定义 GPU 资源对应 Driver 的工厂类。 +external-resource.gpu.amount: 2 # 定义每个 TaskManager 所需的 GPU 数量。 +external-resource.gpu.param.discovery-script.args: --enable-coordination # 自定义参数 discovery-script.args,它将被传递到 GPU 对应的 Driver 中。 + +external-resource.fpga.driver-factory.class: org.apache.flink.externalresource.fpga.FPGADriverFactory # 定义 FPGA 资源对应 Driver 的工厂类。 +external-resource.fpga.amount: 1 # 定义每个 TaskManager 所需的 FPGA 数量。 +external-resource.fpga.yarn.config-key: yarn.io/fpga # 定义 FPGA 在 Yarn 中对应的配置键。 +``` + +## 使用扩展资源 + +为了使用扩展资源,算子需要从 `RuntimeContext` 获取 `ExternalResourceInfo` 集合。 `ExternalResourceInfo` 包含了使用扩展资源所需的信息,可以使用 `getProperty` 检索这些信息。 +其中具体包含哪些属性以及如何使用这些属性取决于特定的扩展资源插件。 + +算子可以通过 `getExternalResourceInfos(String resourceName)` 从 `RuntimeContext` 或 `FunctionContext` 中获取特定扩展资源的 `ExternalResourceInfo`。 +此处的 `resourceName` 应与在扩展资源列表中定义的名称相同。具体用法如下: + +{{< tabs "5e1a48c8-14ae-4836-b5fd-84879c4bf36d" >}} +{{< tab "Java" >}} +```java +public class ExternalResourceMapFunction extends RichMapFunction { + private static final String RESOURCE_NAME = "foo"; + + @Override + public String map(String value) { + Set externalResourceInfos = getRuntimeContext().getExternalResourceInfos(RESOURCE_NAME); + List addresses = new ArrayList<>(); + externalResourceInfos.iterator().forEachRemaining(externalResourceInfo -> + addresses.add(externalResourceInfo.getProperty("address").get())); + // map function with addresses. + // ... + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class ExternalResourceMapFunction extends RichMapFunction[(String, String)] { + var RESOURCE_NAME = "foo" + + override def map(value: String): String = { + val externalResourceInfos = getRuntimeContext().getExternalResourceInfos(RESOURCE_NAME) + val addresses = new util.ArrayList[String] + externalResourceInfos.asScala.foreach( + externalResourceInfo => addresses.add(externalResourceInfo.getProperty("address").get())) + + // map function with addresses. + // ... + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +`ExternalResourceInfo` 中包含一个或多个键-值对,其键值表示资源的不同维度。你可以通过 `ExternalResourceInfo#getKeys` 获取所有的键。 + +
    + 提示:目前,RuntimeContext#getExternalResourceInfos 返回的信息对所有算子都是可用的。 +
    + + + +# 为你所使用的扩展资源实现自定义插件 + +要为你所使用的扩展资源实现自定义插件,你需要: + + - 添加你自定义的扩展资源 Driver ,该 Driver 需要实现 `org.apache.flink.api.common.externalresource.ExternalResourceDriver` 接口。 + + - 添加用来实例化 *Driver* 的工厂类,该工厂类需要实现 `org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory` 接口。 + + - 添加服务入口。创建 `META-INF/services/org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory` 文件,其中包含了 *Driver* 对应工厂类的类名(更多细节请参看 [Java Service Loader](https://docs.oracle.com/javase/8/docs/api/java/util/ServiceLoader.html))。 + +例如,要为名为“FPGA”的扩展资源实现插件,你首先需要实现 `FPGADriver` 和 `FPGADriverFactory`: + +{{< tabs "b44c0b2c-52ef-4281-8a93-40ca3843c3b8" >}} +{{< tab "Java" >}} +```java +public class FPGADriver implements ExternalResourceDriver { + @Override + public Set retrieveResourceInfo(long amount) { + // return the information set of "FPGA" + } +} + +public class FPGADriverFactory implements ExternalResourceDriverFactory { + @Override + public ExternalResourceDriver createExternalResourceDriver(Configuration config) { + return new FPGADriver(); + } +} + +// Also implement FPGAInfo which contains basic properties of "FPGA" resource. +public class FPGAInfo implements ExternalResourceInfo { + @Override + public Optional getProperty(String key) { + // return the property with the given key. + } + + @Override + public Collection getKeys() { + // return all property keys. + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class FPGADriver extends ExternalResourceDriver { + override def retrieveResourceInfo(amount: Long): Set[FPGAInfo] = { + // return the information set of "FPGA" + } +} + +class FPGADriverFactory extends ExternalResourceDriverFactory { + override def createExternalResourceDriver(config: Configuration): ExternalResourceDriver = { + new FPGADriver() + } +} + +// Also implement FPGAInfo which contains basic properties of "FPGA" resource. +class FPGAInfo extends ExternalResourceInfo { + override def getProperty(key: String): Option[String] = { + // return the property with the given key. + } + + override def getKeys(): util.Collection[String] = { + // return all property keys. + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +在 `META-INF/services/` 中创建名为 `org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory` 的文件,向其中写入工厂类名,如 `your.domain.FPGADriverFactory`。 + +之后,将 `FPGADriver`,`FPGADriverFactory`,`META-INF/services/` 和所有外部依赖打入 jar 包。在你的 Flink 发行版的 `plugins/` 文件夹中创建一个名为“fpga”的文件夹,将打好的 jar 包放入其中。 +更多细节请查看 [Flink Plugin]({{< ref "docs/deployment/filesystems/plugins" >}})。 + +
    + 提示: 扩展资源由运行在同一台机器上的所有算子共享。社区可能会在未来的版本中支持外部资源隔离。 +
    + +# 已支持的扩展资源插件 + +目前,Flink提供 GPU 资源插件。 + + + +## GPU 插件 + +我们为 GPU 提供了第一方插件。该插件利用一个脚本来发现 GPU 设备的索引,该索引可通过“index”从 `ExternalResourceInfo` 中获取。我们提供了一个默认脚本,可以用来发现 NVIDIA GPU。您还可以提供自定义脚本。 + +我们提供了[一个示例程序](https://github.com/apache/flink/blob/master/flink-examples/flink-examples-streaming/src/main/java/org/apache/flink/streaming/examples/gpu/MatrixVectorMul.java),展示了如何在 Flink 中使用 GPU 资源来做矩阵-向量乘法。 + +
    + 提示:目前,对于所有算子,RuntimeContext#getExternalResourceInfos 会返回同样的资源信息。也即,在同一个 TaskManager 中运行的所有算子都可以访问同一组 GPU 设备。扩展资源目前没有算子级别的隔离。 +
    + +### 前置准备 + +要使 GPU 资源可访问,根据您的环境,需要满足以下先决条件: + + - 对于 Standalone 模式,集群管理员应确保已安装 NVIDIA 驱动程序,并且集群中所有节点上的 GPU 资源都是可访问的。 + + - 对于 Yarn 上部署,管理员需要配置 Yarn 集群使其[支持 GPU 调度](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/UsingGpus.html)。 + 请注意,所需的 Hadoop 版本是 2.10+ 和 3.1+。 + + - 对于 Kubernetes 上部署,管理员需要保证 NVIDIA GPU 的 [Device Plugin](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/) 已在集群上安装。 + 请注意,所需的 Kubernetes 版本是 1.10+。目前,Kubernetes只支持 NVIDIA GPU 和 AMD GPU。Flink 只提供了 NVIDIA GPU 的脚本,但你可以提供支持 AMD GPU 的自定义脚本,参看 [发现脚本](#discovery-script)。 + +### 在计算任务中使用 GPU 资源 + +如[启用扩展资源框架](#enable-the-external-resource-framework-for-your-workload)中所述,要使用 GPU 资源,还需要执行两项操作: + + - 为 GPU 资源进行相关配置。 + + - 在算子中获取 GPU 资源的*信息*,其中包含键为“index”的 GPU 索引。 + +### 配置项 + +对于 GPU 插件,你需要指定的扩展资源框架配置: + + - `external-resources`:你需要将 GPU 的扩展资源名称(例如“gpu”)加到该列表中。 + + - `external-resource..amount`:每个 TaskManager 中的 GPU 数量。 + + - `external-resource..yarn.config-key`:对于 Yarn,GPU 的配置键是 `yarn.io/gpu`。请注意,Yarn 目前只支持 NVIDIA GPU。 + + - `external-resource..kubernetes.config-key`:对于 Kubernetes,GPU 的配置键是 `.com/gpu`。 + 目前,“nvidia”和“amd”是两个支持的 GPU 品牌。请注意,如果你使用 AMD GPU,你需要提供一个自定义的[发现脚本](#discovery-script)。 + + - `external-resource..driver-factory.class`:需要设置为 org.apache.flink.externalresource.gpu.GPUDriverFactory。 + +此外,GPU 插件还有一些专有配置: + + - `external-resource..param.discovery-script.path`:[发现脚本](#discovery-script)的文件路径。 + 它既可以是绝对路径,也可以是相对路径,如果定义了“FLINK_HOME”,该路径将相对于“FLINK_HOME”,否则相对于当前目录。如果没有显式配置该项,GPU 插件将使用默认脚本。 + + - `external-resource..param.discovery-script.args`:传递给发现脚本的参数。对于默认的发现脚本,请参见[默认脚本](#default-script)以获取可用参数。 + +GPU 插件示例配置: + +```bash +external-resources: gpu +external-resource.gpu.driver-factory.class: org.apache.flink.externalresource.gpu.GPUDriverFactory # 定义 GPU 资源的工厂类。 +external-resource.gpu.amount: 2 # 定义每个 TaskManager 的 GPU 数量。 +external-resource.gpu.param.discovery-script.path: plugins/external-resource-gpu/nvidia-gpu-discovery.sh +external-resource.gpu.param.discovery-script.args: --enable-coordination # 自定义参数,将被传递到 GPU 的 Driver 中。 + +external-resource.gpu.yarn.config-key: yarn.io/gpu # for Yarn + +external-resource.gpu.kubernetes.config-key: nvidia.com/gpu # for Kubernetes +``` + + + +### 发现脚本 + +`GPUDriver` 利用发现脚本来发现 GPU 资源并生成 GPU 资源信息。 + + + +#### 默认脚本 + +我们为 NVIDIA GPU 提供了一个默认脚本,位于 Flink 发行版的 `plugins/external-resource-gpu/nvidia-gpu-discovery.sh`。 +该脚本通过 `nvidia-smi` 工具获取当前可见 GPU 的索引。它尝试返回一个 GPU 索引列表,其大小由 `external-resource..amount` 指定,如果 GPU 数量不足,则以非零退出。 + +在 Standalone 模式中,多个 TaskManager 可能位于同一台机器上,并且每个 GPU 设备对所有 TaskManager 都是可见的。 +默认脚本提供 GPU 协调模式,在这种模式下,脚本利用文件来同步 GPU 的分配情况,并确保每个GPU设备只能由一个TaskManager进程使用。相关参数为: + + - `--enable-coordination-mode`:启用 GPU 协调模式。默认情况下不启用。 + + - `--coordination-file filePath`:用于同步 GPU 资源分配状态的文件路径。默认路径为 `/var/tmp/flink-gpu-coordination`。 + +
    + 提示:协调模式只确保一个 GPU 设备不会被同一个 Flink 集群的多个 TaskManager 共享。不同 Flink 集群间(具有不同的协调文件)或非 Flink 应用程序仍然可以使用相同的 GPU 设备。 +
    + +#### 自定义脚本 + +你可以提供一个自定义的发现脚本来满足你的特殊需求,例如使用 AMD GPU。请确保自定义脚本的的路径正确配置(`external-resource..param.discovery-script.path`)并且 Flink 可以访问。自定义的发现脚本需要: + + - `GPUDriver` 将 GPU 数量(由 `external-resource..amount` 定义)作为第一个参数传递到脚本中。 + `external-resource..param.discovery-script.args` 中自定义的参数会被附加在后面。 + + - 脚本需返回可用 GPU 索引的列表,用逗号分隔。空白的索引将被忽略。 + + - 脚本可以通过以非零退出来表示其未正确执行。在这种情况下,算子将不会得到 GPU 资源相关信息。 diff --git a/docs/content.zh/docs/deployment/advanced/historyserver.md b/docs/content.zh/docs/deployment/advanced/historyserver.md new file mode 100644 index 0000000000000..27944bd4d9441 --- /dev/null +++ b/docs/content.zh/docs/deployment/advanced/historyserver.md @@ -0,0 +1,109 @@ +--- +title: "History Server" +weight: 3 +type: docs +aliases: + - /zh/deployment/advanced/historyserver.html + - /zh/monitoring/historyserver.html +--- + + +# History Server + +Flink 提供了 history server,可以在相应的 Flink 集群关闭之后查询已完成作业的统计信息。 + +此外,它暴露了一套 REST API,该 API 接受 HTTP 请求并返回 JSON 格式的数据。 + + + + + +## 概览 + +HistoryServer 允许查询 JobManager 存档的已完成作业的状态和统计信息。 + +在配置 HistoryServer *和* JobManager 之后,你可以使用相应的脚本来启动和停止 HistoryServer: + +```shell +# 启动或者停止 HistoryServer +bin/historyserver.sh (start|start-foreground|stop) +``` + +默认情况下,此服务器绑定到 `localhost` 的 `8082` 端口。 + +目前,只能将 HistoryServer 作为独立的进程运行。 + + + +## 配置参数 + +配置项 `jobmanager.archive.fs.dir` 和 `historyserver.archive.fs.refresh-interval` 需要根据 `作业存档目录` 和 `刷新作业存档目录的时间间隔` 进行调整。 + +**JobManager** + +已完成作业的存档在 JobManager 上进行,将已存档的作业信息上传到文件系统目录中。你可以在 `flink-conf.yaml` 文件中通过 `jobmanager.archive.fs.dir` 设置一个目录存档已完成的作业。 + +```yaml +# 上传已完成作业信息的目录 +jobmanager.archive.fs.dir: hdfs:///completed-jobs +``` + +**HistoryServer** + +可以通过 `historyserver.archive.fs.dir` 设置 HistoryServer 监视以逗号分隔的目录列表。定期轮询已配置的目录以查找新的存档;轮询间隔可以通过 `historyserver.archive.fs.refresh-interval` 来配置。 + +```yaml +# 监视以下目录中已完成的作业 +historyserver.archive.fs.dir: hdfs:///completed-jobs + +# 每 10 秒刷新一次 +historyserver.archive.fs.refresh-interval: 10000 +``` + +所包含的存档被下载缓存在本地文件系统中。本地目录通过 `historyserver.web.tmpdir` 配置。 + +请查看配置页面以获取[配置选项的完整列表]({{< ref "docs/deployment/config" >}}#history-server)。 + + + +## 可用的请求 + +以下是可用且带有示例 JSON 响应的请求列表。所有请求格式样例均为 `http://hostname:8082/jobs`,下面我们仅列出了 URLs 的 *path* 部分。 +尖括号中的值为变量,例如作业 `7684be6004e4e955c2a558a9bc463f65` 的 +`http://hostname:port/jobs//exceptions` 请求须写为 `http://hostname:port/jobs/7684be6004e4e955c2a558a9bc463f65/exceptions`。 + + - `/config` + - `/jobs/overview` + - `/jobs/` + - `/jobs//vertices` + - `/jobs//config` + - `/jobs//exceptions` + - `/jobs//accumulators` + - `/jobs//vertices/` + - `/jobs//vertices//subtasktimes` + - `/jobs//vertices//taskmanagers` + - `/jobs//vertices//accumulators` + - `/jobs//vertices//subtasks/accumulators` + - `/jobs//vertices//subtasks/` + - `/jobs//vertices//subtasks//attempts/` + - `/jobs//vertices//subtasks//attempts//accumulators` + - `/jobs//plan` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/advanced/logging.md b/docs/content.zh/docs/deployment/advanced/logging.md new file mode 100644 index 0000000000000..e276ffca8fd72 --- /dev/null +++ b/docs/content.zh/docs/deployment/advanced/logging.md @@ -0,0 +1,126 @@ +--- +title: 日志 +weight: 4 +type: docs +aliases: + - /zh/deployment/advanced/logging.html + - /zh/monitoring/logging.html +--- + + +# 如何使用日志记录 + +Flink 中的日志记录是使用 slf4j 日志接口实现的。使用 log4j2 作为底层日志框架。我们也支持了 logback 日志配置,只要将其配置文件作为参数传递给 JVM 即可。愿意使用 logback 而不是 log4j2 的用户只需排除 log4j2 的依赖(或从 lib/ 文件夹中删除它)即可。 + + + + + +## 配置 Log4j2 + +Log4j2 是使用配置文件指定的。在 Flink 的使用中,该文件通常命名为 `log4j.properties`。我们使用 `-Dlog4j.configurationFile=` 参数将该文件的文件名和位置传递给 
JVM。 + +Flink 附带以下默认日志配置文件: + +- `log4j-cli.properties`:由 Flink 命令行客户端使用(例如 `flink run`)(不包括在集群上执行的代码) +- `log4j-session.properties`:Flink 命令行客户端在启动 YARN 或 Kubernetes session 时使用(`yarn-session.sh`,`kubernetes-session.sh`) +- `log4j.properties`:作为 JobManager/TaskManager 日志配置使用(standalone 和 YARN 两种模式下皆使用) + + + +### 与 Log4j1 的兼容性 + +Flink 附带了 [Log4j API bridge](https://logging.apache.org/log4j/log4j-2.2/log4j-1.2-api/index.html),使得现有作业能够继续使用 log4j1 的接口。 + +如果你有基于 Log4j 的自定义配置文件或代码,请查看官方 Log4j [兼容性](https://logging.apache.org/log4j/2.x/manual/compatibility.html)和[迁移](https://logging.apache.org/log4j/2.x/manual/migration.html)指南。 + + + +## 配置 Log4j1 + +要将 Flink 与 Log4j1 一起使用,必须确保: +- Classpath 中不存在 `org.apache.logging.log4j:log4j-core`,`org.apache.logging.log4j:log4j-slf4j-impl` 和 `org.apache.logging.log4j:log4j-1.2-api`, +- 且 Classpath 中存在 `log4j:log4j`,`org.slf4j:slf4j-log4j12`,`org.apache.logging.log4j:log4j-to-slf4j` 和 `org.apache.logging.log4j:log4j-api`。 + +在 IDE 中使用 log4j1,你必须在 pom 文件中使用上述 `Classpath 中存在的 jars` 依赖项替换 `Classpath 中不存在的 jars` 依赖项,并尽可能在传递依赖于 `Classpath 中不存在的 jars` 的依赖项上添加排除 `Classpath 中不存在的 jars` 配置。 + +对于 Flink 发行版,这意味着你必须 +- 从 `lib` 目录中移除 `log4j-core`,`log4j-slf4j-impl` 和 `log4j-1.2-api` jars, +- 向 `lib` 目录中添加 `log4j`,`slf4j-log4j12` 和 `log4j-to-slf4j` jars, +- 用兼容的 Log4j1 版本替换 `conf` 目录中的所有 log4j 配置文件。 + + + +## 配置 logback + +对于用户和开发人员来说,控制日志框架非常重要。日志框架的配置完全由配置文件完成。必须通过设置环境参数 `-Dlogback.configurationFile=` 或将 `logback.xml` 放在 classpath 中来指定配置文件。`conf` 目录包含一个 `logback.xml` 文件,该文件可以修改,如果使用附带的启动脚本在 IDE 之外启动 Flink 则会使用该日志配置文件。提供的 `logback.xml` 具有以下格式: + +```xml + + + ${log.file} + false + + %d{HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n + + + + + + + +``` + +例如,为了控制 `org.apache.flink.runtime.jobgraph.JobGraph` 的日志记录级别,必须将以下行添加到配置文件中。 + +```xml + +``` + +有关配置日志的更多信息,请参见 [LOGback 手册](http://logback.qos.ch/manual/configuration.html)。 + + + +## 开发人员的最佳实践 + +Slf4j 的 loggers 通过调用 `LoggerFactory` 的 `getLogger()` 方法创建 + +```java +import org.slf4j.LoggerFactory +import org.slf4j.Logger + +Logger LOG = LoggerFactory.getLogger(Foobar.class) +``` + +为了最大限度地利用 slf4j,建议使用其占位符机制。使用占位符可以避免不必要的字符串构造,以防日志级别设置得太高而不会记录消息。占位符的语法如下: + +```java +LOG.info("This message contains {} placeholders. {}", 2, "Yippie"); +``` + +占位符也可以和要记录的异常一起使用。 + +```java +catch(Exception exception){ + LOG.error("An {} occurred.", "error", exception); +} +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/cli.md b/docs/content.zh/docs/deployment/cli.md new file mode 100644 index 0000000000000..13d8bb603ad46 --- /dev/null +++ b/docs/content.zh/docs/deployment/cli.md @@ -0,0 +1,503 @@ +--- +title: 命令行界面 +weight: 5 +type: docs +aliases: + - /zh/deployment/cli.html + - /zh/apis/cli.html +--- + + +# 命令行界面 + +Flink provides a Command-Line Interface (CLI) `bin/flink` to run programs that +are packaged as JAR files and to control their execution. The CLI is part of any +Flink setup, available in local single node setups and in distributed setups. +It connects to the running JobManager specified in `conf/flink-config.yaml`. + + + +## Job Lifecycle Management + +A prerequisite for the commands listed in this section to work is to have a running Flink deployment +like [Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}), +[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) or any other option available. 
Feel free to +[start a Flink cluster locally]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}#starting-a-standalone-cluster-session-mode) +to try the commands on your own machine. + +### Submitting a Job + +Submitting a job means uploading the job's JAR and related dependencies to the Flink cluster and +initiating the job execution. For the sake of this example, we select a long-running job like +`examples/streaming/StateMachineExample.jar`. Feel free to select any other JAR archive from the +`examples/` folder or deploy your own job. +```bash +$ ./bin/flink run \ + --detached \ + ./examples/streaming/StateMachineExample.jar +``` +Submitting the job using `--detached` will make the command return after the submission is done. +The output contains (besides other things) the ID of the newly submitted job. +``` +Usage with built-in data generator: StateMachineExample [--error-rate ] [--sleep ] +Usage with Kafka: StateMachineExample --kafka-topic [--brokers ] +Options for both the above setups: + [--backend ] + [--checkpoint-dir ] + [--async-checkpoints ] + [--incremental-checkpoints ] + [--output OR null for stdout] + +Using standalone source with error rate 0.000000 and sleep delay 1 millis + +Job has been submitted with JobID cca7bc1061d61cf15238e92312c2fc20 +``` +The usage information printed lists job-related parameters that can be added to the end of the job +submission command if necessary. For the purpose of readability, we assume that the returned JobID is +stored in a variable `JOB_ID` for the commands below: +```bash +$ export JOB_ID="cca7bc1061d61cf15238e92312c2fc20" +``` + +There is another action called `run-application` available to run the job in +[Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode). This documentation does not address +this action individually as it works similarly to the `run` action in terms of the CLI frontend. + +### Job Monitoring + +You can monitor any running jobs using the `list` action: +```bash +$ ./bin/flink list +``` +``` +Waiting for response... +------------------ Running/Restarting Jobs ------------------- +30.11.2020 16:02:29 : cca7bc1061d61cf15238e92312c2fc20 : State machine job (RUNNING) +-------------------------------------------------------------- +No scheduled jobs. +``` +Jobs that were submitted but not started, yet, would be listed under "Scheduled Jobs". + +### Creating a Savepoint + +[Savepoints]({{< ref "docs/ops/state/savepoints" >}}) can be created to save the current state a job is +in. All that's needed is the JobID: +```bash +$ ./bin/flink savepoint \ + $JOB_ID \ + /tmp/flink-savepoints +``` +``` +Triggering savepoint for job cca7bc1061d61cf15238e92312c2fc20. +Waiting for response... +Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab +You can resume your program from this savepoint with the run command. +``` +The savepoint folder is optional and needs to be specified if +[state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) isn't set. + +The path to the savepoint can be used later on to [restart the Flink job](#starting-a-job-from-a-savepoint). + +#### Disposing a Savepoint + +The `savepoint` action can be also used to remove savepoints. `--dispose` with the corresponding +savepoint path needs to be added: +```bash +$ ./bin/flink savepoint \ + --dispose \ + /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \ + $JOB_ID +``` +``` +Disposing savepoint '/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab'. 
+Waiting for response... +Savepoint '/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab' disposed. +``` + +If you use custom state instances (for example custom reducing state or RocksDB state), you have to +specify the path to the program JAR with which the savepoint was triggered. Otherwise, you will run +into a `ClassNotFoundException`: +```bash +$ ./bin/flink savepoint \ + --dispose \ + --jarfile +``` + +Triggering the savepoint disposal through the `savepoint` action does not only remove the data from +the storage but makes Flink clean up the savepoint-related metadata as well. + +### Terminating a Job + +#### Stopping a Job Gracefully Creating a Final Savepoint + +Another action for stopping a job is `stop`. It is a more graceful way of stopping a running streaming +job as the `stop` flows from source to sink. When the user requests to stop a job, all sources will +be requested to send the last checkpoint barrier that will trigger a savepoint, and after the successful +completion of that savepoint, they will finish by calling their `cancel()` method. + +```bash +$ ./bin/flink stop \ + --savepointPath /tmp-flink-savepoints \ + $JOB_ID +``` +``` +Suspending job "cca7bc1061d61cf15238e92312c2fc20" with a savepoint. +Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab +``` +We have to use `--savepointPath` to specify the savepoint folder if +[state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) isn't set. + +If the `--drain` flag is specified, then a `MAX_WATERMARK` will be emitted before the last checkpoint +barrier. This will make all registered event-time timers fire, thus flushing out any state that +is waiting for a specific watermark, e.g. windows. The job will keep running until all sources properly +shut down. This allows the job to finish processing all in-flight data, which can produce some +records to process after the savepoint taken while stopping. + +{{< hint danger >}} +Use the `--drain` flag if you want to terminate the job permanently. +If you want to resume the job at a later point in time, then do not drain the pipeline because it could lead to incorrect results when the job is resumed. +{{< /hint >}} + +#### Cancelling a Job Ungracefully + +Cancelling a job can be achieved through the `cancel` action: +```bash +$ ./bin/flink cancel $JOB_ID +``` +``` +Cancelling job cca7bc1061d61cf15238e92312c2fc20. +Cancelled job cca7bc1061d61cf15238e92312c2fc20. +``` +The corresponding job's state will be transitioned from `Running` to `Cancelled`. Any computations +will be stopped. + +{{< hint danger >}} +The `--withSavepoint` flag allows creating a savepoint as part of the job cancellation. +This feature is deprecated. +Use the [stop](#stopping-a-job-gracefully-creating-a-final-savepoint) action instead. +{{< /hint >}} + +### Starting a Job from a Savepoint + +Starting a job from a savepoint can be achieved using the `run` (and `run-application`) action. 
+```bash +$ ./bin/flink run \ + --detached \ + --fromSavepoint /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \ + ./examples/streaming/StateMachineExample.jar +``` +``` +Usage with built-in data generator: StateMachineExample [--error-rate ] [--sleep ] +Usage with Kafka: StateMachineExample --kafka-topic [--brokers ] +Options for both the above setups: + [--backend ] + [--checkpoint-dir ] + [--async-checkpoints ] + [--incremental-checkpoints ] + [--output OR null for stdout] + +Using standalone source with error rate 0.000000 and sleep delay 1 millis + +Job has been submitted with JobID 97b20a0a8ffd5c1d656328b0cd6436a6 +``` + +See how the command is equal to the [initial run command](#submitting-a-job) except for the +`--fromSavepoint` parameter which is used to refer to the state of the +[previously stopped job](#stopping-a-job-gracefully-creating-a-final-savepoint). A new JobID is +generated that can be used to maintain the job. + +By default, we try to match the whole savepoint state to the job being submitted. If you want to +allow to skip savepoint state that cannot be restored with the new job you can set the +`--allowNonRestoredState` flag. You need to allow this if you removed an operator from your program +that was part of the program when the savepoint was triggered and you still want to use the savepoint. + +```bash +$ ./bin/flink run \ + --fromSavepoint \ + --allowNonRestoredState ... +``` +This is useful if your program dropped an operator that was part of the savepoint. + +{{< top >}} + +## CLI Actions + +Here's an overview of actions supported by Flink's CLI tool: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Action | Purpose |
| ------ | ------- |
| `run` | This action executes jobs. It requires at least the jar containing the job. Flink- or job-related arguments can be passed if necessary. |
| `run-application` | This action executes jobs in [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode). Other than that, it requires the same parameters as the `run` action. |
| `info` | This action can be used to print an optimized execution graph of the passed job. Again, the jar containing the job needs to be passed. |
| `list` | This action lists all running or scheduled jobs. |
| `savepoint` | This action can be used to create or dispose savepoints for a given job. It might be necessary to specify a savepoint directory besides the JobID, if the [state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) parameter was not specified in `conf/flink-conf.yaml`. |
| `cancel` | This action can be used to cancel running jobs based on their JobID. |
| `stop` | This action combines the `cancel` and `savepoint` actions to stop a running job but also create a savepoint to start from again. |
    + +A more fine-grained description of all actions and their parameters can be accessed through `bin/flink --help` +or the usage information of each individual action `bin/flink --help`. + +{{< top >}} + +## Advanced CLI + +### REST API + +The Flink cluster can be also managed using the [REST API]({{< ref "docs/ops/rest_api" >}}). The commands +described in previous sections are a subset of what is offered by Flink's REST endpoints. Therefore, +tools like `curl` can be used to get even more out of Flink. + +### Selecting Deployment Targets + +Flink is compatible with multiple cluster management frameworks like +[Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}) or +[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) which are described in more detail in the +Resource Provider section. Jobs can be submitted in different [Deployment Modes]({{< ref "docs/deployment/overview" >}}#deployment-modes). +The parameterization of a job submission differs based on the underlying framework and Deployment Mode. + +`bin/flink` offers a parameter `--target` to handle the different options. In addition to that, jobs +have to be submitted using either `run` (for [Session]({{< ref "docs/deployment/overview" >}}#session-mode) +and [Per-Job Mode]({{< ref "docs/deployment/overview" >}}#per-job-mode)) or `run-application` (for +[Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode)). See the following summary of +parameter combinations: +* YARN + * `./bin/flink run --target yarn-session`: Submission to an already running Flink on YARN cluster + * `./bin/flink run --target yarn-per-job`: Submission spinning up a Flink on YARN cluster in Per-Job Mode + * `./bin/flink run-application --target yarn-application`: Submission spinning up Flink on YARN cluster in Application Mode +* Kubernetes + * `./bin/flink run --target kubernetes-session`: Submission to an already running Flink on Kubernetes cluster + * `./bin/flink run-application --target kubernetes-application`: Submission spinning up a Flink on Kubernetes cluster in Application Mode +* Mesos + * `./bin/flink run --target remote`: Submission to an already running Flink on Mesos cluster +* Standalone: + * `./bin/flink run --target local`: Local submission using a MiniCluster in Session Mode + * `./bin/flink run --target remote`: Submission to an already running Flink cluster + +The `--target` will overwrite the [execution.target]({{< ref "docs/deployment/config" >}}#execution-target) +specified in the `config/flink-config.yaml`. + +For more details on the commands and the available options, please refer to the Resource Provider-specific +pages of the documentation. + +### Submitting PyFlink Jobs + +Currently, users are able to submit a PyFlink job via the CLI. It does not require to specify the +JAR file path or the entry main class, which is different from the Java job submission. + +{{< hint info >}} +When submitting Python job via `flink run`, Flink will run the command "python". Please run the following command to confirm that the python executable in current environment points to a supported Python version of 3.6+. +{{< /hint >}} +```bash +$ python --version +# the version printed here must be 3.6+ +``` + +The following commands show different PyFlink job submission use-cases: + +- Run a PyFlink job: +```bash +$ ./bin/flink run --python examples/python/table/batch/word_count.py +``` + +- Run a PyFlink job with additional source and resource files. 
Files specified in `--pyFiles` will be +added to the `PYTHONPATH` and, therefore, available in the Python code. +```bash +$ ./bin/flink run \ + --python examples/python/table/batch/word_count.py \ + --pyFiles file:///user.txt,hdfs:///$namenode_address/username.txt +``` + +- Run a PyFlink job which will reference Java UDF or external connectors. JAR file specified in `--jarfile` will be uploaded +to the cluster. +```bash +$ ./bin/flink run \ + --python examples/python/table/batch/word_count.py \ + --jarfile +``` + +- Run a PyFlink job with pyFiles and the main entry module specified in `--pyModule`: +```bash +$ ./bin/flink run \ + --pyModule batch.word_count \ + --pyFiles examples/python/table/batch +``` + +- Submit a PyFlink job on a specific JobManager running on host `` (adapt the command accordingly): +```bash +$ ./bin/flink run \ + --jobmanager :8081 \ + --python examples/python/table/batch/word_count.py +``` + +- Run a PyFlink job using a [YARN cluster in Per-Job Mode]({{< ref "docs/deployment/resource-providers/yarn" >}}#per-job-cluster-mode): +```bash +$ ./bin/flink run \ + --target yarn-per-job + --python examples/python/table/batch/word_count.py +``` + +- Run a PyFlink application on a native Kubernetes cluster having the cluster ID ``, it requires a docker image with PyFlink installed, please refer to [Enabling PyFlink in docker]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#enabling-python): +```bash +$ ./bin/flink run-application \ + --target kubernetes-application \ + --parallelism 8 \ + -Dkubernetes.cluster-id= \ + -Dtaskmanager.memory.process.size=4096m \ + -Dkubernetes.taskmanager.cpu=2 \ + -Dtaskmanager.numberOfTaskSlots=4 \ + -Dkubernetes.container.image= \ + --pyModule word_count \ + --pyFiles /opt/flink/examples/python/table/batch/word_count.py +``` + +To learn more available options, please refer to [Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}) +or [YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) which are described in more detail in the +Resource Provider section. + +Besides `--pyFiles`, `--pyModule` and `--python` mentioned above, there are also some other Python +related options. Here's an overview of all the Python related options for the actions +`run` and `run-application` supported by Flink's CLI tool: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
- `-py,--python`: Python script with the program entry point. The dependent resources can be configured with the `--pyFiles` option.
- `-pym,--pyModule`: Python module with the program entry point. This option must be used in conjunction with `--pyFiles`.
- `-pyfs,--pyFiles`: Attach custom files for the job. The standard resource file suffixes such as `.py`/`.egg`/`.zip`/`.whl` or directories are all supported. These files will be added to the `PYTHONPATH` of both the local client and the remote Python UDF worker. Files suffixed with `.zip` will be extracted and added to the `PYTHONPATH`. A comma (`,`) can be used as the separator to specify multiple files (e.g., `--pyFiles file:///tmp/myresource.zip,hdfs:///$namenode_address/myresource2.zip`).
- `-pyarch,--pyArchives`: Add Python archive files for the job. The archive files will be extracted to the working directory of the Python UDF worker. Currently only zip format is supported. For each archive file, a target directory can be specified: if a target directory name is given, the archive file will be extracted to a directory with that name; otherwise it will be extracted to a directory with the same name as the archive file. The files uploaded via this option are accessible via relative paths. `#` can be used as the separator between the archive file path and the target directory name, and a comma (`,`) can be used as the separator to specify multiple archive files. This option can be used to upload the virtual environment and the data files used in Python UDFs (e.g., `--pyArchives file:///tmp/py37.zip,file:///tmp/data.zip#data --pyExecutable py37.zip/py37/bin/python`). The data files can then be accessed in a Python UDF, e.g. `f = open('data/data.txt', 'r')`.
- `-pyexec,--pyExecutable`: Specify the path of the Python interpreter used to execute the Python UDF worker (e.g., `--pyExecutable /usr/local/bin/python3`). The Python UDF worker depends on Python 3.6+, Apache Beam (version == 2.27.0), Pip (version >= 7.1.0) and SetupTools (version >= 37.0.0). Please ensure that the specified environment meets the above requirements.
- `-pyreq,--pyRequirements`: Specify a `requirements.txt` file which defines the third-party dependencies. These dependencies will be installed and added to the `PYTHONPATH` of the Python UDF worker. A directory containing the installation packages of these dependencies can optionally be specified; use `#` as the separator if the optional parameter exists (e.g., `--pyRequirements file:///tmp/requirements.txt#file:///tmp/cached_dir`).
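
As a combined illustration of the options above, the following sketch submits the word count example together with extra resources, a requirements file, an archived virtual environment and a custom interpreter. The paths are the placeholder values from the descriptions above, not files shipped with Flink, so adapt them to your setup:

```bash
$ ./bin/flink run \
    --python examples/python/table/batch/word_count.py \
    --pyFiles file:///tmp/myresource.zip \
    --pyRequirements file:///tmp/requirements.txt \
    --pyArchives file:///tmp/py37.zip \
    --pyExecutable py37.zip/py37/bin/python
```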
    + +In addition to the command line options during submitting the job, it also supports to specify the +dependencies via configuration or Python API inside the code. Please refer to the +[dependency management]({{< ref "docs/dev/python/dependency_management" >}}) for more details. + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/config.md b/docs/content.zh/docs/deployment/config.md new file mode 100644 index 0000000000000..cf1c24bede36c --- /dev/null +++ b/docs/content.zh/docs/deployment/config.md @@ -0,0 +1,474 @@ +--- +title: "配置参数" +weight: 3 +type: docs +bookToc: false +aliases: + - /zh/deployment/config.html + - /zh/ops/config.html +--- + + +# 配置参数 + +All configuration is done in `conf/flink-conf.yaml`, which is expected to be a flat collection of [YAML key value pairs](http://www.yaml.org/spec/1.2/spec.html) with format `key: value`. + +The configuration is parsed and evaluated when the Flink processes are started. Changes to the configuration file require restarting the relevant processes. + +The out of the box configuration will use your default Java installation. You can manually set the environment variable `JAVA_HOME` or the configuration key `env.java.home` in `conf/flink-conf.yaml` if you want to manually override the Java runtime to use. + +You can specify a different configuration directory location by defining the `FLINK_CONF_DIR` environment variable. For resource providers which provide non-session deployments, you can specify per-job configurations this way. Make a copy of the `conf` directory from the Flink distribution and modify the settings on a per-job basis. Note that this is not supported in Docker or standalone Kubernetes deployments. On Docker-based deployments, you can use the `FLINK_PROPERTIES` environment variable for passing configuration values. + +On session clusters, the provided configuration will only be used for configuring [execution](#execution) parameters, e.g. configuration parameters affecting the job, not the underlying cluster. + +# Basic Setup + +The default configuration supports starting a single-node Flink session cluster without any changes. +The options in this section are the ones most commonly needed for a basic distributed Flink setup. + +**Hostnames / Ports** + +These options are only necessary for *standalone* application- or session deployments ([simple standalone]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}) or [Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}})). + +If you use Flink with [Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}), [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}), or the [*active* Kubernetes integration]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}), the hostnames and ports are automatically discovered. + + - `rest.address`, `rest.port`: These are used by the client to connect to Flink. Set this to the hostname where the JobManager runs, or to the hostname of the (Kubernetes) service in front of the JobManager's REST interface. + + - The `jobmanager.rpc.address` (defaults to *"localhost"*) and `jobmanager.rpc.port` (defaults to *6123*) config entries are used by the TaskManager to connect to the JobManager/ResourceManager. Set this to the hostname where the JobManager runs, or to the hostname of the (Kubernetes internal) service for the JobManager. 
This option is ignored on [setups with high-availability]({{< ref "docs/deployment/ha/overview" >}}) where the leader election mechanism is used to discover this automatically. + +**Memory Sizes** + +The default memory sizes support simple streaming/batch applications, but are too low to yield good performance for more complex applications. + + - `jobmanager.memory.process.size`: Total size of the *JobManager* (JobMaster / ResourceManager / Dispatcher) process. + - `taskmanager.memory.process.size`: Total size of the TaskManager process. + +The total sizes include everything. Flink will subtract some memory for the JVM's own memory requirements (metaspace and others), and divide and configure the rest automatically between its components (JVM Heap, Off-Heap, for Task Managers also network, managed memory etc.). + +These value are configured as memory sizes, for example *1536m* or *2g*. + +**Parallelism** + + - `taskmanager.numberOfTaskSlots`: The number of slots that a TaskManager offers *(default: 1)*. Each slot can take one task or pipeline. + Having multiple slots in a TaskManager can help amortize certain constant overheads (of the JVM, application libraries, or network connections) across parallel tasks or pipelines. See the [Task Slots and Resources]({{< ref "docs/concepts/flink-architecture" >}}#task-slots-and-resources) concepts section for details. + + Running more smaller TaskManagers with one slot each is a good starting point and leads to the best isolation between tasks. Dedicating the same resources to fewer larger TaskManagers with more slots can help to increase resource utilization, at the cost of weaker isolation between the tasks (more tasks share the same JVM). + + - `parallelism.default`: The default parallelism used when no parallelism is specified anywhere *(default: 1)*. + +**Checkpointing** + +You can configure checkpointing directly in code within your Flink job or application. Putting these values here in the configuration defines them as defaults in case the application does not configure anything. + + - `state.backend`: The state backend to use. This defines the data structure mechanism for taking snapshots. Common values are `filesystem` or `rocksdb`. + - `state.checkpoints.dir`: The directory to write checkpoints to. This takes a path URI like *s3://mybucket/flink-app/checkpoints* or *hdfs://namenode:port/flink/checkpoints*. + - `state.savepoints.dir`: The default directory for savepoints. Takes a path URI, similar to `state.checkpoints.dir`. + +**Web UI** + + - `web.submit.enable`: Enables uploading and starting jobs through the Flink UI *(true by default)*. Please note that even when this is disabled, session clusters still accept jobs through REST requests (HTTP calls). This flag only guards the feature to upload jobs in the UI. + - `web.cancel.enable`: Enables canceling jobs through the Flink UI *(true by default)*. Please note that even when this is disabled, session clusters still cancel jobs through REST requests (HTTP calls). This flag only guards the feature to cancel jobs in the UI. + - `web.upload.dir`: The directory where to store uploaded jobs. Only used when `web.submit.enable` is true. + +**Other** + + - `io.tmp.dirs`: The directories where Flink puts local data, defaults to the system temp directory (`java.io.tmpdir` property). If a list of directories is configured, Flink will rotate files across the directories. 
+ + The data put in these directories include by default the files created by RocksDB, spilled intermediate results (batch algorithms), and cached jar files. + + This data is NOT relied upon for persistence/recovery, but if this data gets deleted, it typically causes a heavyweight recovery operation. It is hence recommended to set this to a directory that is not automatically periodically purged. + + Yarn, Mesos, and Kubernetes setups automatically configure this value to the local working directories by default. + +---- +---- + +# Common Setup Options + +*Common options to configure your Flink application or cluster.* + +### Hosts and Ports + +Options to configure hostnames and ports for the different Flink components. + +The JobManager hostname and port are only relevant for standalone setups without high-availability. +In that setup, the config values are used by the TaskManagers to find (and connect to) the JobManager. +In all highly-available setups, the TaskManagers discover the JobManager via the High-Availability-Service (for example ZooKeeper). + +Setups using resource orchestration frameworks (K8s, Yarn, Mesos) typically use the framework's service discovery facilities. + +You do not need to configure any TaskManager hosts and ports, unless the setup requires the use of specific port ranges or specific network interfaces to bind to. + +{{< generated/common_host_port_section >}} + +### Fault Tolerance + +These configuration options control Flink's restart behaviour in case of failures during the execution. +By configuring these options in your `flink-conf.yaml`, you define the cluster's default restart strategy. + +The default restart strategy will only take effect if no job specific restart strategy has been configured via the `ExecutionConfig`. + +{{< generated/restart_strategy_configuration >}} + +**Fixed Delay Restart Strategy** + +{{< generated/fixed_delay_restart_strategy_configuration >}} + +**Failure Rate Restart Strategy** + +{{< generated/failure_rate_restart_strategy_configuration >}} + +### Checkpoints and State Backends + +These options control the basic setup of state backends and checkpointing behavior. + +The options are only relevant for jobs/applications executing in a continuous streaming fashion. +Jobs/applications executing in a batch fashion do not use state backends and checkpoints, but different internal data structures that are optimized for batch processing. + +{{< generated/common_state_backends_section >}} + +### High Availability + +High-availability here refers to the ability of the JobManager process to recover from failures. + +The JobManager ensures consistency during recovery across TaskManagers. For the JobManager itself to recover consistently, an external service must store a minimal amount of recovery metadata (like "ID of last committed checkpoint"), as well as help to elect and lock which JobManager is the leader (to avoid split-brain situations). + +{{< generated/common_high_availability_section >}} + +**Options for high-availability setups with ZooKeeper** + +{{< generated/common_high_availability_zk_section >}} + +### Memory Configuration + +These configuration values control the way that TaskManagers and JobManagers use memory. + +Flink tries to shield users as much as possible from the complexity of configuring the JVM for data-intensive processing. 
+In most cases, users should only need to set the values `taskmanager.memory.process.size` or `taskmanager.memory.flink.size` (depending on how the setup), and possibly adjusting the ratio of JVM heap and Managed Memory via `taskmanager.memory.managed.fraction`. The other options below can be used for performance tuning and fixing memory related errors. + +For a detailed explanation of how these options interact, +see the documentation on [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}) and +[JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}} ) memory configurations. + +{{< generated/common_memory_section >}} + +### Miscellaneous Options + +{{< generated/common_miscellaneous_section >}} + +---- +---- + +# Security + +Options for configuring Flink's security and secure interaction with external systems. + +### SSL + +Flink's network connections can be secured via SSL. Please refer to the [SSL Setup Docs]({{< ref "docs/deployment/security/security-ssl" >}}) for detailed setup guide and background. + +{{< generated/security_ssl_section >}} + + +### Auth with External Systems + +**ZooKeeper Authentication / Authorization** + +These options are necessary when connecting to a secured ZooKeeper quorum. + +{{< generated/security_auth_zk_section >}} + +**Kerberos-based Authentication / Authorization** + +Please refer to the [Flink and Kerberos Docs]({{< ref "docs/deployment/security/security-kerberos" >}}) for a setup guide and a list of external system to which Flink can authenticate itself via Kerberos. + +{{< generated/security_auth_kerberos_section >}} + +---- +---- + +# Resource Orchestration Frameworks + +This section contains options related to integrating Flink with resource orchestration frameworks, like Kubernetes, Yarn, Mesos, etc. + +Note that is not always necessary to integrate Flink with the resource orchestration framework. +For example, you can easily deploy Flink applications on Kubernetes without Flink knowing that it runs on Kubernetes (and without specifying any of the Kubernetes config options here.) See [this setup guide]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}) for an example. + +The options in this section are necessary for setups where Flink itself actively requests and releases resources from the orchestrators. + +### YARN + +{{< generated/yarn_config_configuration >}} + +### Kubernetes + +{{< generated/kubernetes_config_configuration >}} + +### Mesos + +{{< hint warning >}} +Apache Mesos support was deprecated in Flink 1.13 and is subject to removal in the future (see +[FLINK-22352](https://issues.apache.org/jira/browse/FLINK-22352) for further details). +{{< /hint >}} + +{{< generated/mesos_configuration >}} + +**Mesos TaskManager** + +{{< generated/mesos_task_manager_configuration >}} + +---- +---- + +# State Backends + +Please refer to the [State Backend Documentation]({{< ref "docs/ops/state/state_backends" >}}) for background on State Backends. + +### RocksDB State Backend + +These are the options commonly needed to configure the RocksDB state backend. See the [Advanced RocksDB Backend Section](#advanced-rocksdb-state-backends-options) for options necessary for advanced low level configurations and trouble-shooting. + +{{< generated/state_backend_rocksdb_section >}} + +---- +---- + +# Metrics + +Please refer to the [metrics system documentation]({{< ref "docs/ops/metrics" >}}) for background on Flink's metrics infrastructure. 
+ +{{< generated/metric_configuration >}} + +### RocksDB Native Metrics + +Flink can report metrics from RocksDB's native code, for applications using the RocksDB state backend. +The metrics here are scoped to the operators and then further broken down by column family; values are reported as unsigned longs. + +{{< hint warning >}} +Enabling RocksDB's native metrics may cause degraded performance and should be set carefully. +{{< /hint >}} + +{{< generated/rocksdb_native_metric_configuration >}} + +---- +---- + +# History Server + +The history server keeps the information of completed jobs (graphs, runtimes, statistics). To enable it, you have to enable "job archiving" in the JobManager (`jobmanager.archive.fs.dir`). + +See the [History Server Docs]({{< ref "docs/deployment/advanced/historyserver" >}}) for details. + +{{< generated/history_server_configuration >}} + +---- +---- + +# Experimental + +*Options for experimental features in Flink.* + +### Queryable State + +*Queryable State* is an experimental features that gives lets you access Flink's internal state like a key/value store. +See the [Queryable State Docs]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}) for details. + +{{< generated/queryable_state_configuration >}} + +---- +---- + +# Client + +{{< generated/client_configuration >}} + +---- +---- + +# Execution + +{{< generated/deployment_configuration >}} +{{< generated/savepoint_config_configuration >}} +{{< generated/execution_configuration >}} + +### Pipeline + +{{< generated/pipeline_configuration >}} +{{< generated/stream_pipeline_configuration >}} + +### Checkpointing + +{{< generated/execution_checkpointing_configuration >}} + +---- +---- + +# Debugging & Expert Tuning + +
+ The options below are meant for expert users and for fixing/debugging problems. Most setups should not need to configure these options. +
    + +### Class Loading + +Flink dynamically loads the code for jobs submitted to a session cluster. In addition, Flink tries to hide many dependencies in the classpath from the application. This helps to reduce dependency conflicts between the application code and the dependencies in the classpath. + +Please refer to the [Debugging Classloading Docs]({{< ref "docs/ops/debugging/debugging_classloading" >}}) for details. + +{{< generated/expert_class_loading_section >}} + +### Advanced Options for the debugging + +{{< generated/expert_debugging_and_tuning_section >}} + +### Advanced State Backends Options + +{{< generated/expert_state_backends_section >}} + +### State Backends Latency Tracking Options + +{{< generated/state_backend_latency_tracking_section >}} + +### Advanced RocksDB State Backends Options + +Advanced options to tune RocksDB and RocksDB checkpoints. + +{{< generated/expert_rocksdb_section >}} + +**RocksDB Configurable Options** + +These options give fine-grained control over the behavior and resoures of ColumnFamilies. +With the introduction of `state.backend.rocksdb.memory.managed` and `state.backend.rocksdb.memory.fixed-per-slot` (Apache Flink 1.10), it should be only necessary to use the options here for advanced performance tuning. These options here can also be specified in the application program via `RocksDBStateBackend.setRocksDBOptions(RocksDBOptionsFactory)`. + +{{< generated/rocksdb_configurable_configuration >}} + +### Advanced Fault Tolerance Options + +*These parameters can help with problems related to failover and to components erroneously considering each other as failed.* + +{{< generated/expert_fault_tolerance_section >}} + +### Advanced Cluster Options + +{{< generated/expert_cluster_section >}} + +### Advanced Scheduling Options + +*These parameters can help with fine-tuning scheduling for specific situations.* + +{{< generated/expert_scheduling_section >}} + +### Advanced High-availability Options + +{{< generated/expert_high_availability_section >}} + +### Advanced High-availability ZooKeeper Options + +{{< generated/expert_high_availability_zk_section >}} + +### Advanced High-availability Kubernetes Options + +{{< generated/expert_high_availability_k8s_section >}} + +### Advanced SSL Security Options + +{{< generated/expert_security_ssl_section >}} + +### Advanced Options for the REST endpoint and Client + +{{< generated/expert_rest_section >}} + +### Advanced Options for Flink Web UI + +{{< generated/web_configuration >}} + +### Full JobManager Options + +**JobManager** + +{{< generated/all_jobmanager_section >}} + +**Blob Server** + +The Blob Server is a component in the JobManager. It is used for distribution of objects that are too large to be attached to a RPC message and that benefit from caching (like Jar files or large serialized code objects). + +{{< generated/blob_server_configuration >}} + +**ResourceManager** + +These configuration keys control basic Resource Manager behavior, independent of the used resource orchestration management framework (YARN, Mesos, etc.) + +{{< generated/resource_manager_configuration >}} + +### Full TaskManagerOptions + +{{< generated/all_taskmanager_section >}} + +**Data Transport Network Stack** + +These options are for the network stack that handles the streaming and batch data exchanges between TaskManagers. + +{{< generated/all_taskmanager_network_section >}} + +### RPC / Akka + +Flink uses Akka for RPC between components (JobManager/TaskManager/ResourceManager). 
+Flink does not use Akka for data transport. + +{{< generated/akka_configuration >}} + +---- +---- + +# JVM and Logging Options + +{{< generated/environment_configuration >}} + +# Forwarding Environment Variables + +You can configure environment variables to be set on the JobManager and TaskManager processes started on Yarn/Mesos. + + - `containerized.master.env.`: Prefix for passing custom environment variables to Flink's JobManager process. + For example for passing LD_LIBRARY_PATH as an env variable to the JobManager, set containerized.master.env.LD_LIBRARY_PATH: "/usr/lib/native" + in the flink-conf.yaml. + + - `containerized.taskmanager.env.`: Similar to the above, this configuration prefix allows setting custom environment variables for the workers (TaskManagers). + +---- +---- + +# Deprecated Options + +These options relate to parts of Flink that are not actively developed any more. +These options may be removed in a future release. + +**DataSet API Optimizer** + +{{< generated/optimizer_configuration >}} + +**DataSet API Runtime Algorithms** + +{{< generated/algorithm_configuration >}} + +**DataSet File Sinks** + +{{< generated/deprecated_file_sinks_section >}} + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/elastic_scaling.md b/docs/content.zh/docs/deployment/elastic_scaling.md new file mode 100644 index 0000000000000..e4046ce38d752 --- /dev/null +++ b/docs/content.zh/docs/deployment/elastic_scaling.md @@ -0,0 +1,156 @@ +--- +title: Elastic Scaling +weight: 5 +type: docs + +--- + + +# Elastic Scaling + +Apache Flink allows you to rescale your jobs. You can do this manually by stopping the job and restarting from the savepoint created during shutdown with a different parallelism. + +This page describes options where Flink automatically adjusts the parallelism instead. + +## Reactive Mode + +{{< hint info >}} +Reactive mode is an MVP ("minimum viable product") feature. The Flink community is actively looking for feedback by users through our mailing lists. Please check the limitations listed on this page. +{{< /hint >}} + +Reactive Mode configures a job so that it always uses all resources available in the cluster. Adding a TaskManager will scale up your job, removing resources will scale it down. Flink will manage the parallelism of the job, always setting it to the highest possible values. + +Reactive Mode restarts a job on a rescaling event, restoring it from the latest completed checkpoint. This means that there is no overhead of creating a savepoint (which is needed for manually rescaling a job). Also, the amount of data that is reprocessed after rescaling depends on the checkpointing interval, and the restore time depends on the state size. + +The Reactive Mode allows Flink users to implement a powerful autoscaling mechanism, by having an external service monitor certain metrics, such as consumer lag, aggregate CPU utilization, throughput or latency. As soon as these metrics are above or below a certain threshold, additional TaskManagers can be added or removed from the Flink cluster. This could be implemented through changing the [replica factor](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#replicas) of a Kubernetes deployment, or an [autoscaling group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/AutoScalingGroup.html) on AWS. This external service only needs to handle the resource allocation and deallocation. Flink will take care of keeping the job running with the resources available. 
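
Such an external service could be as simple as a periodically executed script. Below is a minimal, illustrative sketch, assuming a Reactive Mode standalone cluster on Kubernetes whose TaskManagers run as a Deployment named `flink-taskmanager`, a reachable JobManager REST endpoint, and a hypothetical `my_metric_source` command that returns the monitored metric; the names, endpoints and thresholds are assumptions, not part of Flink:

```bash
#!/usr/bin/env bash
# Illustrative autoscaling sketch -- not part of Flink. Assumptions:
#  * Reactive Mode standalone cluster on Kubernetes
#  * TaskManagers run as a Deployment named "flink-taskmanager"
#  * "my_metric_source" is a hypothetical command returning the monitored metric (e.g. consumer lag)
FLINK_REST="http://flink-jobmanager:8081"
DEPLOYMENT="flink-taskmanager"

# Current number of TaskManagers, read from Flink's REST API.
TM_COUNT=$(curl -s "$FLINK_REST/overview" | jq '.taskmanagers')

# Metric the external service monitors; thresholds below are purely illustrative.
LAG=$(my_metric_source)

if [ "$LAG" -gt 1000 ]; then
  kubectl scale deployment "$DEPLOYMENT" --replicas=$((TM_COUNT + 1))   # scale up
elif [ "$LAG" -lt 100 ] && [ "$TM_COUNT" -gt 1 ]; then
  kubectl scale deployment "$DEPLOYMENT" --replicas=$((TM_COUNT - 1))   # scale down
fi
```

Flink itself only sees TaskManagers joining or leaving the cluster and rescales the job accordingly.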
+ +### Getting started + +If you just want to try out Reactive Mode, follow these instructions. They assume that you are deploying Flink on a single machine. + +```bash + +# these instructions assume you are in the root directory of a Flink distribution. + +# Put Job into lib/ directory +cp ./examples/streaming/TopSpeedWindowing.jar lib/ +# Submit Job in Reactive Mode +./bin/standalone-job.sh start -Dscheduler-mode=reactive -Dexecution.checkpointing.interval="10s" -j org.apache.flink.streaming.examples.windowing.TopSpeedWindowing +# Start first TaskManager +./bin/taskmanager.sh start +``` + +Let's quickly examine the used submission command: +- `./bin/standalone-job.sh start` deploys Flink in [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode) +- `-Dscheduler-mode=reactive` enables Reactive Mode. +- `-Dexecution.checkpointing.interval="10s"` configure checkpointing and restart strategy. +- the last argument is passing the Job's main class name. + +You have now started a Flink job in Reactive Mode. The [web interface](http://localhost:8081) shows that the job is running on one TaskManager. If you want to scale up the job, simply add another TaskManager to the cluster: +```bash +# Start additional TaskManager +./bin/taskmanager.sh start +``` + +To scale down, remove a TaskManager instance. +```bash +# Remove a TaskManager +./bin/taskmanager.sh stop +``` + +### Usage + +#### Configuration + +To enable Reactive Mode, you need to configure `scheduler-mode` to `reactive`. + +The **parallelism of individual operators in a job will be determined by the scheduler**. It is not configurable +and will be ignored if explicitly set, either on individual operators or the entire job. + +The only way of influencing the parallelism is by setting a max parallelism for an operator +(which will be respected by the scheduler). The maxParallelism is bounded by 2^15 (32768). +If you do not set a max parallelism for individual operators or the entire job, the +[default parallelism rules]({{< ref "docs/dev/execution/parallel" >}}#setting-the-maximum-parallelism) will be applied, +potentially applying lower bounds than the max possible value. As with the default scheduling mode, please take +the [best practices for parallelism]({{< ref "docs/ops/production_ready" >}}#set-an-explicit-max-parallelism) into consideration. + +Note that such a high max parallelism might affect performance of the job, since more internal structures are needed to maintain [some internal structures](https://flink.apache.org/features/2017/07/04/flink-rescalable-state.html) of Flink. + +When enabling Reactive Mode, the [`jobmanager.adaptive-scheduler.resource-wait-timeout`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-resource-wait-timeout) configuration key will default to `-1`. This means that the JobManager will run forever waiting for sufficient resources. +If you want the JobManager to stop after a certain time without enough TaskManagers to run the job, configure `jobmanager.adaptive-scheduler.resource-wait-timeout`. + +With Reactive Mode enabled, the [`jobmanager.adaptive-scheduler.resource-stabilization-timeout`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-resource-stabilization-timeout) configuration key will default to `0`: Flink will start runnning the job, as soon as there are sufficient resources available. 
+In scenarios where TaskManagers are not connecting at the same time, but slowly one after another, this behavior leads to a job restart whenever a TaskManager connects. Increase this configuration value if you want to wait for the resources to stabilize before scheduling the job. +Additionally, one can configure [`jobmanager.adaptive-scheduler.min-parallelism-increase`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-min-parallelism-increase): This configuration option specifices the minumum amount of additional, aggregate parallelism increase before triggering a scale-up. For example if you have a job with a source (parallelism=2) and a sink (parallelism=2), the aggregate parallelism is 4. By default, the configuration key is set to 1, so any increase in the aggregate parallelism will trigger a restart. + +#### Recommendations + +- **Configure periodic checkpointing for stateful jobs**: Reactive mode restores from the latest completed checkpoint on a rescale event. If no periodic checkpointing is enabled, your program will lose its state. Checkpointing also configures a **restart strategy**. Reactive Mode will respect the configured restarting strategy: If no restarting strategy is configured, reactive mode will fail your job, instead of scaling it. + +- Downscaling in Reactive Mode might cause longer stalls in your processing because Flink waits for the heartbeat between JobManager and the stopped TaskManager(s) to time out. You will see that your Flink job is stuck for roughly 50 seconds before redeploying your job with a lower parallelism. + + The default timeout is configured to 50 seconds. Adjust the [`heartbeat.timeout`]({{< ref "docs/deployment/config">}}#heartbeat-timeout) configuration to a lower value, if your infrastructure permits this. Setting a low heartbeat timeout can lead to failures if a TaskManager fails to respond to a heartbeat, for example due to a network congestion or a long garbage collection pause. Note that the [`heartbeat.interval`]({{< ref "docs/deployment/config">}}#heartbeat-interval) always needs to be lower than the timeout. + + +### Limitations + +Since Reactive Mode is a new, experimental feature, not all features supported by the default scheduler are also available with Reactive Mode (and its adaptive scheduler). The Flink community is working on addressing these limitations. + +- **Deployment is only supported as a standalone application deployment**. Active resource providers (such as native Kubernetes, YARN or Mesos) are explicitly not supported. Standalone session clusters are not supported either. The application deployment is limited to single job applications. + + The only supported deployment options are [Standalone in Application Mode]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}#application-mode) ([described](#getting-started) on this page), [Docker in Application Mode]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#application-mode-on-docker) and [Standalone Kubernetes Application Cluster]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}#deploy-application-cluster). + +The [limitations of Adaptive Scheduler](#limitations-1) also apply to Reactive Mode. + + +## Adaptive Scheduler + +{{< hint warning >}} +Using Adaptive Scheduler directly (not through Reactive Mode) is only advised for advanced users because slot allocation on a session cluster with multiple jobs is not defined. 
+{{< /hint >}} + +The Adaptive Scheduler can adjust the parallelism of a job based on available slots. It will automatically reduce the parallelism if not enough slots are available to run the job with the originally configured parallelism; be it due to not enough resources being available at the time of submission, or TaskManager outages during the job execution. If new slots become available the job will be scaled up again, up to the configured parallelism. +In Reactive Mode (see above) the configured parallelism is ignored and treated as if it was set to infinity, letting the job always use as many resources as possible. +You can also use Adaptive Scheduler without Reactive Mode, but there are some practical limitations: +- If you are using Adaptive Scheduler on a session cluster, there are no guarantees regarding the distribution of slots between multiple running jobs in the same session. + +One benefit of the Adaptive Scheduler over the default scheduler is that it can handle TaskManager losses gracefully, since it would just scale down in these cases. + +### Usage + +The following configuration parameters need to be set: + +- `jobmanager.scheduler: adaptive`: Change from the default scheduler to adaptive scheduler +- `cluster.declarative-resource-management.enabled` Declarative resource management must be enabled (enabled by default). + +The behavior of Adaptive Scheduler is configured by [all configuration options containing `adaptive-scheduler`]({{< ref "docs/deployment/config">}}#advanced-scheduling-options) in their name. + +### Limitations + +- **Streaming jobs only**: The first version of Adaptive Scheduler runs with streaming jobs only. When submitting a batch job, we will automatically fall back to the default scheduler. +- **No support for [local recovery]({{< ref "docs/ops/state/large_state_tuning">}}#task-local-recovery)**: Local recovery is a feature that schedules tasks to machines so that the state on that machine gets re-used if possible. The lack of this feature means that Adaptive Scheduler will always need to download the entire state from the checkpoint storage. +- **No support for partial failover**: Partial failover means that the scheduler is able to restart parts ("regions" in Flink's internals) of a failed job, instead of the entire job. This limitation impacts only recovery time of embarrassingly parallel jobs: Flink's default scheduler can restart failed parts, while Adaptive Scheduler will restart the entire job. +- **Limited integration with Flink's Web UI**: Adaptive Scheduler allows that a job's parallelism can change over its lifetime. The web UI only shows the current parallelism the job. +- **Limited Job metrics**: With the exception of `numRestarts` all [availability]({{< ref "docs/ops/metrics" >}}#availability) and [checkpointing]({{< ref "docs/ops/metrics" >}}#checkpointing) metrics with the `Job` scope are not working correctly. +- **Unused slots**: If the max parallelism for slot sharing groups is not equal, slots offered to Adaptive Scheduler might be unused. +- Scaling events trigger job and task restarts, which will increase the number of Task attempts. 
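
For reference, here is a minimal sketch of the configuration described in the Usage section above, assuming a standalone session cluster started from the root of a Flink distribution (`cluster.declarative-resource-management.enabled` is left at its default of `true`):

```bash
# Minimal sketch: enable the adaptive scheduler for a standalone session cluster.
# Assumes you are in the root directory of a Flink distribution.
echo "jobmanager.scheduler: adaptive" >> conf/flink-conf.yaml
./bin/start-cluster.sh
```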
+ + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/filesystems/_index.md b/docs/content.zh/docs/deployment/filesystems/_index.md new file mode 100644 index 0000000000000..e77dee7bead36 --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/_index.md @@ -0,0 +1,23 @@ +--- +title: File Systems +bookCollapseSection: true +weight: 6 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/filesystems/azure.md b/docs/content.zh/docs/deployment/filesystems/azure.md new file mode 100644 index 0000000000000..728882126023c --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/azure.md @@ -0,0 +1,82 @@ +--- +title: Azure Blob 存储 +weight: 4 +type: docs +aliases: + - /zh/deployment/filesystems/azure.html + - /zh/ops/filesystems/azure +--- + + +# Azure Blob 存储 + +[Azure Blob 存储](https://docs.microsoft.com/en-us/azure/storage/) 是一项由 Microsoft 管理的服务,能提供多种应用场景下的云存储。 +Azure Blob 存储可与 Flink 一起使用以**读取**和**写入数据**,以及与[流 State Backend]({{< ref "docs/ops/state/state_backends" >}}) 结合使用。 + + + +通过以下格式指定路径,Azure Blob 存储对象可类似于普通文件使用: + +```plain +wasb://@$.blob.core.windows.net/ + +// SSL 加密访问 +wasbs://@$.blob.core.windows.net/ +``` + +参见以下代码了解如何在 Flink 作业中使用 Azure Blob 存储: + +```java +// 读取 Azure Blob 存储 +env.readTextFile("wasb://@$.blob.core.windows.net/"); + +// 写入 Azure Blob 存储 +stream.writeAsText("wasb://@$.blob.core.windows.net/") + +// 将 Azure Blob 存储用作 FsStatebackend +env.setStateBackend(new FsStateBackend("wasb://@$.blob.core.windows.net/")); +``` + +### Shaded Hadoop Azure Blob 存储文件系统 + +为使用 flink-azure-fs-hadoop,在启动 Flink 之前,将对应的 JAR 文件从 opt 目录复制到 Flink 发行版中的 plugin 目录下的一个文件夹中,例如: + +```bash +mkdir ./plugins/azure-fs-hadoop +cp ./opt/flink-azure-fs-hadoop-{{< version >}}.jar ./plugins/azure-fs-hadoop/ +``` + +`flink-azure-fs-hadoop` 为使用 *wasb://* 和 *wasbs://* (SSL 加密访问) 的 URI 注册了默认的文件系统包装器。 + +### 凭据配置 +Hadoop 的 Azure 文件系统支持通过 Hadoop 配置来配置凭据,如 [Hadoop Azure Blob Storage 文档](https://hadoop.apache.org/docs/current/hadoop-azure/index.html#Configuring_Credentials) 所述。 +为方便起见,Flink 将所有的 Flink 配置添加 `fs.azure` 键前缀后转发至文件系统的 Hadoop 配置中。因此,可通过以下方法在 `flink-conf.yaml` 中配置 Azure Blob 存储密钥: + +```yaml +fs.azure.account.key..blob.core.windows.net: +``` + +或者通过在 `flink-conf.yaml` 中设置以下配置键,将文件系统配置为从环境变量 `AZURE_STORAGE_KEY` 读取 Azure Blob 存储密钥: + +```yaml +fs.azure.account.keyprovider..blob.core.windows.net: org.apache.flink.fs.azurefs.EnvironmentVariableKeyProvider +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/filesystems/common.md b/docs/content.zh/docs/deployment/filesystems/common.md new file mode 100644 index 0000000000000..a9d73a6cc0f5a --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/common.md @@ -0,0 +1,66 @@ +--- +title: "通用配置" +weight: 1 +type: docs +aliases: + - /zh/deployment/filesystems/common.html + - /zh/ops/filesystems.html +--- + + +# 通用配置 + +Apache Flink 提供了一些对所有文件系统均适用的基本配置。 + + + +## 默认文件系统 + +如果文件路径未明确指定文件系统的 scheme(和 authority),将会使用默认的 scheme(和 authority): + +```yaml +fs.default-scheme: +``` + +例如默认的文件系统配置为 `fs.default-scheme: hdfs://localhost:9000/`,则文件路径 `/user/hugo/in.txt` 将被处理为 `hdfs://localhost:9000/user/hugo/in.txt`。 + +## 连接限制 + +如果文件系统不能处理大量并发读/写操作或连接,可以为文件系统同时打开的总连接数设置上限。 + +例如在一个大型 Flink 任务建立 checkpoint 时,具有少量 RPC handler 的小型 HDFS 集群可能会由于建立了过多的连接而过载。 + +要限制文件系统的连接数,可将下列配置添加至 Flink 配置中。设置限制的文件系统由其 scheme 指定: + +```yaml +fs..limit.total: (数量,0/-1 表示无限制) +fs..limit.input: (数量,0/-1 表示无限制) +fs..limit.output: (数量,0/-1 表示无限制) +fs..limit.timeout: (毫秒,0 表示无穷) +fs..limit.stream-timeout: (毫秒,0 表示无穷) +``` + 
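
例如,下面的示例(假设:需要限制 `hdfs` scheme 的并发连接;具体数值仅为示意,应根据集群情况调整)将相应配置追加到 `flink-conf.yaml` 中:

```bash
# 示例:限制 hdfs scheme 的连接数(数值仅为示意)
cat >> conf/flink-conf.yaml <<'EOF'
fs.hdfs.limit.total: 128
fs.hdfs.limit.timeout: 30000
fs.hdfs.limit.stream-timeout: 60000
EOF
```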
+输入和输出连接(流)的数量可以分别进行限制(`fs..limit.input` 和 `fs..limit.output`),也可以限制并发流的总数(`fs..limit.total`)。如果文件系统尝试打开更多的流,操作将被阻塞直至某些流关闭。如果打开流的时间超过 `fs..limit.timeout`,则流打开失败。 + +为避免不活动的流占满整个连接池(阻止新连接的建立),可以在配置中添加无活动超时时间,如果连接至少在 `fs..limit.stream-timeout` 时间内没有读/写操作,则连接会被强制关闭。 + +连接数是按每个 TaskManager/文件系统来进行限制的。因为文件系统的创建是按照 scheme 和 authority 进行的,所以不同的 authority 具有独立的连接池,例如 `hdfs://myhdfs:50010/` 和 `hdfs://anotherhdfs:4399/` 会有单独的连接池。 + +{{< top >}} \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/filesystems/oss.md b/docs/content.zh/docs/deployment/filesystems/oss.md new file mode 100644 index 0000000000000..07dfb38396b8a --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/oss.md @@ -0,0 +1,92 @@ +--- +title: 阿里云 OSS +weight: 3 +type: docs +aliases: + - /zh/deployment/filesystems/oss.html + - /zh/ops/filesystems/oss.html +--- + + +# 阿里云对象存储服务 (OSS) + +## OSS:对象存储服务 + +[阿里云对象存储服务](https://www.aliyun.com/product/oss) (Aliyun OSS) 使用广泛,尤其在中国云用户中十分流行,能提供多种应用场景下的云对象存储。OSS 可与 Flink 一起使用以读取与存储数据,以及与[流 State Backend]({{< ref "docs/ops/state/state_backends" >}}) 结合使用。 + + + +通过以下格式指定路径,OSS 对象可类似于普通文件使用: + +```plain +oss:/// +``` + +以下代码展示了如何在 Flink 作业中使用 OSS: + +```java +// 读取 OSS bucket +env.readTextFile("oss:///"); + +// 写入 OSS bucket +stream.writeAsText("oss:///") + +// 将 OSS 用作 FsStatebackend +env.setStateBackend(new FsStateBackend("oss:///")); +``` + +### Shaded Hadoop OSS 文件系统 + +为使用 `flink-oss-fs-hadoop`,在启动 Flink 之前,将对应的 JAR 文件从 `opt` 目录复制到 Flink 发行版中的 `plugin` 目录下的一个文件夹中,例如: + +```bash +mkdir ./plugins/oss-fs-hadoop +cp ./opt/flink-oss-fs-hadoop-{{< version >}}.jar ./plugins/oss-fs-hadoop/ +``` + +`flink-oss-fs-hadoop` 为使用 *oss://* scheme 的 URI 注册了默认的文件系统包装器。 + +#### 配置设置 + +在设置好 OSS 文件系统包装器之后,需要添加一些配置以保证 Flink 有权限访问 OSS buckets。 + +为了简单使用,可直接在 `flink-conf.yaml` 中使用与 Hadoop `core-site.xml` 相同的配置关键字。 + +可在 [Hadoop OSS 文档](http://hadoop.apache.org/docs/current/hadoop-aliyun/tools/hadoop-aliyun/index.html) 中查看配置关键字。 + +一些配置必须添加至 `flink-conf.yaml` (**在 Hadoop OSS 文档中定义的其它配置为用作性能调优的高级配置**): + +```yaml +fs.oss.endpoint: 连接的 Aliyun OSS endpoint +fs.oss.accessKeyId: Aliyun access key ID +fs.oss.accessKeySecret: Aliyun access key secret +``` + +备选的 `CredentialsProvider` 也可在 `flink-conf.yaml` 中配置,例如: +```yaml +# 从 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET 读取凭据 (Credentials) +fs.oss.credentials.provider: com.aliyun.oss.common.auth.EnvironmentVariableCredentialsProvider +``` + +其余的凭据提供者(credential providers)可在[这里](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth)中找到。 + + + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/filesystems/overview.md b/docs/content.zh/docs/deployment/filesystems/overview.md new file mode 100644 index 0000000000000..8c206fe1e013e --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/overview.md @@ -0,0 +1,112 @@ +--- +title: "文件系统" +weight: 1 +type: docs +aliases: + - /zh/deployment/filesystems/ + - /zh/ops/filesystems/common.html + - /zh/ops/filesystems/index.html +--- + + +# 文件系统 + +Apache Flink 使用文件系统来消费和持久化地存储数据,以处理应用结果以及容错与恢复。以下是一些最常用的文件系统:*本地存储*,*hadoop-compatible*,*Amazon S3*,*MapR FS*,*阿里云 OSS* 和 *Azure Blob Storage*。 + +文件使用的文件系统通过其 URI Scheme 指定。例如 `file:///home/user/text.txt` 表示一个在本地文件系统中的文件,`hdfs://namenode:50010/data/user/text.txt` 表示一个在指定 HDFS 集群中的文件。 + +文件系统在每个进程实例化一次,然后进行缓存/池化,从而避免每次创建流时的配置开销,并强制执行特定的约束,如连接/流的限制。 + + + +## 本地文件系统 + +Flink 原生支持本地机器上的文件系统,包括任何挂载到本地文件系统的 NFS 或 SAN 驱动器,默认即可使用,无需额外配置。本地文件可通过 *file://* URI Scheme 引用。 + +## 外部文件系统 + +Apache Flink 
支持下列文件系统: + - [**Amazon S3**]({{< ref "docs/deployment/filesystems/s3" >}}) 对象存储由 `flink-s3-fs-presto` 和 `flink-s3-fs-hadoop` 两种替代实现提供支持。这两种实现都是独立的,没有依赖项。 + + - **MapR FS** 文件系统适配器已在 Flink 的主发行版中通过 *maprfs://* URI Scheme 支持。MapR 库需要在 classpath 中指定(例如在 `lib` 目录中)。 + + - **[阿里云对象存储]({{< ref "docs/deployment/filesystems/oss" >}})**由 `flink-oss-fs-hadoop` 支持,并通过 *oss://* URI scheme 使用。该实现基于 [Hadoop Project](https://hadoop.apache.org/),但其是独立的,没有依赖项。 + + - **[Azure Blob Storage]({{< ref "docs/deployment/filesystems/azure" >}})** 由`flink-azure-fs-hadoop` 支持,并通过 *wasb(s)://* URI scheme 使用。该实现基于 [Hadoop Project](https://hadoop.apache.org/),但其是独立的,没有依赖项。 + +除 **MapR FS** 之外,上述文件系统可以并且需要作为[插件]({{< ref "docs/deployment/filesystems/plugins" >}})使用。 + +使用外部文件系统时,在启动 Flink 之前需将对应的 JAR 文件从 `opt` 目录复制到 Flink 发行版 `plugin` 目录下的某一文件夹中,例如: + +```bash +mkdir ./plugins/s3-fs-hadoop +cp ./opt/flink-s3-fs-hadoop-{{ site.version }}.jar ./plugins/s3-fs-hadoop/ +``` + +注意 文件系统的[插件]({{< ref "docs/deployment/filesystems/plugins" >}})机制在 Flink 版本 1.9 中引入,以支持每个插件专有 Java 类加载器,并避免类隐藏机制。您仍然可以通过旧机制使用文件系统,即将对应的 JAR 文件复制到 `lib` 目录中,或使用您自己的实现方式,但是从版本 1.10 开始,**S3 插件必须通过插件机制加载**,因为这些插件不再被隐藏(版本 1.10 之后类不再被重定位),旧机制不再可用。 + +尽可能通过基于[插件]({{< ref "docs/deployment/filesystems/plugins" >}})的加载机制使用支持的文件系统。未来的 Flink 版本将不再支持通过 `lib` 目录加载文件系统组件。 + +## 添加新的外部文件系统实现 + +文件系统由类 `org.apache.flink.core.fs.FileSystem` 表示,该类定义了访问与修改文件系统中文件与对象的方法。 + +要添加一个新的文件系统: + + - 添加文件系统实现,它应是 `org.apache.flink.core.fs.FileSystem` 的子类。 + - 添加 Factory 类,以实例化该文件系统并声明文件系统所注册的 scheme, 它应是 `org.apache.flink.core.fs.FileSystemFactory` 的子类。 + - 添加 Service Entry。创建文件 `META-INF/services/org.apache.flink.core.fs.FileSystemFactory`,文件中包含文件系统 Factory 类的类名。 + (更多细节请查看 [Java Service Loader docs](https://docs.oracle.com/javase/8/docs/api/java/util/ServiceLoader.html)) + +在插件检索时,文件系统 Factory 类会由一个专用的 Java 类加载器加载,从而避免与其他类或 Flink 组件冲突。在文件系统实例化和文件系统调用时,应使用该类加载器。 + +警告 实际上这表示您的实现应避免使用 `Thread.currentThread().getContextClassLoader()` 类加载器。 + +## Hadoop 文件系统 (HDFS) 及其其他实现 + +所有 Flink 无法找到直接支持的文件系统均将回退为 Hadoop。 +当 flink-runtime 和 Hadoop 类包含在 classpath 中时,所有的 Hadoop 文件系统将自动可用。 + + +因此,Flink 无缝支持所有实现 `org.apache.hadoop.fs.FileSystem` 接口的 Hadoop 文件系统和所有兼容 Hadoop 的文件系统 (Hadoop-compatible file system, HCFS): + - HDFS (已测试) + - [Google Cloud Storage Connector for Hadoop](https://cloud.google.com/hadoop/google-cloud-storage-connector)(已测试) + - [Alluxio](http://alluxio.org/)(已测试,参见下文的配置详细信息) + - [XtreemFS](http://www.xtreemfs.org/)(已测试) + - FTP via [Hftp](http://hadoop.apache.org/docs/r1.2.1/hftp.html)(未测试) + - HAR(未测试) + - ... + +Hadoop 配置须在 `core-site.xml` 文件中包含所需文件系统的实现。可查看 **[Alluxio 的示例](#alluxio)**。 + +除非有其他的需要,建议使用 Flink 内置的文件系统。在某些情况下,如通过配置 Hadoop `core-site.xml` 中的 `fs.defaultFS` 属性将文件系统作为 YARN 的资源存储时,可能需要直接使用 Hadoop 文件系统。 + +### Alluxio + +在 `core-site.xml` 文件中添加以下条目以支持 Alluxio: + +```xml + + fs.alluxio.impl + alluxio.hadoop.FileSystem + +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/filesystems/plugins.md b/docs/content.zh/docs/deployment/filesystems/plugins.md new file mode 100644 index 0000000000000..962a9f1e62ee6 --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/plugins.md @@ -0,0 +1,127 @@ +--- +title: "Plugins" +weight: 5 +type: docs +aliases: + - /zh/deployment/filesystems/plugins.html + - /zh/ops/plugins.html +--- + + +# Plugins + +Plugins facilitate a strict separation of code through restricted classloaders. Plugins cannot +access classes from other plugins or from Flink that have not been specifically whitelisted. 
This +strict isolation allows plugins to contain conflicting versions of the same library without the need +to relocate classes or to converge to common versions. Currently, file systems and metric reporters are pluggable +but in the future, connectors, formats, and even user code should also be pluggable. + + + +## Isolation and plugin structure + +Plugins reside in their own folders and can consist of several jars. The names of the plugin folders +are arbitrary. + +``` +flink-dist +├── conf +├── lib +... +└── plugins + ├── s3 + │ ├── aws-credential-provider.jar + │ └── flink-s3-fs-hadoop.jar + └── azure + └── flink-azure-fs-hadoop.jar +``` + +Each plugin is loaded through its own classloader and completely isolated from any other plugin. +Hence, the `flink-s3-fs-hadoop` and `flink-azure-fs-hadoop` can depend on different conflicting +library versions. There is no need to relocate any class during the creation of fat jars (shading). + +Plugins may access certain whitelisted packages from Flink's `lib/` folder. In particular, all +necessary service provider interfaces (SPI) are loaded through the system classloader, so that no +two versions of `org.apache.flink.core.fs.FileSystem` exist at any given time, even if users +accidentally bundle it in their fat jar. This singleton class requirement is strictly necessary so +that the Flink runtime has an entry point into the plugin. Service classes are discovered through +the `java.util.ServiceLoader`, so make sure to retain the service definitions in `META-INF/services` +during shading. + +Note *Currently, more Flink core classes are still +accessible from plugins as we flesh out the SPI system.* + +Furthermore, the most common logger frameworks are whitelisted, such that logging is uniformly +possible across Flink core, plugins, and user code. + +## File Systems + +All [file systems]({{< ref "docs/deployment/filesystems/overview" >}}) **except MapR** are pluggable. That means they can and should +be used as plugins. To use a pluggable file system, copy the corresponding JAR file from the `opt` +directory to a directory under `plugins` directory of your Flink distribution before starting Flink, +e.g. + +```bash +mkdir ./plugins/s3-fs-hadoop +cp ./opt/flink-s3-fs-hadoop-{{< version >}}.jar ./plugins/s3-fs-hadoop/ +``` + +{{< hint warning >}} +The s3 file systems (`flink-s3-fs-presto` and +`flink-s3-fs-hadoop`) can only be used as plugins as we already removed the relocations. Placing them in libs/ will result in system failures. +{{< /hint >}} + +{{< hint warning >}} +Because of the [strict isolation](#isolation-and-plugin-structure), file systems do not have access to credential providers in lib/ +anymore. Please add any needed providers to the respective plugin folder. +{{< /hint >}} + + + + + +## Metric Reporters + +All [metric reporters]({{< ref "docs/deployment/metric_reporters" >}}) that Flink provides can be used as plugins. +See the [metrics]({{< ref "docs/ops/metrics" >}}) documentation for more details. 
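+As a minimal sketch (the reporter name `prom` and the port are arbitrary choices, and this assumes the
+Prometheus reporter plugin is available under the `plugins/` directory of your distribution), enabling a
+reporter usually boils down to a few entries in `flink-conf.yaml`:
+
+```yaml
+metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
+metrics.reporter.prom.port: 9249
+```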
+ +{{< top >}} diff --git a/docs/content.zh/docs/deployment/filesystems/s3.md b/docs/content.zh/docs/deployment/filesystems/s3.md new file mode 100644 index 0000000000000..07c48bcbdbfa1 --- /dev/null +++ b/docs/content.zh/docs/deployment/filesystems/s3.md @@ -0,0 +1,142 @@ +--- +title: Amazon S3 +weight: 2 +type: docs +aliases: + - /zh/deployment/filesystems/s3.html + - /zh/ops/filesystems/s3.html +--- + + +# Amazon S3 + +[Amazon Simple Storage Service](http://aws.amazon.com/s3/) (Amazon S3) 提供用于多种场景的云对象存储。S3 可与 Flink 一起使用以读取、写入数据,并可与 [流的 **State backends**]({{< ref "docs/ops/state/state_backends" >}}) 相结合使用。 + + + +通过以下格式指定路径,S3 对象可类似于普通文件使用: + +```plain +s3:/// +``` + +Endpoint 可以是一个文件或目录,例如: + +```java +// 读取 S3 bucket +env.readTextFile("s3:///"); + +// 写入 S3 bucket +stream.writeAsText("s3:///"); + +// 使用 S3 作为 FsStatebackend +env.setStateBackend(new FsStateBackend("s3:///")); +``` + +注意这些例子并*不详尽*,S3 同样可以用在其他场景,包括 [JobManager 高可用配置]({{< ref "docs/deployment/ha/overview" >}}) 或 [RocksDBStateBackend]({{< ref "docs/ops/state/state_backends" >}}#the-rocksdbstatebackend),以及所有 Flink 需要使用文件系统 URI 的位置。 + +在大部分使用场景下,可使用 `flink-s3-fs-hadoop` 或 `flink-s3-fs-presto` 两个独立且易于设置的 S3 文件系统插件。然而在某些情况下,例如使用 S3 作为 YARN 的资源存储目录时,可能需要配置 Hadoop S3 文件系统。 + +### Hadoop/Presto S3 文件系统插件 + +{% panel **注意:** 如果您在使用 [Flink on EMR](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-flink.html),您无需手动对此进行配置。 %} + +Flink 提供两种文件系统用来与 S3 交互:`flink-s3-fs-presto` 和 `flink-s3-fs-hadoop`。两种实现都是独立的且没有依赖项,因此使用时无需将 Hadoop 添加至 classpath。 + + - `flink-s3-fs-presto`,通过 *s3://* 和 *s3p://* 两种 scheme 使用,基于 [Presto project](https://prestodb.io/)。 + 可以使用[和 Presto 文件系统相同的配置项](https://prestodb.io/docs/0.187/connector/hive.html#amazon-s3-configuration)进行配置,方式为将配置添加到 `flink-conf.yaml` 文件中。如果要在 S3 中使用 checkpoint,推荐使用 Presto S3 文件系统。 + + - `flink-s3-fs-hadoop`,通过 *s3://* 和 *s3a://* 两种 scheme 使用, 基于 [Hadoop Project](https://hadoop.apache.org/)。 + 本文件系统可以使用类似 [Hadoop S3A 的配置项](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#S3A)进行配置,方式为将配置添加到 `flink-conf.yaml` 文件中。 + + 例如,Hadoop 有 `fs.s3a.connection.maximum` 的配置选项。 如果你想在 Flink 程序中改变该配置的值,你需要将配置 `s3.connection.maximum: xyz` 添加到 `flink-conf.yaml` 文件中。Flink 会内部将其转换成配置 `fs.s3a.connection.maximum`。 而无需通过 Hadoop 的 XML 配置文件来传递参数。 + + 另外,它是唯一支持 [StreamingFileSink]({{< ref "docs/connectors/datastream/streamfile_sink" >}}) 和 [FileSink]({{< ref "docs/connectors/datastream/file_sink" >}}) 的 S3 文件系统。 + +`flink-s3-fs-hadoop` 和 `flink-s3-fs-presto` 都为 *s3://* scheme 注册了默认的文件系统包装器,`flink-s3-fs-hadoop` 另外注册了 *s3a://*,`flink-s3-fs-presto` 注册了 *s3p://*,因此二者可以同时使用。 +例如某作业使用了 [StreamingFileSink]({{< ref "docs/connectors/datastream/streamfile_sink" >}}),它仅支持 Hadoop,但建立 checkpoint 使用 Presto。在这种情况下,建议明确地使用 *s3a://* 作为 sink (Hadoop) 的 scheme,checkpoint (Presto) 使用 *s3p://*。这一点对于 [FileSink]({{< ref "docs/connectors/datastream/file_sink" >}}) 同样成立。 + +在启动 Flink 之前,将对应的 JAR 文件从 `opt` 复制到 Flink 发行版的 `plugins` 目录下,以使用 `flink-s3-fs-hadoop` 或 `flink-s3-fs-presto`。 + +```bash +mkdir ./plugins/s3-fs-presto +cp ./opt/flink-s3-fs-presto-{{< version >}}.jar ./plugins/s3-fs-presto/ +``` + +#### 配置访问凭据 + +在设置好 S3 文件系统包装器后,您需要确认 Flink 具有访问 S3 Bucket 的权限。 + +##### Identity and Access Management (IAM)(推荐使用) + +建议通过 [Identity and Access Management (IAM)](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) 来配置 AWS 凭据。可使用 IAM 功能为 Flink 实例安全地提供访问 S3 Bucket 所需的凭据。关于配置的细节超出了本文档的范围,请参考 AWS 用户手册中的 [IAM Roles](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html) 部分。 
+ +如果配置正确,则可在 AWS 中管理对 S3 的访问,而无需为 Flink 分发任何访问密钥(Access Key)。 + +##### 访问密钥(Access Key)(不推荐) + +可以通过**访问密钥对(access and secret key)**授予 S3 访问权限。请注意,根据 [Introduction of IAM roles](https://blogs.aws.amazon.com/security/post/Tx1XG3FX6VMU6O5/A-safer-way-to-distribute-AWS-credentials-to-EC2),不推荐使用该方法。 + + `s3.access-key` 和 `s3.secret-key` 均需要在 Flink 的 `flink-conf.yaml` 中进行配置: + +```yaml +s3.access-key: your-access-key +s3.secret-key: your-secret-key +``` + +## 配置非 S3 访问点 + +S3 文件系统还支持兼容 S3 的对象存储服务,如 [IBM's Cloud Object Storage](https://www.ibm.com/cloud/object-storage) 和 [Minio](https://min.io/)。可在 `flink-conf.yaml` 中配置使用的访问点: + +```yaml +s3.endpoint: your-endpoint-hostname +``` + +## 配置路径样式的访问 + +某些兼容 S3 的对象存储服务可能没有默认启用虚拟主机样式的寻址。这种情况下需要在 `flink-conf.yaml` 中添加配置以启用路径样式的访问: + +```yaml +s3.path.style.access: true +``` + +## S3 文件系统的熵注入 + +内置的 S3 文件系统 (`flink-s3-fs-presto` and `flink-s3-fs-hadoop`) 支持熵注入。熵注入是通过在关键字开头附近添加随机字符,以提高 AWS S3 bucket 可扩展性的技术。 + +如果熵注入被启用,路径中配置好的字串将会被随机字符所替换。例如路径 `s3://my-bucket/checkpoints/_entropy_/dashboard-job/` 将会被替换成类似于 `s3://my-bucket/checkpoints/gf36ikvg/dashboard-job/` 的路径。 +**这仅在使用熵注入选项创建文件时启用!** +否则将完全删除文件路径中的 entropy key。更多细节请参见 [FileSystem.create(Path, WriteOption)](https://ci.apache.org/projects/flink/flink-docs-release-1.6/api/java/org/apache/flink/core/fs/FileSystem.html#create-org.apache.flink.core.fs.Path-org.apache.flink.core.fs.FileSystem.WriteOptions-)。 + +{% panel **注意:** 目前 Flink 运行时仅对 checkpoint 数据文件使用熵注入选项。所有其他文件包括 chekcpoint 元数据与外部 URI 都不使用熵注入,以保证 checkpoint URI 的可预测性。 %} + +配置 *entropy key* 与 *entropy length* 参数以启用熵注入: + +``` +s3.entropy.key: _entropy_ +s3.entropy.length: 4 (default) + +``` + +`s3.entropy.key` 定义了路径中被随机字符替换掉的字符串。不包含 entropy key 路径将保持不变。 +如果文件系统操作没有经过 *"熵注入"* 写入,entropy key 字串将被直接移除。 +`s3.entropy.length` 定义了用于熵注入的随机字母/数字字符的数量。 + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/ha/_index.md b/docs/content.zh/docs/deployment/ha/_index.md new file mode 100644 index 0000000000000..3eb199e032282 --- /dev/null +++ b/docs/content.zh/docs/deployment/ha/_index.md @@ -0,0 +1,23 @@ +--- +title: High Availablity +bookCollapseSection: true +weight: 6 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/ha/kubernetes_ha.md b/docs/content.zh/docs/deployment/ha/kubernetes_ha.md new file mode 100644 index 0000000000000..8bfb0eeaa09c0 --- /dev/null +++ b/docs/content.zh/docs/deployment/ha/kubernetes_ha.md @@ -0,0 +1,89 @@ +--- +title: Kubernetes HA Services +weight: 3 +type: docs +aliases: + - /zh/deployment/ha/kubernetes_ha.html +--- + + +# Kubernetes HA Services + +Flink's Kubernetes HA services use [Kubernetes](https://kubernetes.io/) for high availability services. + +Kubernetes high availability services can only be used when deploying to Kubernetes. +Consequently, they can be configured when using [standalone Flink on Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}) or the [native Kubernetes integration]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}) + +## Prerequisites + +In order to use Flink's Kubernetes HA services you must fulfill the following prerequisites: + +- Kubernetes >= 1.9. +- Service account with permissions to create, edit, delete ConfigMaps. 
+ Take a look at how to configure a service account for [Flink's native Kubernetes integration]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}#rbac) and [standalone Flink on Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}#kubernetes-high-availability-services) for more information. + + +## Configuration + +In order to start an HA-cluster you have to configure the following configuration keys: + +- [high-availability]({{< ref "docs/deployment/config" >}}#high-availability-1) (required): +The `high-availability` option has to be set to `KubernetesHaServicesFactory`. + +```yaml +high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory +``` + +- [high-availability.storageDir]({{< ref "docs/deployment/config" >}}#high-availability-storagedir) (required): +JobManager metadata is persisted in the file system `high-availability.storageDir` and only a pointer to this state is stored in Kubernetes. + +```yaml +high-availability.storageDir: s3:///flink/recovery +``` + +The `storageDir` stores all metadata needed to recover a JobManager failure. + +- [kubernetes.cluster-id]({{< ref "docs/deployment/config" >}}#kubernetes-cluster-id) (required): +In order to identify the Flink cluster, you have to specify a `kubernetes.cluster-id`. + +```yaml +kubernetes.cluster-id: cluster1337 +``` + +### Example configuration + +Configure high availability mode in `conf/flink-conf.yaml`: + +```yaml +kubernetes.cluster-id: +high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory +high-availability.storageDir: hdfs:///flink/recovery +``` + +{{< top >}} + +## High availability data clean up + +To keep HA data while restarting the Flink cluster, simply delete the deployment (via `kubectl delete deployment `). +All the Flink cluster related resources will be deleted (e.g. JobManager Deployment, TaskManager pods, services, Flink conf ConfigMap). +HA related ConfigMaps will be retained because they do not set the owner reference. +When restarting the cluster, all previously running jobs will be recovered and restarted from the latest successful checkpoint. + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/ha/overview.md b/docs/content.zh/docs/deployment/ha/overview.md new file mode 100644 index 0000000000000..0c5034b4e9f7c --- /dev/null +++ b/docs/content.zh/docs/deployment/ha/overview.md @@ -0,0 +1,81 @@ +--- +title: 概览 +weight: 1 +type: docs +aliases: + - /zh/deployment/ha/ + - /zh/ops/jobmanager_high_availability.html + - /zh/deployment/jobmanager_high_availability.html +--- + + +# High Availability + +JobManager High Availability (HA) hardens a Flink cluster against JobManager failures. +This feature ensures that a Flink cluster will always continue executing your submitted jobs. + +## JobManager High Availability + +The JobManager coordinates every Flink deployment. +It is responsible for both *scheduling* and *resource management*. + +By default, there is a single JobManager instance per Flink cluster. +This creates a *single point of failure* (SPOF): if the JobManager crashes, no new programs can be submitted and running programs fail. + +With JobManager High Availability, you can recover from JobManager failures and thereby eliminate the *SPOF*. +You can configure high availability for every cluster deployment. +See the [list of available high availability services](#high-availability-services) for more information. 
+ +### How to make a cluster highly available + +The general idea of JobManager High Availability is that there is a *single leading JobManager* at any time and *multiple standby JobManagers* to take over leadership in case the leader fails. +This guarantees that there is *no single point of failure* and programs can make progress as soon as a standby JobManager has taken leadership. + +As an example, consider the following setup with three JobManager instances: + +{{< img src="/fig/jobmanager_ha_overview.png" class="center" >}} + +Flink's [high availability services](#high-availability-services) encapsulate the required services to make everything work: +* **Leader election**: Selecting a single leader out of a pool of `n` candidates +* **Service discovery**: Retrieving the address of the current leader +* **State persistence**: Persisting state which is required for the successor to resume the job execution (JobGraphs, user code jars, completed checkpoints) + +{{< top >}} + +## High Availability Services + +Flink ships with two high availability service implementations: + +* [ZooKeeper]({{< ref "docs/deployment/ha/zookeeper_ha" >}}): +ZooKeeper HA services can be used with every Flink cluster deployment. +They require a running ZooKeeper quorum. + +* [Kubernetes]({{< ref "docs/deployment/ha/kubernetes_ha" >}}): +Kubernetes HA services only work when running on Kubernetes. + +{{< top >}} + +## High Availability data lifecycle + +In order to recover submitted jobs, Flink persists metadata and the job artifacts. +The HA data will be kept until the respective job either succeeds, is cancelled or fails terminally. +Once this happens, all the HA data, including the metadata stored in the HA services, will be deleted. + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/ha/zookeeper_ha.md b/docs/content.zh/docs/deployment/ha/zookeeper_ha.md new file mode 100644 index 0000000000000..3ab017e58aa97 --- /dev/null +++ b/docs/content.zh/docs/deployment/ha/zookeeper_ha.md @@ -0,0 +1,136 @@ +--- +title: ZooKeeper HA Services +weight: 2 +type: docs +aliases: + - /zh/deployment/ha/zookeeper_ha.html +--- + + +# ZooKeeper HA Services + +Flink's ZooKeeper HA services use [ZooKeeper](http://zookeeper.apache.org) for high availability services. + +Flink leverages **[ZooKeeper](http://zookeeper.apache.org)** for *distributed coordination* between all running JobManager instances. +ZooKeeper is a separate service from Flink, which provides highly reliable distributed coordination via leader election and light-weight consistent state storage. +Check out [ZooKeeper's Getting Started Guide](http://zookeeper.apache.org/doc/current/zookeeperStarted.html) for more information about ZooKeeper. +Flink includes scripts to [bootstrap a simple ZooKeeper](#bootstrap-zookeeper) installation. + +## Configuration + +In order to start an HA-cluster you have to configure the following configuration keys: + +- [high-availability]({{< ref "docs/deployment/config" >}}#high-availability-1) (required): +The `high-availability` option has to be set to `zookeeper`. + +
    high-availability: zookeeper
    + +- [high-availability.storageDir]({{< ref "docs/deployment/config" >}}#high-availability-storagedir) (required): +JobManager metadata is persisted in the file system `high-availability.storageDir` and only a pointer to this state is stored in ZooKeeper. + +
    high-availability.storageDir: hdfs:///flink/recovery
+ + The `storageDir` stores all metadata needed to recover a JobManager failure. + +- [high-availability.zookeeper.quorum]({{< ref "docs/deployment/config" >}}#high-availability-zookeeper-quorum) (required): +A *ZooKeeper quorum* is a replicated group of ZooKeeper servers, which provide the distributed coordination service. + +
    high-availability.zookeeper.quorum: address1:2181[,...],addressX:2181
    + + Each `addressX:port` refers to a ZooKeeper server, which is reachable by Flink at the given address and port. + +- [high-availability.zookeeper.path.root]({{< ref "docs/deployment/config" >}}#high-availability-zookeeper-path-root) (recommended): +The *root ZooKeeper node*, under which all cluster nodes are placed. + +
    high-availability.zookeeper.path.root: /flink
    + +- [high-availability.cluster-id]({{< ref "docs/deployment/config" >}}#high-availability-cluster-id) (recommended): +The *cluster-id ZooKeeper node*, under which all required coordination data for a cluster is placed. + +
    high-availability.cluster-id: /default_ns # important: customize per cluster
    + + **Important**: + You should not set this value manually when running on YARN, native Kubernetes or on another cluster manager. + In those cases a cluster-id is being automatically generated. + If you are running multiple Flink HA clusters on bare metal, you have to manually configure separate cluster-ids for each cluster. + +### Example configuration + +Configure high availability mode and ZooKeeper quorum in `conf/flink-conf.yaml`: + +```bash +high-availability: zookeeper +high-availability.zookeeper.quorum: localhost:2181 +high-availability.zookeeper.path.root: /flink +high-availability.cluster-id: /cluster_one # important: customize per cluster +high-availability.storageDir: hdfs:///flink/recovery +``` + +{{< top >}} + +## Configuring for ZooKeeper Security + +If ZooKeeper is running in secure mode with Kerberos, you can override the following configurations in `flink-conf.yaml` as necessary: + +```bash +# default is "zookeeper". If the ZooKeeper quorum is configured +# with a different service name then it can be supplied here. + +zookeeper.sasl.service-name: zookeeper + +# default is "Client". The value needs to match one of the values +# configured in "security.kerberos.login.contexts". +zookeeper.sasl.login-context-name: Client +``` + +For more information on Flink configuration for Kerberos security, please refer to the [security section of the Flink configuration page]({{< ref "docs/deployment/config" >}}#security). +You can also find further details on [how Flink sets up Kerberos-based security internally]({{< ref "docs/deployment/security/security-kerberos" >}}). + +{{< top >}} + +## ZooKeeper Versions + +Flink ships with separate ZooKeeper clients for 3.4 and 3.5, with 3.4 being in the `lib` directory of the distribution +and thus used by default, whereas 3.5 is placed in the `opt` directory. + +The 3.5 client allows you to secure the ZooKeeper connection via SSL, but _may_ not work with 3.4- ZooKeeper installations. + +You can control which version is used by Flink by placing either jar in the `lib` directory. + +{{< top >}} + +## Bootstrap ZooKeeper + +If you don't have a running ZooKeeper installation, you can use the helper scripts, which ship with Flink. + +There is a ZooKeeper configuration template in `conf/zoo.cfg`. +You can configure the hosts to run ZooKeeper on with the `server.X` entries, where X is a unique ID of each server: + +```bash +server.X=addressX:peerPort:leaderPort +[...] +server.Y=addressY:peerPort:leaderPort +``` + +The script `bin/start-zookeeper-quorum.sh` will start a ZooKeeper server on each of the configured hosts. +The started processes start ZooKeeper servers via a Flink wrapper, which reads the configuration from `conf/zoo.cfg` and makes sure to set some required configuration values for convenience. +In production setups, it is recommended to manage your own ZooKeeper installation. 
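+As an illustration only (the host names are placeholders and `2888`/`3888` are simply the conventional
+ZooKeeper peer and leader-election ports), a three-node quorum could be described in `conf/zoo.cfg` like this:
+
+```bash
+server.1=zk-host-1:2888:3888
+server.2=zk-host-2:2888:3888
+server.3=zk-host-3:2888:3888
+```
+
+Running `bin/start-zookeeper-quorum.sh` afterwards starts one ZooKeeper server on each of the configured hosts.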
+ +{{< top >}} diff --git a/docs/content.zh/docs/deployment/memory/_index.md b/docs/content.zh/docs/deployment/memory/_index.md new file mode 100644 index 0000000000000..6bf26fc91fc57 --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/_index.md @@ -0,0 +1,23 @@ +--- +title: 内存配置 +bookCollapseSection: true +weight: 4 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/memory/mem_migration.md b/docs/content.zh/docs/deployment/memory/mem_migration.md new file mode 100644 index 0000000000000..c4b76deaa1597 --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_migration.md @@ -0,0 +1,310 @@ +--- +title: "升级指南" +weight: 7 +type: docs +aliases: + - /zh/deployment/memory/mem_migration.html + - /zh/ops/memory/mem_migration.html +--- + + +# 升级指南 + +在 *1.10* 和 *1.11* 版本中,Flink 分别对 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}) 的内存配置方法做出了较大的改变。 +部分配置参数被移除了,或是语义上发生了变化。 +本篇升级指南将介绍如何将 [*Flink 1.9 及以前版本*](https://ci.apache.org/projects/flink/flink-docs-release-1.9/ops/mem_setup.html)的 TaskManager 内存配置升级到 *Flink 1.10 及以后版本*, +以及如何将 *Flink 1.10 及以前版本*的 JobManager 内存配置升级到 *Flink 1.11 及以后版本*。 + +* toc + + +
+ 注意: 请仔细阅读本篇升级指南。 + 使用原本的和新的内存配置方法可能会使内存组成部分具有截然不同的大小。 + 未经调整直接沿用 Flink 1.10 以前版本的 TaskManager 配置文件或 Flink 1.11 以前版本的 JobManager 配置文件,可能导致应用的行为、性能发生变化,甚至造成应用执行失败。 +
    + +提示 +在 *1.10/1.11* 版本之前,Flink 不要求用户一定要配置 TaskManager/JobManager 内存相关的参数,因为这些参数都具有默认值。 +[新的内存配置]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)要求用户至少指定下列配置参数(或参数组合)的其中之一,否则 Flink 将无法启动。 + +|   **TaskManager:**   |   **JobManager:**   | +| :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :-------------------------------------------------------------------------------- | +| [`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size) | [`jobmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-flink-size) | +| [`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) | [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size) | +| [`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size) 和
    [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size) | [`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size) | + +
    + +Flink 自带的[默认 flink-conf.yaml](#default-configuration-in-flink-confyaml) 文件指定了 [`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size)(*>= 1.10*)和 [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size) (*>= 1.11*),以便与此前的行为保持一致。 + +可以使用这张[电子表格](https://docs.google.com/spreadsheets/d/1mJaMkMPfDJJ-w6nMXALYmTc4XxiV30P5U7DzgwLkSoE)来估算和比较原本的和新的内存配置下的计算结果。 + + + +## 升级 TaskManager 内存配置 + + + +### 配置参数变化 + +本节简要列出了 *Flink 1.10* 引入的配置参数变化,并援引其他章节中关于如何升级到新配置参数的相关描述。 + +下列配置参数已被彻底移除,配置它们将不会产生任何效果。 + + + + + + + + + + + + + + + + + + + + + + +
+| 移除的配置参数 | 备注 |
+| :--- | :--- |
+| `taskmanager.memory.fraction` | 请参考新配置参数 [`taskmanager.memory.managed.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-fraction) 的相关描述。新的配置参数与被移除的配置参数在语义上有所差别,因此其配置值通常也需要做出适当调整。请参考[如何升级托管内存](#managed-memory)。 |
+| `taskmanager.memory.off-heap` | Flink 不再支持堆上的(On-Heap)托管内存。请参考[如何升级托管内存](#managed-memory)。 |
+| `taskmanager.memory.preallocate` | Flink 不再支持内存预分配,今后托管内存将都是惰性分配的。请参考[如何升级托管内存](#managed-memory)。 |
+
+下列配置参数将被弃用,出于向后兼容性考虑,配置它们将被解读成对应的新配置参数。
+
+| 弃用的配置参数 | 对应的新配置参数 |
+| :--- | :--- |
+| `taskmanager.heap.size` | 独立部署模式(Standalone Deployment)下:[`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size);容器化部署模式(Containerized Deployment)下:[`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size)。请参考如何升级总内存。 |
+| `taskmanager.memory.size` | [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size)。请参考[如何升级托管内存](#managed-memory)。 |
+| `taskmanager.network.memory.min` | [`taskmanager.memory.network.min`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-min) |
+| `taskmanager.network.memory.max` | [`taskmanager.memory.network.max`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-max) |
+| `taskmanager.network.memory.fraction` | [`taskmanager.memory.network.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-fraction) |
    + +尽管网络内存的配置参数没有发生太多变化,我们仍建议您检查其配置结果。 +网络内存的大小可能会受到其他内存部分大小变化的影响,例如总内存变化时,根据占比计算出的网络内存也可能发生变化。 +请参考[内存模型详解]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#detailed-memory-model)。 + +容器切除(Cut-Off)内存相关的配置参数(`containerized.heap-cutoff-ratio` 和 `containerized.heap-cutoff-min`)将不再对 TaskManager 进程生效。 +请参考[如何升级容器切除内存](#container-cut-off-memory)。 + +
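+下面用一个简化示例(数值仅作示意)说明上述对应关系:假设 Flink 1.9 的配置文件中设置了 `taskmanager.heap.size: 1024m`,升级后可以按部署模式改用对应的新参数(注意:如本指南所述,新旧配置方法推导出的各内存部分大小仍可能不同):
+
+```yaml
+# Flink 1.10 之前(已弃用):
+# taskmanager.heap.size: 1024m
+
+# 独立部署模式下改用 Flink 总内存:
+taskmanager.memory.flink.size: 1024m
+
+# 容器化部署模式(Yarn、Mesos)下改用进程总内存:
+# taskmanager.memory.process.size: 1024m
+```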
    + +### 总内存(原堆内存) + +在原本的内存配置方法中,用于指定用于 Flink 的总内存的配置参数是 `taskmanager.heap.size` 或 `taskmanager.heap.mb`。 +尽管这两个参数以“堆(Heap)”命名,实际上它们指定的内存既包含了 JVM 堆内存,也包含了其他堆外内存部分。 +这两个配置参数目前已被弃用。 + +Flink 在 Mesos 上还有另一个具有同样语义的配置参数 `mesos.resourcemanager.tasks.mem`,目前也已经被弃用。 + +如果配置了上述弃用的参数,同时又没有配置与之对应的新配置参数,那它们将按如下规则对应到新的配置参数。 +* 独立部署模式(Standalone Deployment)下:Flink 总内存([`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size)) +* 容器化部署模式(Containerized Deployement)下(Yarn、Mesos):进程总内存([`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size)) + +建议您尽早使用新的配置参数取代启用的配置参数,它们在今后的版本中可能会被彻底移除。 + +请参考[如何配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory). + + + +### JVM 堆内存 + +此前,JVM 堆空间由托管内存(仅在配置为堆上时)及 Flink 用到的所有其他堆内存组成。 +这里的其他堆内存是由总内存减去所有其他非堆内存得到的。 +请参考[如何升级托管内存](#managed-memory)。 + +现在,如果仅配置了*Flink总内存*或*进程总内存*,JVM 的堆空间依然是根据总内存减去所有其他非堆内存得到的。 +请参考[如何配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)。 + +此外,你现在可以更直接地控制用于任务和算子的 JVM 的堆内存([`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size)),详见[任务堆内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#task-operator-heap-memory)。 +如果流处理作业选择使用 Heap State Backend([MemoryStateBackend]({{< ref "docs/ops/state/state_backends" >}}#memorystatebackend) +或 [FsStateBackend]({{< ref "docs/ops/state/state_backends" >}}#fsstatebackend)),那么它同样需要使用 JVM 堆内存。 + +Flink 现在总是会预留一部分 JVM 堆内存供框架使用([`taskmanager.memory.framework.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-framework-heap-size))。 +请参考[框架内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#framework-memory)。 + + + +### 托管内存 + +请参考[如何配置托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 + + + +#### 明确的大小 + +原本用于指定明确的托管内存大小的配置参数(`taskmanager.memory.size`)已被弃用,与它具有相同语义的新配置参数为 [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size)。 +建议使用新的配置参数,原本的配置参数在今后的版本中可能会被彻底移除。 + + + +#### 占比 + +此前,如果不指定明确的大小,也可以将托管内存配置为占用总内存减去网络内存和容器切除内存(仅在 [Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}) 和 +[Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}) 上)之后剩余部分的固定比例(`taskmanager.memory.fraction`)。 +该配置参数已经被彻底移除,配置它不会产生任何效果。 +请使用新的配置参数 [`taskmanager.memory.managed.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-fraction)。 +在未通过 [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size) 指定明确大小的情况下,新的配置参数将指定[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)在 [Flink 总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)中的所占比例。 + + + +#### RocksDB State Backend + +流处理作业如果选择使用 [RocksDBStateBackend]({{< ref "docs/ops/state/state_backends" >}}#rocksdbstatebackend),它使用的本地内存现在也被归为[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 +默认情况下,RocksDB 将限制其内存用量不超过[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)大小,以避免在 [Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}) 或 [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}) 上容器被杀。你也可以通过设置 [state.backend.rocksdb.memory.managed]({{< ref "docs/deployment/config" >}}#state-backend-rocksdb-memory-managed) 来关闭 RocksDB 的内存控制。 +请参考[如何升级容器切除内存](#container-cut-off-memory)。 + + + +#### 其他变化 + +此外,Flink 1.10 对托管内存还引入了下列变化: +* [托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)现在总是在堆外。配置参数 
`taskmanager.memory.off-heap` 已被彻底移除,配置它不会产生任何效果。 +* [托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)现在使用本地内存而非直接内存。这意味着托管内存将不在 JVM 直接内存限制的范围内。 +* [托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)现在总是惰性分配的。配置参数 `taskmanager.memory.preallocate` 已被彻底移除,配置它不会产生任何效果。 + + + +## 升级 JobManager 内存配置 + +在原本的内存配置方法中,用于指定 *JVM 堆内存* 的配置参数是: +* `jobmanager.heap.size` +* `jobmanager.heap.mb` + +尽管这两个参数以“堆(Heap)”命名,在此之前它们实际上只有在[独立部署模式]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})才完全对应于 *JVM 堆内存*。 +在容器化部署模式下([Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}) 和 [Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}})),它们指定的内存还包含了其他堆外内存部分。 +*JVM 堆空间*的实际大小,是参数指定的大小减去容器切除(Cut-Off)内存后剩余的部分。 +容器切除内存在 *1.11* 及以上版本中已被彻底移除。 + +上述两个参数此前对 [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}) 部署模式并不生效。 +Flink 在 Mesos 上启动 JobManager 进程时并未设置任何 JVM 内存参数。 +从 *1.11* 版本开始,Flink 将采用与[独立部署模式]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})相同的方式设置这些参数。 + +这两个配置参数目前已被弃用。 +如果配置了上述弃用的参数,同时又没有配置与之对应的新配置参数,那它们将按如下规则对应到新的配置参数。 +* 独立部署模式(Standalone Deployment)、Mesos 部署模式下:JVM 堆内存([`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size)) +* 容器化部署模式(Containerized Deployement)下(Kubernetes、Yarn):进程总内存([`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size)) + +建议您尽早使用新的配置参数取代启用的配置参数,它们在今后的版本中可能会被彻底移除。 + +如果仅配置了 *Flink 总内存*或*进程总内存*,那么 [JVM 堆内存]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#configure-jvm-heap)将是总内存减去其他内存部分后剩余的部分。 +请参考[如何配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)。 +此外,也可以通过配置 [`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size) 的方式直接指定 [JVM 堆内存]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#configure-jvm-heap)。 + + + +## Flink JVM 进程内存限制 + +从 *1.10* 版本开始,Flink 通过设置相应的 JVM 参数,对 TaskManager 进程使用的 *JVM Metaspace* 和 *JVM 直接内存*进行限制。 +从 *1.11* 版本开始,Flink 同样对 JobManager 进程使用的 *JVM Metaspace* 进行限制。 +此外,还可以通过设置 [`jobmanager.memory.enable-jvm-direct-memory-limit`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-enable-jvm-direct-memory-limit) 对 JobManager 进程的 *JVM 直接内存*进行限制。 +请参考 [JVM 参数]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)。 + +Flink 通过设置上述 JVM 内存限制降低内存泄漏问题的排查难度,以避免出现[容器内存溢出]({{< ref "docs/deployment/memory/mem_trouble" >}}#container-memory-exceeded)等问题。 +请参考常见问题中关于 [JVM Metaspace]({{< ref "docs/deployment/memory/mem_trouble" >}}#outofmemoryerror-metaspace) 和 [JVM 直接内存]({{< ref "docs/deployment/memory/mem_trouble" >}}#outofmemoryerror-direct-buffer-memory) *OutOfMemoryError* 异常的描述。 + + + +## 容器切除(Cut-Off)内存 + +在容器化部署模式(Containerized Deployment)下,此前你可以指定切除内存。 +这部分内存将预留给所有未被 Flink 计算在内的内存开销。 +其主要来源是不受 Flink 直接管理的依赖使用的内存,例如 RocksDB、JVM 内部开销等。 +相应的配置参数(`containerized.heap-cutoff-ratio` 和 `containerized.heap-cutoff-min`)不再生效。 +新的内存配置方法引入了新的内存组成部分来具体描述这些内存用量。 + + + +### TaskManager + +流处理作业如果使用了 [RocksDBStateBackend]({{< ref "docs/ops/state/state_backends" >}}#the-rocksdbstatebackend),RocksDB 使用的本地内存现在将被归为[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 +默认情况下,RocksDB 将限制其内存用量不超过[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)大小。 +请同时参考[如何升级托管内存](#managed-memory)以及[如何配置托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 + +其他堆外(直接或本地)内存开销,现在可以通过下列配置参数进行设置: +* 
任务堆外内存([`taskmanager.memory.task.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-off-heap-size)) +* 框架堆外内存([`taskmanager.memory.framework.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-framework-off-heap-size)) +* JVM Metaspace([`taskmanager.memory.jvm-metaspace.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-metaspace-size)) +* [JVM 开销]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#detailed-memory-model) + + + +### JobManager + +可以通过下列配置参数设置堆外(直接或本地)内存开销: +* 堆外内存 ([`jobmanager.memory.off-heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-off-heap-size)) +* JVM Metaspace ([`jobmanager.memory.jvm-metaspace.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-metaspace-size)) +* [JVM 开销]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#detailed-configuration) + + + +## flink-conf.yaml 中的默认配置 + +本节描述 Flink 自带的默认 `flink-conf.yaml` 文件中的变化。 + +原本的 TaskManager 总内存(`taskmanager.heap.size`)被新的配置项 [`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) 所取代。 +默认值从 1024Mb 增加到了 1728Mb。 + +原本的 JobManager 总内存(`jobmanager.heap.size`)被新的配置项 [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) 所取代。 +默认值从 1024Mb 增加到了 1600Mb。 + +请参考[如何配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)。 + +
    + 注意: 使用新的默认 `flink-conf.yaml` 可能会造成各内存部分的大小发生变化,从而产生性能变化。 +
    diff --git a/docs/content.zh/docs/deployment/memory/mem_setup.md b/docs/content.zh/docs/deployment/memory/mem_setup.md new file mode 100644 index 0000000000000..ca8b97036c1a4 --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_setup.md @@ -0,0 +1,157 @@ +--- +title: "配置 Flink 进程的内存" +weight: 2 +type: docs +aliases: + - /zh/deployment/memory/mem_setup.html + - /zh/memory/mem_setup.html + - /zh/ops/memory +--- + + +# 配置 Flink 进程的内存 + +Apache Flink 基于 JVM 的高效处理能力,依赖于其对各组件内存用量的细致掌控。 +考虑到用户在 Flink 上运行的应用的多样性,尽管社区已经努力为所有配置项提供合理的默认值,仍无法满足所有情况下的需求。 +为了给用户生产提供最大化的价值, Flink 允许用户在整体上以及细粒度上对集群的内存分配进行调整。 + +本文接下来介绍的内存配置方法适用于 *1.10* 及以上版本的 TaskManager 进程和 *1.11* 及以上版本的 JobManager 进程。 +Flink 在 *1.10* 和 *1.11* 版本中对内存配置部分进行了较大幅度的改动,从早期版本升级的用户请参考[升级指南]({{< ref "docs/deployment/memory/mem_migration" >}})。 + +
    + +## 配置总内存 + +Flink JVM 进程的*进程总内存(Total Process Memory)*包含了由 Flink 应用使用的内存(*Flink 总内存*)以及由运行 Flink 的 JVM 使用的内存。 +*Flink 总内存(Total Flink Memory)*包括 *JVM 堆内存(Heap Memory)*和*堆外内存(Off-Heap Memory)*。 +其中堆外内存包括*直接内存(Direct Memory)*和*本地内存(Native Memory)*。 + +{{< img src="/fig/process_mem_model.svg" width="300px" alt="Flink's process memory model" usemap="#process-mem-model" >}} + +
    + +配置 Flink 进程内存最简单的方法是指定以下两个配置项中的任意一个: + +|   **配置项**   |   **TaskManager 配置参数**   |   **JobManager 配置参数**   | +| :------------------------------------ | :---------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------- | +| Flink 总内存 | [`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size) | [`jobmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-flink-size) | +| 进程总内存 | [`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) | [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size) | + +
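+例如,下面是一个只配置进程总内存的最简 `flink-conf.yaml` 片段(数值仅作示意,这里恰好取了 Flink 自带默认配置文件中的取值):
+
+```yaml
+taskmanager.memory.process.size: 1728m
+jobmanager.memory.process.size: 1600m
+```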
    + +提示 +关于本地执行,请分别参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#local-execution) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#local-execution) 的相关文档。 + +Flink 会根据默认值或其他配置参数自动调整剩余内存部分的大小。 +关于各内存部分的更多细节,请分别参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}) 的相关文档。 + +对于[独立部署模式(Standalone Deployment)]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}),如果你希望指定由 Flink 应用本身使用的内存大小,最好选择配置 *Flink 总内存*。 +*Flink 总内存*会进一步划分为 *JVM 堆内存*和*堆外内存*。 +更多详情请参考[如何为独立部署模式配置内存]({{< ref "docs/deployment/memory/mem_tuning" >}}#configure-memory-for-standalone-deployment)。 + +通过配置*进程总内存*可以指定由 Flink *JVM 进程*使用的总内存大小。 +对于容器化部署模式(Containerized Deployment),这相当于申请的容器(Container)大小,详情请参考[如何配置容器内存]({{< ref "docs/deployment/memory/mem_tuning" >}}#configure-memory-for-containers)([Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}})、[Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}) 或 [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}))。 + +此外,还可以通过设置 *Flink 总内存*的特定内部组成部分的方式来进行内存配置。 +不同进程需要设置的内存组成部分是不一样的。 +详情请分别参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#configure-heap-and-managed-memory) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#configure-jvm-heap) 的相关文档。 + +提示 +以上三种方式中,用户需要至少选择其中一种进行配置(本地运行除外),否则 Flink 将无法启动。 +这意味着,用户需要从以下无默认值的配置参数(或参数组合)中选择一个给出明确的配置: + +|   **TaskManager:**   |   **JobManager:**   | +| :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :-------------------------------------------------------------------------------- | +| [`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size) | [`jobmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-flink-size) | +| [`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) | [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size) | +| [`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size) 和
    [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size) | [`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size) | + +
    + +提示 +不建议同时设置*进程总内存*和 *Flink 总内存*。 +这可能会造成内存配置冲突,从而导致部署失败。 +额外配置其他内存部分时,同样需要注意可能产生的配置冲突。 + +
    + +## JVM 参数 + +Flink 进程启动时,会根据配置的和自动推导出的各内存部分大小,显式地设置以下 JVM 参数: + +|   **JVM 参数**   |   **TaskManager 取值**   |   **JobManager 取值**   | +| :---------------------------------------- | :------------------------------------------------- | :------------------------------------------------ | +| *-Xmx* 和 *-Xms* | 框架堆内存 + 任务堆内存 | JVM 堆内存 (\*) | +| *-XX:MaxDirectMemorySize*
    (TaskManager 始终设置,JobManager 见注释) | 框架堆外内存 + 任务堆外内存(\*\*) + 网络内存 | 堆外内存 (\*\*) (\*\*\*) | +| *-XX:MaxMetaspaceSize* | JVM Metaspace | JVM Metaspace | +(\*) 请记住,根据所使用的 GC 算法,你可能无法使用到全部堆内存。一些 GC 算法会为它们自身分配一定量的堆内存。这会导致[堆的指标]({{< ref "docs/ops/metrics" >}}#memory)返回一个不同的最大值。 +
    +(\*\*) 请注意,堆外内存也包括了用户代码使用的本地内存(非直接内存)。 +
    +(\*\*\*) 只有在 [`jobmanager.memory.enable-jvm-direct-memory-limit`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-enable-jvm-direct-memory-limit) 设置为 `true` 时,JobManager 才会设置 *JVM 直接内存限制*。 +

    + +相关内存部分的配置方法,请同时参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#detailed-memory-model) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#detailed-configuration) 的详细内存模型。 + +
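+作为一个粗略的推算示例(以下数值按默认配置近似计算,仅用于说明各参数的构成,实际取值请以 Flink 启动日志中打印的 JVM 参数为准):在默认的 `taskmanager.memory.process.size: 1728m` 下,TaskManager 大致会得到:
+
+```plain
+-Xmx512m -Xms512m              # 框架堆内存 128m + 任务堆内存约 384m
+-XX:MaxDirectMemorySize=256m   # 框架堆外内存 128m + 任务堆外内存 0 + 网络内存约 128m
+-XX:MaxMetaspaceSize=256m      # JVM Metaspace
+```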
    + +## 受限的等比内存部分 + +本节介绍下列内存部分的配置方法,它们都可以通过指定在总内存中所占比例的方式进行配置,同时受限于相应的的最大/最小值范围。 +* *JVM 开销*:可以配置占用*进程总内存*的固定比例 +* *网络内存*:可以配置占用 *Flink 总内存*的固定比例(仅针对 TaskManager) + +相关内存部分的配置方法,请同时参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#detailed-memory-model) 和 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#detailed-configuration) 的详细内存模型。 + +这些内存部分的大小必须在相应的最大值、最小值范围内,否则 Flink 将无法启动。 +最大值、最小值具有默认值,也可以通过相应的配置参数进行设置。 +例如,如果仅配置下列参数: +- *进程总内存* = 1000Mb +- *JVM 开销最小值* = 64Mb +- *JVM 开销最大值* = 128Mb +- *JVM 开销占比* = 0.1 + +那么 *JVM 开销*的实际大小将会是 1000Mb x 0.1 = 100Mb,在 64-128Mb 的范围内。 + +如果将最大值、最小值设置成相同大小,那相当于明确指定了该内存部分的大小。 + +如果没有明确指定内存部分的大小,Flink 会根据总内存和占比计算出该内存部分的大小。 +计算得到的内存大小将受限于相应的最大值、最小值范围。 +例如,如果仅配置下列参数: +- *进程总内存* = 1000Mb +- *JVM 开销最小值* = 128Mb +- *JVM 开销最大值* = 256Mb +- *JVM 开销占比* = 0.1 + +那么 *JVM 开销*的实际大小将会是 128Mb,因为根据总内存和占比计算得到的内存大小 100Mb 小于最小值。 + +如果配置了总内存和其他内存部分的大小,那么 Flink 也有可能会忽略给定的占比。 +这种情况下,受限的等比内存部分的实际大小是总内存减去其他所有内存部分后剩余的部分。 +这样推导得出的内存大小必须符合最大值、最小值范围,否则 Flink 将无法启动。 +例如,如果仅配置下列参数: +- *进程总内存* = 1000Mb +- *任务堆内存* = 100Mb(或 JobManager 的 *JVM 堆内存*) +- *JVM 开销最小值* = 64Mb +- *JVM 开销最大值* = 256Mb +- *JVM 开销占比* = 0.1 + +*进程总内存*中所有其他内存部分均有默认大小,包括 TaskManager 的*托管内存*默认占比或 JobManager 的默认*堆外内存*。 +因此,*JVM 开销*的实际大小不是根据占比算出的大小(1000Mb x 0.1 = 100Mb),而是*进程总内存*中剩余的部分。 +这个剩余部分的大小必须在 64-256Mb 的范围内,否则将会启动失败。 diff --git a/docs/content.zh/docs/deployment/memory/mem_setup_jobmanager.md b/docs/content.zh/docs/deployment/memory/mem_setup_jobmanager.md new file mode 100644 index 0000000000000..107f04d11ac7d --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_setup_jobmanager.md @@ -0,0 +1,109 @@ +--- +title: "配置 JobManager 内存" +weight: 4 +type: docs +aliases: + - /zh/deployment/memory/mem_setup_jobmanager.html + - /zh/ops/memory/mem_setup_jobmanager.html +--- + + +# 配置 JobManager 内存 + +JobManager 是 Flink 集群的控制单元。 +它由三种不同的组件组成:ResourceManager、Dispatcher 和每个正在运行作业的 JobMaster。 +本篇文档将介绍 JobManager 内存在整体上以及细粒度上的配置方法。 + +本文接下来介绍的内存配置方法适用于 *1.11* 及以上版本。 +Flink 在 *1.11* 版本中对内存配置部分进行了较大幅度的改动,从早期版本升级的用户请参考[升级指南]({{< ref "docs/deployment/memory/mem_migration" >}})。 + +提示 +本篇内存配置文档仅针对 JobManager! +与 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}) 相比,JobManager 具有相似但更加简单的内存模型。 + + + +## 配置总内存 + +配置 JobManager 内存最简单的方法就是进程的[配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)。 +[本地执行模式](#local-execution)下不需要为 JobManager 进行内存配置,配置参数将不会生效。 + + + +## 详细配置 + +{{< img src="/fig/process_mem_model.svg" width="300px" alt="Flink's process memory model" usemap="#process-mem-model" >}} + +
    + +如上图所示,下表中列出了 Flink JobManager 内存模型的所有组成部分,以及影响其大小的相关配置参数。 + +|   **组成部分**   |   **配置参数**   |   **描述**   | +| :------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [JVM 堆内存](#configure-jvm-heap) | [`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size) | JobManager 的 *JVM 堆内存*。 | +| [堆外内存](#configure-off-heap-memory) | [`jobmanager.memory.off-heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-off-heap-size) | JobManager 的*堆外内存(直接内存或本地内存)*。 | +| [JVM Metaspace]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters) | [`jobmanager.memory.jvm-metaspace.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-metaspace-size) | Flink JVM 进程的 Metaspace。 | +| JVM 开销 | [`jobmanager.memory.jvm-overhead.min`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-overhead-min)
    [`jobmanager.memory.jvm-overhead.max`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-overhead-max)
    [`jobmanager.memory.jvm-overhead.fraction`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-overhead-fraction) | 用于其他 JVM 开销的本地内存,例如栈空间、垃圾回收空间等。该内存部分为基于[进程总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)的[受限的等比内存部分]({{< ref "docs/deployment/memory/mem_setup" >}}#capped-fractionated-components)。 | + +
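+例如,一个(仅作示意的)细粒度 JobManager 内存配置片段如下,其中只显式指定 JVM 堆内存和堆外内存,其余部分由 Flink 按默认值自动推导:
+
+```yaml
+jobmanager.memory.heap.size: 1024m
+jobmanager.memory.off-heap.size: 128m
+```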
    + +
    + +### 配置 JVM 堆内存 + +如[配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)中所述,另一种配置 JobManager 内存的方式是明确指定 *JVM 堆内存*的大小([`jobmanager.memory.heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-heap-size))。 +通过这种方式,用户可以更好地掌控用于以下用途的 *JVM 堆内存*大小。 +* Flink 框架 +* 在作业提交时(例如一些特殊的批处理 Source)及 Checkpoint 完成的回调函数中执行的用户代码 + +Flink 需要多少 *JVM 堆内存*,很大程度上取决于运行的作业数量、作业的结构及上述用户代码的需求。 + +提示 +如果已经明确设置了 *JVM 堆内存*,建议不要再设置*进程总内存*或 *Flink 总内存*,否则可能会造成内存配置冲突。 + +在启动 JobManager 进程时,Flink 启动脚本及客户端通过设置 JVM 参数 *-Xms* 和 *-Xmx* 来管理 JVM 堆空间的大小。 +请参考 [JVM 参数]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)。 + + + +### 配置堆外内存 + +*堆外内存*包括 *JVM 直接内存* 和 *本地内存*。 +可以通过配置参数 [`jobmanager.memory.enable-jvm-direct-memory-limit`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-enable-jvm-direct-memory-limit) 设置是否启用 *JVM 直接内存限制*。 +如果该配置项设置为 `true`,Flink 会根据配置的*堆外内存*大小设置 JVM 参数 *-XX:MaxDirectMemorySize*。 +请参考 [JVM 参数]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)。 + +可以通过配置参数 [`jobmanager.memory.off-heap.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-off-heap-size) 设置堆外内存的大小。 +如果遇到 JobManager 进程抛出 “OutOfMemoryError: Direct buffer memory” 的异常,可以尝试调大这项配置。 +请参考[常见问题]({{< ref "docs/deployment/memory/mem_trouble" >}}#outofmemoryerror-direct-buffer-memory)。 + +以下情况可能用到堆外内存: +* Flink 框架依赖(例如 Akka 的网络通信) +* 在作业提交时(例如一些特殊的批处理 Source)及 Checkpoint 完成的回调函数中执行的用户代码 + +提示 +如果同时配置了 [Flink 总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)和 [JVM 堆内存](#configure-jvm-heap),且没有配置*堆外内存*,那么*堆外内存*的大小将会是 [Flink 总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)减去[JVM 堆内存](#configure-jvm-heap)。 +这种情况下,*堆外内存*的默认大小将不会生效。 + + + +## 本地执行 + +如果你是在本地运行 Flink(例如在 IDE 中)而非创建一个集群,那么 JobManager 的内存配置将不会生效。 diff --git a/docs/content.zh/docs/deployment/memory/mem_setup_tm.md b/docs/content.zh/docs/deployment/memory/mem_setup_tm.md new file mode 100644 index 0000000000000..db7152fc3295c --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_setup_tm.md @@ -0,0 +1,205 @@ +--- +title: "配置 TaskManager 内存" +weight: 3 +type: docs +aliases: + - /zh/deployment/memory/mem_setup_tm.html + - /zh/ops/memory/mem_setup_tm.html +--- + + +# 配置 TaskManager 内存 + +Flink 的 TaskManager 负责执行用户代码。 +根据实际需求为 TaskManager 配置内存将有助于减少 Flink 的资源占用,增强作业运行的稳定性。 + +本文接下来介绍的内存配置方法适用于 *1.10* 及以上版本。 +Flink 在 1.10 版本中对内存配置部分进行了较大幅度的改动,从早期版本升级的用户请参考[升级指南]({{< ref "docs/deployment/memory/mem_migration" >}})。 + +提示 +本篇内存配置文档仅针对 TaskManager! +与 [JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}) 相比,TaskManager 具有相似但更加复杂的内存模型。 + + + +## 配置总内存 + +Flink JVM 进程的*进程总内存(Total Process Memory)*包含了由 Flink 应用使用的内存(*Flink 总内存*)以及由运行 Flink 的 JVM 使用的内存。 +其中,*Flink 总内存(Total Flink Memory)*包括 JVM 堆内存(Heap Memory)、*托管内存(Managed Memory)*以及其他直接内存(Direct Memory)或本地内存(Native Memory)。 + +{{< img src="/fig/simple_mem_model.svg" width="300px" alt="Simple memory model" usemap="#simple-mem-model" >}} + +
    + +如果你是在本地运行 Flink(例如在 IDE 中)而非创建一个集群,那么本文介绍的配置并非所有都是适用的,详情请参考[本地执行](#local-execution)。 + +其他情况下,配置 Flink 内存最简单的方法就是[配置总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)。 +此外,Flink 也支持[更细粒度的内存配置方式](#configure-heap-and-managed-memory)。 + +Flink 会根据默认值或其他配置参数自动调整剩余内存部分的大小。 +接下来的章节将介绍关于各内存部分的更多细节。 + +
    + +## 配置堆内存和托管内存 + +如[配置总内存](#configure-total-memory)中所述,另一种配置 Flink 内存的方式是同时设置[任务堆内存](#task-operator-heap-memory)和[托管内存](#managed-memory)。 +通过这种方式,用户可以更好地掌控用于 Flink 任务的 JVM 堆内存及 Flink 的[托管内存](#managed-memory)大小。 + +Flink 会根据默认值或其他配置参数自动调整剩余内存部分的大小。 +关于各内存部分的更多细节,请参考[相关文档](#detailed-memory-model)。 + +提示 +如果已经明确设置了任务堆内存和托管内存,建议不要再设置*进程总内存*或 *Flink 总内存*,否则可能会造成内存配置冲突。 + + + +### 任务(算子)堆内存 + +如果希望确保指定大小的 JVM 堆内存给用户代码使用,可以明确指定*任务堆内存*([`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size))。 +指定的内存将被包含在总的 JVM 堆空间中,专门用于 Flink 算子及用户代码的执行。 + + + +### 托管内存 + +*托管内存*是由 Flink 负责分配和管理的本地(堆外)内存。 +以下场景需要使用*托管内存*: +* 流处理作业中用于 [RocksDB State Backend]({{< ref "docs/ops/state/state_backends" >}}#the-rocksdbstatebackend)。 +* [批处理作业]({{< ref "docs/dev/dataset/overview" >}})中用于排序、哈希表及缓存中间结果。 +* 流处理和批处理作业中用于[在 Python 进程中执行用户自定义函数]({{< ref "docs/dev/python/table/udfs/python_udfs" >}})。 + +可以通过以下两种范式指定*托管内存*的大小: +* 通过 [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size) 明确指定其大小。 +* 通过 [`taskmanager.memory.managed.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-fraction) 指定在*Flink 总内存*中的占比。 + +当同时指定二者时,会优先采用指定的大小(Size)。 +若二者均未指定,会根据[默认占比]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-fraction)进行计算。 + +请同时参考[如何配置 State Backend 内存]({{< ref "docs/deployment/memory/mem_tuning" >}}#configure-memory-for-state-backends)以及[如何配置批处理作业内存]({{< ref "docs/deployment/memory/mem_tuning" >}}#configure-memory-for-batch-jobs)。 + + + +#### 消费者权重 + +对于包含不同种类的托管内存消费者的作业,可以进一步控制托管内存如何在消费者之间分配。 +通过 [`taskmanager.memory.managed.consumer-weights`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-consumer-weights) 可以为每一种类型的消费者指定一个权重,Flink 会按照权重的比例进行内存分配。 +目前支持的消费者类型包括: +* `DATAPROC`:用于流处理中的 RocksDB State Backend 和批处理中的内置算法。 +* `PYTHON`:用户 Python 进程。 + +例如,一个流处理作业同时使用到了 RocksDB State Backend 和 Python UDF,消费者权重设置为 `DATAPROC:70,PYTHON:30`,那么 Flink 会将 `70%` 的托管内存用于 RocksDB State Backend,`30%` 留给 Python 进程。 + +提示 +只有作业中包含某种类型的消费者时,Flink 才会为该类型分配托管内存。 +例如,一个流处理作业使用 Heap State Backend 和 Python UDF,消费者权重设置为 `DATAPROC:70,PYTHON:30`,那么 Flink 会将全部托管内存用于 Python 进程,因为 Heap State Backend 不使用托管内存。 + +提示 +对于未出现在消费者权重中的类型,Flink 将不会为其分配托管内存。 +如果缺失的类型是作业运行所必须的,则会引发内存分配失败。 +默认情况下,消费者权重中包含了所有可能的消费者类型。 +上述问题仅可能出现在用户显式地配置了消费者权重的情况下。 + + + +## 配置堆外内存(直接内存或本地内存) + +用户代码中分配的堆外内存被归为*任务堆外内存(Task Off-heap Memory)*,可以通过 [`taskmanager.memory.task.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-off-heap-size) 指定。 + +提示 +你也可以调整[框架堆外内存(Framework Off-heap Memory)](#framework-memory)。 +这是一个进阶配置,建议仅在确定 Flink 框架需要更多的内存时调整该配置。 + +Flink 将*框架堆外内存*和*任务堆外内存*都计算在 JVM 的*直接内存*限制中,请参考 [JVM 参数]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)。 + +提示 +本地内存(非直接内存)也可以被归在*框架堆外内存*或*任务堆外内存*中,在这种情况下 JVM 的*直接内存*限制可能会高于实际需求。 + +提示 +*网络内存(Network Memory)*同样被计算在 JVM *直接内存*中。 +Flink 会负责管理网络内存,保证其实际用量不会超过配置大小。 +因此,调整*网络内存*的大小不会对其他堆外内存有实质上的影响。 + +请参考[内存模型详解](#detailed-memory-model)。 + + + +## 内存模型详解 + +
    + +{{< img src="/fig/detailed-mem-model.svg" width="300px" alt="Simple memory model" usemap="#simple-mem-model" >}} + + +
    + +如上图所示,下表中列出了 Flink TaskManager 内存模型的所有组成部分,以及影响其大小的相关配置参数。 + +|   **组成部分**   |   **配置参数**   |   **描述**   | +| :------------------------------------------------------------------| :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [框架堆内存(Framework Heap Memory)](#framework-memory) | [`taskmanager.memory.framework.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-framework-heap-size) | 用于 Flink 框架的 JVM 堆内存(进阶配置)。 | +| [任务堆内存(Task Heap Memory)](#task-operator-heap-memory) | [`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size) | 用于 Flink 应用的算子及用户代码的 JVM 堆内存。 | +| [托管内存(Managed memory)](#managed-memory) | [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size)
    [`taskmanager.memory.managed.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-fraction) | 由 Flink 管理的用于排序、哈希表、缓存中间结果及 RocksDB State Backend 的本地内存。 | +| [框架堆外内存(Framework Off-heap Memory)](#framework-memory) | [`taskmanager.memory.framework.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-framework-off-heap-size) | 用于 Flink 框架的[堆外内存(直接内存或本地内存)](#configure-off-heap-memory-direct-or-native)(进阶配置)。 | +| [任务堆外内存(Task Off-heap Memory)](#configure-off-heap-memory-direct-or-native)| [`taskmanager.memory.task.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-off-heap-size) | 用于 Flink 应用的算子及用户代码的[堆外内存(直接内存或本地内存)](#configure-off-heap-memory-direct-or-native)。 | +| 网络内存(Network Memory) | [`taskmanager.memory.network.min`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-min)
    [`taskmanager.memory.network.max`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-max)
    [`taskmanager.memory.network.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-fraction) | 用于任务之间数据传输的直接内存(例如网络传输缓冲)。该内存部分为基于 [Flink 总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)的[受限的等比内存部分]({{< ref "docs/deployment/memory/mem_setup" >}}#capped-fractionated-components)。 | +| [JVM Metaspace]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters) | [`taskmanager.memory.jvm-metaspace.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-metaspace-size) | Flink JVM 进程的 Metaspace。 | +| JVM 开销 | [`taskmanager.memory.jvm-overhead.min`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-overhead-min)
    [`taskmanager.memory.jvm-overhead.max`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-overhead-max)
    [`taskmanager.memory.jvm-overhead.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-overhead-fraction) | 用于其他 JVM 开销的本地内存,例如栈空间、垃圾回收空间等。该内存部分为基于[进程总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)的[受限的等比内存部分]({{< ref "docs/deployment/memory/mem_setup" >}}#capped-fractionated-components)。 | + +
    + +我们可以看到,有些内存部分的大小可以直接通过一个配置参数进行设置,有些则需要根据多个参数进行调整。 + +
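+
+例如,下面是一段 `flink-conf.yaml` 的配置示意(其中的数值仅为示例,并非推荐值,需根据实际作业情况调整),展示了如何按照上文所述同时显式指定任务堆内存和托管内存:
+
+```yaml
+# 显式指定任务(算子)堆内存,专门用于算子及用户代码
+taskmanager.memory.task.heap.size: 2048m
+# 显式指定托管内存;若同时设置了 size 与 fraction,以 size 为准
+taskmanager.memory.managed.size: 1024m
+# 可选:按权重在不同类型的托管内存消费者之间分配
+taskmanager.memory.managed.consumer-weights: DATAPROC:70,PYTHON:30
+```
+
+采用这种配置方式时,如上文提示所述,建议不要再同时设置*进程总内存*或 *Flink 总内存*,其余内存部分会由 Flink 根据默认值自动推算。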
    + +## 框架内存 + +通常情况下,不建议对*框架堆内存*和*框架堆外内存*进行调整。 +除非你非常肯定 Flink 的内部数据结构及操作需要更多的内存。 +这可能与具体的部署环境及作业结构有关,例如非常高的并发度。 +此外,Flink 的部分依赖(例如 Hadoop)在某些特定的情况下也可能会需要更多的直接内存或本地内存。 + +提示 +不管是堆内存还是堆外内存,Flink 中的框架内存和任务内存之间目前是没有隔离的。 +对框架和任务内存的区分,主要是为了在后续版本中做进一步优化。 + + + +## 本地执行 +如果你是将 Flink 作为一个单独的 Java 程序运行在你的电脑本地而非创建一个集群(例如在 IDE 中),那么只有下列配置会生效,其他配置参数则不会起到任何效果: + +|   **组成部分**   |   **配置参数**   |   **本地执行时的默认值**   | +| :------------------------------------------- | :---------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------ | +| 任务堆内存 | [`taskmanager.memory.task.heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-heap-size) | 无穷大 | +| 任务堆外内存 | [`taskmanager.memory.task.off-heap.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-task-off-heap-size) | 无穷大 | +| 托管内存 | [`taskmanager.memory.managed.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-managed-size) | 128Mb | +| 网络内存 | [`taskmanager.memory.network.min`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-min)
    [`taskmanager.memory.network.max`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-max) | 64Mb | + +
    + +本地执行模式下,上面列出的所有内存部分均可以但不是必须进行配置。 +如果未配置,则会采用默认值。 +其中,[任务堆内存](#task-operator-heap-memory)和*任务堆外内存*的默认值无穷大(*Long.MAX_VALUE* 字节),以及[托管内存](#managed-memory)的默认值 128Mb 均只针对本地执行模式。 + +提示 +这种情况下,任务堆内存的大小与实际的堆空间大小无关。 +该配置参数可能与后续版本中的进一步优化相关。 +本地执行模式下,JVM 堆空间的实际大小不受 Flink 掌控,而是取决于本地执行进程是如何启动的。 +如果希望控制 JVM 的堆空间大小,可以在启动进程时明确地指定相关的 JVM 参数,即 *-Xmx* 和 *-Xms*。 diff --git a/docs/content.zh/docs/deployment/memory/mem_trouble.md b/docs/content.zh/docs/deployment/memory/mem_trouble.md new file mode 100644 index 0000000000000..164f921924100 --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_trouble.md @@ -0,0 +1,79 @@ +--- +title: "常见问题" +weight: 6 +type: docs +aliases: + - /zh/deployment/memory/mem_trouble.html + - /zh/ops/memory/mem_trouble.html +--- + + +# 常见问题 + +## IllegalConfigurationException + +如果遇到从 *TaskExecutorProcessUtils* 或 *JobManagerProcessUtils* 抛出的 *IllegalConfigurationException* 异常,这通常说明您的配置参数中存在无效值(例如内存大小为负数、占比大于 1 等)或者配置冲突。 +请根据异常信息,确认出错的内存部分的相关文档及[配置信息]({{< ref "docs/deployment/config" >}}#memory-configuration)。 + +## OutOfMemoryError: Java heap space + +该异常说明 JVM 的堆空间过小。 +可以通过增大[总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)、TaskManager 的[任务堆内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#task-operator-heap-memory)、JobManager 的 [JVM 堆内存]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#configure-jvm-heap)等方法来增大 JVM 堆空间。 + +提示 +也可以增大 TaskManager 的[框架堆内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#framework-memory)。 +这是一个进阶配置,只有在确认是 Flink 框架自身需要更多内存时才应该去调整。 + +## OutOfMemoryError: Direct buffer memory + +该异常通常说明 JVM 的*直接内存*限制过小,或者存在*直接内存泄漏(Direct Memory Leak)*。 +请确认用户代码及外部依赖中是否使用了 JVM *直接内存*,以及如果使用了直接内存,是否配置了足够的内存空间。 +可以通过调整堆外内存来增大直接内存限制。 +有关堆外内存的配置方法,请参考 [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#configure-off-heap-memory-direct-or-native)、[JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}}#configure-off-heap-memory) 以及 [JVM 参数]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)的相关文档。 + +## OutOfMemoryError: Metaspace + +该异常说明 [JVM Metaspace 限制]({{< ref "docs/deployment/memory/mem_setup" >}}#jvm-parameters)过小。 +可以尝试调整 [TaskManager]({{< ref "docs/deployment/config" >}}#taskmanager-memory-jvm-metaspace-size)、[JobManager]({{< ref "docs/deployment/config" >}}#jobmanager-memory-jvm-metaspace-size) 的 JVM Metaspace。 + +## IOException: Insufficient number of network buffers + +该异常仅与 TaskManager 相关。 + +该异常通常说明[网络内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#detailed-memory-model)过小。 +可以通过调整以下配置参数增大*网络内存*: +* [`taskmanager.memory.network.min`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-min) +* [`taskmanager.memory.network.max`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-max) +* [`taskmanager.memory.network.fraction`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-network-fraction) + +
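+
+例如,可以在 `flink-conf.yaml` 中按照如下方式调大网络内存的上下限及占比(具体数值仅为示意,应结合作业的并发度和实际报错信息调整):
+
+```yaml
+taskmanager.memory.network.min: 128mb
+taskmanager.memory.network.max: 1gb
+taskmanager.memory.network.fraction: 0.2
+```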
+ +## 容器(Container)内存超用 + +如果 Flink 容器尝试分配超过其申请大小的内存(Yarn、Mesos 或 Kubernetes),这通常说明 Flink 没有预留出足够的本地内存。 +可以通过外部监控系统或者容器被部署环境杀掉时的错误信息判断是否存在容器内存超用。 + +对于 *JobManager* 进程,你还可以尝试启用 *JVM 直接内存限制*([`jobmanager.memory.enable-jvm-direct-memory-limit`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-enable-jvm-direct-memory-limit)),以排除 *JVM 直接内存泄漏*的可能性。 + +如果使用了 [RocksDBStateBackend]({{< ref "docs/ops/state/state_backends" >}}#rocksdbstatebackend) 且没有开启内存控制,也可以尝试增大 TaskManager 的[托管内存]({{< ref "docs/deployment/memory/mem_setup" >}}#managed-memory)。 + +此外,还可以尝试增大 [JVM 开销]({{< ref "docs/deployment/memory/mem_setup" >}}#capped-fractionated-components)。 + +请参考[如何配置容器内存]({{< ref "docs/deployment/memory/mem_tuning" >}}#configure-memory-for-containers)。 diff --git a/docs/content.zh/docs/deployment/memory/mem_tuning.md b/docs/content.zh/docs/deployment/memory/mem_tuning.md new file mode 100644 index 0000000000000..bfb0d4c751faa --- /dev/null +++ b/docs/content.zh/docs/deployment/memory/mem_tuning.md @@ -0,0 +1,89 @@ +--- +title: "调优指南" +weight: 5 +type: docs +aliases: + - /zh/deployment/memory/mem_tuning.html + - /zh/ops/memory/mem_tuning.html +--- + + +# 调优指南 + +本文在基本的[配置指南]({{< ref "docs/deployment/memory/mem_setup" >}})的基础上,介绍如何根据具体的使用场景调整内存配置,以及在不同使用场景下分别需要重点关注哪些配置参数。 + + + +## 独立部署模式(Standalone Deployment)下的内存配置 + +[独立部署模式]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})下,我们通常更关注 Flink 应用本身使用的内存大小。 +建议配置 [Flink 总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)([`taskmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-flink-size) 或者 [`jobmanager.memory.flink.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-flink-size))或其组成部分。 +此外,如果出现 [Metaspace 不足的问题]({{< ref "docs/deployment/memory/mem_trouble" >}}#outofmemoryerror-metaspace),可以调整 *JVM Metaspace* 的大小。 + +这种情况下通常无需配置*进程总内存*,因为不管是 Flink 还是部署环境都不会对 *JVM 开销* 进行限制,它只与机器的物理资源相关。 + + + +## 容器(Container)的内存配置 + +在容器化部署模式(Containerized Deployment)下([Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}})、[Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}) 或 [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}})),建议配置[进程总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)([`taskmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size) 或者 [`jobmanager.memory.process.size`]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size))。 +该配置参数用于指定分配给 Flink *JVM 进程*的总内存,也就是需要申请的容器大小。 + +提示 +如果配置了 *Flink 总内存*,Flink 会自动加上 JVM 相关的内存部分,根据推算出的*进程总内存*大小申请容器。 + +
    + 注意: 如果 Flink 或者用户代码分配超过容器大小的非托管的堆外(本地)内存,部署环境可能会杀掉超用内存的容器,造成作业执行失败。 +
    + +请参考[容器内存超用]({{< ref "docs/deployment/memory/mem_trouble" >}}#container-memory-exceeded)中的相关描述。 + +
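+
+例如,在容器化部署模式下,通常只需显式配置进程总内存(具体数值仅为示意,应根据作业实际需要和容器资源配额调整):
+
+```yaml
+# TaskManager 容器需要申请的总内存
+taskmanager.memory.process.size: 4096m
+# JobManager 容器需要申请的总内存
+jobmanager.memory.process.size: 1600m
+```
+
+其余各内存部分将由 Flink 在该总量范围内自动推算得出。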
+ +## State Backend 的内存配置 + +本章节内容仅与 TaskManager 相关。 + +在部署 Flink 流处理应用时,可以根据 [State Backend]({{< ref "docs/ops/state/state_backends" >}}) 的类型对集群的配置进行优化。 + +### Heap State Backend + +执行无状态作业或者使用 Heap State Backend([MemoryStateBackend]({{< ref "docs/ops/state/state_backends" >}}#memorystatebackend) +或 [FsStateBackend]({{< ref "docs/ops/state/state_backends" >}}#fsstatebackend))时,建议将[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)设置为 0。 +这样能够最大化分配给 JVM 上用户代码的内存。 + +### RocksDB State Backend + +[RocksDBStateBackend]({{< ref "docs/ops/state/state_backends" >}}#rocksdbstatebackend) 使用本地内存。 +默认情况下,RocksDB 会限制其内存用量不超过用户配置的[*托管内存*]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 +因此,使用这种方式存储状态时,配置足够多的*托管内存*是十分重要的。 +如果你关闭了 RocksDB 的内存控制,那么在容器化部署模式下如果 RocksDB 分配的内存超出了申请容器的大小([进程总内存]({{< ref "docs/deployment/memory/mem_setup" >}}#configure-total-memory)),可能会造成 TaskExecutor 被部署环境杀掉。 +请同时参考[如何调整 RocksDB 内存]({{< ref "docs/ops/state/large_state_tuning" >}}#tuning-rocksdb-memory)以及 [state.backend.rocksdb.memory.managed]({{< ref "docs/deployment/config" >}}#state-backend-rocksdb-memory-managed)。 + + + +## 批处理作业的内存配置 + +Flink 批处理算子使用[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)来提高处理效率。 +算子运行时,部分操作可以直接在原始数据上进行,而无需将数据反序列化成 Java 对象。 +这意味着[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)对应用的性能具有实质上的影响。 +因此 Flink 会在不超过其配置限额的前提下,尽可能分配更多的[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)。 +Flink 明确知道可以使用的内存大小,因此可以有效避免 `OutOfMemoryError` 的发生。 +当[托管内存]({{< ref "docs/deployment/memory/mem_setup_tm" >}}#managed-memory)不足时,Flink 会优雅地将数据落盘。 diff --git a/docs/content.zh/docs/deployment/metric_reporters.md b/docs/content.zh/docs/deployment/metric_reporters.md new file mode 100644 index 0000000000000..223f429a0d874 --- /dev/null +++ b/docs/content.zh/docs/deployment/metric_reporters.md @@ -0,0 +1,282 @@ +--- +title: "Metric Reporters" +weight: 7 +type: docs +aliases: + - /zh/deployment/metric_reporters.html +--- + + +# Metric Reporters + +Flink allows reporting metrics to external systems. +For more information about Flink's metric system go to the [metric system documentation]({{< ref "docs/ops/metrics" >}}). + + +## Reporter + +Metrics can be exposed to an external system by configuring one or several reporters in `conf/flink-conf.yaml`. These +reporters will be instantiated on each job and task manager when they are started. + +- `metrics.reporter.<name>.<config>`: Generic setting `<config>` for the reporter named `<name>`. +- `metrics.reporter.<name>.class`: The reporter class to use for the reporter named `<name>`. +- `metrics.reporter.<name>.factory.class`: The reporter factory class to use for the reporter named `<name>`. +- `metrics.reporter.<name>.interval`: The reporter interval to use for the reporter named `<name>`. +- `metrics.reporter.<name>.scope.delimiter`: The delimiter to use for the identifier (default value use `metrics.scope.delimiter`) for the reporter named `<name>`. +- `metrics.reporter.<name>.scope.variables.excludes`: (optional) A semi-colon (;) separated list of variables that should be ignored by tag-based reporters (e.g., Prometheus, InfluxDB). +- `metrics.reporters`: (optional) A comma-separated include list of reporter names. By default all configured reporters will be used. + +All reporters must at least have either the `class` or `factory.class` property. Which property may/should be used depends on the reporter implementation. See the individual reporter configuration sections for more information. 
+Some reporters (referred to as `Scheduled`) allow specifying a reporting `interval`. +Below more settings specific to each reporter will be listed. + +Example reporter configuration that specifies multiple reporters: + +```yaml +metrics.reporters: my_jmx_reporter,my_other_reporter + +metrics.reporter.my_jmx_reporter.factory.class: org.apache.flink.metrics.jmx.JMXReporterFactory +metrics.reporter.my_jmx_reporter.port: 9020-9040 +metrics.reporter.my_jmx_reporter.scope.variables.excludes:job_id;task_attempt_num + +metrics.reporter.my_other_reporter.class: org.apache.flink.metrics.graphite.GraphiteReporter +metrics.reporter.my_other_reporter.host: 192.168.1.1 +metrics.reporter.my_other_reporter.port: 10000 + +``` + +**Important:** The jar containing the reporter must be accessible when Flink is started. Reporters that support the + `factory.class` property can be loaded as [plugins]({{< ref "docs/deployment/filesystems/plugins" >}}). Otherwise the jar must be placed + in the /lib folder. Reporters that are shipped with Flink (i.e., all reporters documented on this page) are available + by default. + +You can write your own `Reporter` by implementing the `org.apache.flink.metrics.reporter.MetricReporter` interface. +If the Reporter should send out reports regularly you have to implement the `Scheduled` interface as well. +By additionally implementing a `MetricReporterFactory` your reporter can also be loaded as a plugin. + +The following sections list the supported reporters. + +### JMX +#### (org.apache.flink.metrics.jmx.JMXReporter) + +You don't have to include an additional dependency since the JMX reporter is available by default +but not activated. + +Parameters: + +- `port` - (optional) the port on which JMX listens for connections. +In order to be able to run several instances of the reporter on one host (e.g. when one TaskManager is colocated with the JobManager) it is advisable to use a port range like `9250-9260`. +When a range is specified the actual port is shown in the relevant job or task manager log. +If this setting is set Flink will start an extra JMX connector for the given port/range. +Metrics are always available on the default local JMX interface. + +Example configuration: + +```yaml + +metrics.reporter.jmx.factory.class: org.apache.flink.metrics.jmx.JMXReporterFactory +metrics.reporter.jmx.port: 8789 + +``` + +Metrics exposed through JMX are identified by a domain and a list of key-properties, which together form the object name. + +The domain always begins with `org.apache.flink` followed by a generalized metric identifier. In contrast to the usual +identifier it is not affected by scope-formats, does not contain any variables and is constant across jobs. +An example for such a domain would be `org.apache.flink.job.task.numBytesOut`. + +The key-property list contains the values for all variables, regardless of configured scope formats, that are associated +with a given metric. +An example for such a list would be `host=localhost,job_name=MyJob,task_name=MyTask`. + +The domain thus identifies a metric class, while the key-property list identifies one (or multiple) instances of that metric. 
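+
+Putting the two examples above together, the full JMX object name of such a metric would look roughly as follows (using the illustrative variable values from above):
+
+```
+org.apache.flink.job.task.numBytesOut:host=localhost,job_name=MyJob,task_name=MyTask
+```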
+ +### Graphite +#### (org.apache.flink.metrics.graphite.GraphiteReporter) + +Parameters: + +- `host` - the Graphite server host +- `port` - the Graphite server port +- `protocol` - protocol to use (TCP/UDP) + +Example configuration: + +```yaml + +metrics.reporter.grph.factory.class: org.apache.flink.metrics.graphite.GraphiteReporterFactory +metrics.reporter.grph.host: localhost +metrics.reporter.grph.port: 2003 +metrics.reporter.grph.protocol: TCP +metrics.reporter.grph.interval: 60 SECONDS + +``` + +### InfluxDB +#### (org.apache.flink.metrics.influxdb.InfluxdbReporter) + +In order to use this reporter you must copy `/opt/flink-metrics-influxdb-{{< version >}}.jar` into the `plugins/influxdb` folder +of your Flink distribution. + +Parameters: + +{{< generated/influxdb_reporter_configuration >}} + +Example configuration: + +```yaml + +metrics.reporter.influxdb.factory.class: org.apache.flink.metrics.influxdb.InfluxdbReporterFactory +metrics.reporter.influxdb.scheme: http +metrics.reporter.influxdb.host: localhost +metrics.reporter.influxdb.port: 8086 +metrics.reporter.influxdb.db: flink +metrics.reporter.influxdb.username: flink-metrics +metrics.reporter.influxdb.password: qwerty +metrics.reporter.influxdb.retentionPolicy: one_hour +metrics.reporter.influxdb.consistency: ANY +metrics.reporter.influxdb.connectTimeout: 60000 +metrics.reporter.influxdb.writeTimeout: 60000 +metrics.reporter.influxdb.interval: 60 SECONDS + +``` + +The reporter would send metrics using http protocol to the InfluxDB server with the specified retention policy (or the default policy specified on the server). +All Flink metrics variables (see [List of all Variables]({{< ref "docs/ops/metrics" >}}#list-of-all-variables)) are exported as InfluxDB tags. + +### Prometheus +#### (org.apache.flink.metrics.prometheus.PrometheusReporter) + +Parameters: + +- `port` - (optional) the port the Prometheus exporter listens on, defaults to [9249](https://github.com/prometheus/prometheus/wiki/Default-port-allocations). In order to be able to run several instances of the reporter on one host (e.g. when one TaskManager is colocated with the JobManager) it is advisable to use a port range like `9250-9260`. +- `filterLabelValueCharacters` - (optional) Specifies whether to filter label value characters. If enabled, all characters not matching \[a-zA-Z0-9:_\] will be removed, otherwise no characters will be removed. Before disabling this option please ensure that your label values meet the [Prometheus requirements](https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels). + +Example configuration: + +```yaml + +metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter + +``` + +Flink metric types are mapped to Prometheus metric types as follows: + +| Flink | Prometheus | Note | +| --------- |------------|------------------------------------------| +| Counter | Gauge |Prometheus counters cannot be decremented.| +| Gauge | Gauge |Only numbers and booleans are supported. | +| Histogram | Summary |Quantiles .5, .75, .95, .98, .99 and .999 | +| Meter | Gauge |The gauge exports the meter's rate. | + +All Flink metrics variables (see [List of all Variables]({{< ref "docs/ops/metrics" >}}#list-of-all-variables)) are exported to Prometheus as labels. 
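+
+When several Flink processes run on the same host, a configuration along the following lines (the values are only an illustration) gives each reporter instance its own port from the configured range:
+
+```yaml
+
+metrics.reporter.prom.class: org.apache.flink.metrics.prometheus.PrometheusReporter
+metrics.reporter.prom.port: 9250-9260
+metrics.reporter.prom.filterLabelValueCharacters: true
+
+```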
+ +### PrometheusPushGateway +#### (org.apache.flink.metrics.prometheus.PrometheusPushGatewayReporter) + +Parameters: + +{{< generated/prometheus_push_gateway_reporter_configuration >}} + +Example configuration: + +```yaml + +metrics.reporter.promgateway.class: org.apache.flink.metrics.prometheus.PrometheusPushGatewayReporter +metrics.reporter.promgateway.host: localhost +metrics.reporter.promgateway.port: 9091 +metrics.reporter.promgateway.jobName: myJob +metrics.reporter.promgateway.randomJobNameSuffix: true +metrics.reporter.promgateway.deleteOnShutdown: false +metrics.reporter.promgateway.groupingKey: k1=v1;k2=v2 +metrics.reporter.promgateway.interval: 60 SECONDS + +``` + +The PrometheusPushGatewayReporter pushes metrics to a [Pushgateway](https://github.com/prometheus/pushgateway), which can be scraped by Prometheus. + +Please see the [Prometheus documentation](https://prometheus.io/docs/practices/pushing/) for use-cases. + +### StatsD +#### (org.apache.flink.metrics.statsd.StatsDReporter) + +Parameters: + +- `host` - the StatsD server host +- `port` - the StatsD server port + +Example configuration: + +```yaml + +metrics.reporter.stsd.factory.class: org.apache.flink.metrics.statsd.StatsDReporterFactory +metrics.reporter.stsd.host: localhost +metrics.reporter.stsd.port: 8125 +metrics.reporter.stsd.interval: 60 SECONDS + +``` + +### Datadog +#### (org.apache.flink.metrics.datadog.DatadogHttpReporter) + +Note any variables in Flink metrics, such as ``, ``, ``, ``, ``, and ``, +will be sent to Datadog as tags. Tags will look like `host:localhost` and `job_name:myjobname`. + +Note Histograms are exposed as a series of gauges following the naming convention of Datadog histograms (`.`). +The `min` aggregation is reported by default, whereas `sum` is not available. +In contrast to Datadog-provided Histograms the reported aggregations are not computed for a specific reporting interval. + +Parameters: + +- `apikey` - the Datadog API key +- `tags` - (optional) the global tags that will be applied to metrics when sending to Datadog. Tags should be separated by comma only +- `proxyHost` - (optional) The proxy host to use when sending to Datadog. +- `proxyPort` - (optional) The proxy port to use when sending to Datadog, defaults to 8080. +- `dataCenter` - (optional) The data center (`EU`/`US`) to connect to, defaults to `US`. +- `maxMetricsPerRequest` - (optional) The maximum number of metrics to include in each request, defaults to 2000. 
+ +Example configuration: + +```yaml + +metrics.reporter.dghttp.factory.class: org.apache.flink.metrics.datadog.DatadogHttpReporterFactory +metrics.reporter.dghttp.apikey: xxx +metrics.reporter.dghttp.tags: myflinkapp,prod +metrics.reporter.dghttp.proxyHost: my.web.proxy.com +metrics.reporter.dghttp.proxyPort: 8080 +metrics.reporter.dghttp.dataCenter: US +metrics.reporter.dghttp.maxMetricsPerRequest: 2000 +metrics.reporter.dghttp.interval: 60 SECONDS + +``` + + +### Slf4j +#### (org.apache.flink.metrics.slf4j.Slf4jReporter) + +Example configuration: + +```yaml + +metrics.reporter.slf4j.factory.class: org.apache.flink.metrics.slf4j.Slf4jReporterFactory +metrics.reporter.slf4j.interval: 60 SECONDS + +``` +{{< top >}} diff --git a/docs/content.zh/docs/deployment/overview.md b/docs/content.zh/docs/deployment/overview.md new file mode 100644 index 0000000000000..960db9be85087 --- /dev/null +++ b/docs/content.zh/docs/deployment/overview.md @@ -0,0 +1,310 @@ +--- +title: '概览' +weight: 1 +type: docs +aliases: + - /zh/deployment/ + - /zh/apis/cluster_execution.html +--- + + +# Deployment + +Flink is a versatile framework, supporting many different deployment scenarios in a mix and match fashion. + +Below, we briefly explain the building blocks of a Flink cluster, their purpose and available implementations. +If you just want to start Flink locally, we recommend setting up a [Standalone Cluster]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}). + +## Overview and Reference Architecture + +The figure below shows the building blocks of every Flink cluster. There is always somewhere a client running. It takes the code of the Flink applications, transforms it into a JobGraph and submits it to the JobManager. + +The JobManager distributes the work onto the TaskManagers, where the actual operators (such as sources, transformations and sinks) are running. + +When deploying Flink, there are often multiple options available for each building block. We have listed them in the table below the figure. + + + +{{< img class="img-fluid" width="80%" src="/fig/deployment_overview.svg" alt="Figure for Overview and Reference Architecture" >}} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Component | Purpose | Implementations |
+| :-------- | :------ | :--------------- |
+| Flink Client | Compiles batch or streaming applications into a dataflow graph, which it then submits to the JobManager. | |
+| JobManager | JobManager is the name of the central work coordination component of Flink. It has implementations for different resource providers, which differ on high-availability, resource allocation behavior and supported job submission modes.<br /> JobManager modes for job submissions:<br /> • Application Mode: runs the cluster exclusively for one application. The job's main method (or client) gets executed on the JobManager. Calling `execute`/`executeAsync` multiple times in an application is supported.<br /> • Per-Job Mode: runs the cluster exclusively for one job. The job's main method (or client) runs only prior to the cluster creation.<br /> • Session Mode: one JobManager instance manages multiple jobs sharing the same cluster of TaskManagers. | |
+| TaskManager | TaskManagers are the services actually performing the work of a Flink job. | |
+| External Components (all optional) | | |
+| High Availability Service Provider | Flink's JobManager can be run in high availability mode which allows Flink to recover from JobManager faults. In order to failover faster, multiple standby JobManagers can be started to act as backups. | |
+| File Storage and Persistency | For checkpointing (recovery mechanism for streaming jobs) Flink relies on external file storage systems. | See the FileSystems page. |
+| Resource Provider | Flink can be deployed through different Resource Provider Frameworks, such as Kubernetes, YARN or Mesos. | See JobManager implementations above. |
+| Metrics Storage | Flink components report internal metrics and Flink jobs can report additional, job specific metrics as well. | See the Metrics Reporter page. |
+| Application-level data sources and sinks | While application-level data sources and sinks are not technically part of the deployment of Flink cluster components, they should be considered when planning a new Flink production deployment. Colocating frequently used data with Flink can have significant performance benefits. For example: Apache Kafka, Amazon S3, ElasticSearch, Apache Cassandra. | See the Connectors page. |
    + + + +## Deployment Modes + +Flink can execute applications in one of three ways: +- in Application Mode, +- in a Per-Job Mode, +- in Session Mode. + + The above modes differ in: + - the cluster lifecycle and resource isolation guarantees + - whether the application's `main()` method is executed on the client or on the cluster. + + + +{{< img class="img-fluid" width="80%" style="margin: 15px" src="/fig/deployment_modes.svg" alt="Figure for Deployment Modes" >}} + +#### Application Mode + +In all the other modes, the application's `main()` method is executed on the client side. This process +includes downloading the application's dependencies locally, executing the `main()` to extract a representation +of the application that Flink's runtime can understand (i.e. the `JobGraph`) and ship the dependencies and +the `JobGraph(s)` to the cluster. This makes the Client a heavy resource consumer as it may need substantial +network bandwidth to download dependencies and ship binaries to the cluster, and CPU cycles to execute the +`main()`. This problem can be more pronounced when the Client is shared across users. + +Building on this observation, the *Application Mode* creates a cluster per submitted application, but this time, +the `main()` method of the application is executed on the JobManager. Creating a cluster per application can be +seen as creating a session cluster shared only among the jobs of a particular application, and torn down when +the application finishes. With this architecture, the *Application Mode* provides the same resource isolation +and load balancing guarantees as the *Per-Job* mode, but at the granularity of a whole application. Executing +the `main()` on the JobManager allows for saving the CPU cycles required, but also save the bandwidth required +for downloading the dependencies locally. Furthermore, it allows for more even spread of the network load for +downloading the dependencies of the applications in the cluster, as there is one JobManager per application. + +{{< hint info >}} +In the Application Mode, the `main()` is executed on the cluster and not on the client, +as in the other modes. This may have implications for your code as, for example, any paths you register in +your environment using the `registerCachedFile()` must be accessible by the JobManager of your application. +{{< /hint >}} + +Compared to the *Per-Job* mode, the *Application Mode* allows the submission of applications consisting of +multiple jobs. The order of job execution is not affected by the deployment mode but by the call used +to launch the job. Using `execute()`, which is blocking, establishes an order and it will lead to the +execution of the "next" job being postponed until "this" job finishes. Using `executeAsync()`, which is +non-blocking, will lead to the "next" job starting before "this" job finishes. + +{{< hint warning >}} +The Application Mode allows for multi-`execute()` applications but +High-Availability is not supported in these cases. High-Availability in Application Mode is only +supported for single-`execute()` applications. +{{< /hint >}} + +#### Per-Job Mode + +Aiming at providing better resource isolation guarantees, the *Per-Job* mode uses the available resource provider +framework (e.g. YARN, Kubernetes) to spin up a cluster for each submitted job. This cluster is available to +that job only. When the job finishes, the cluster is torn down and any lingering resources (files, etc) are +cleared up. 
This provides better resource isolation, as a misbehaving job can only bring down its own +TaskManagers. In addition, it spreads the load of book-keeping across multiple JobManagers, as there is +one per job. For these reasons, the *Per-Job* resource allocation model is the preferred mode in many +production settings. + +#### Session Mode + +*Session mode* assumes an already running cluster and uses the resources of that cluster to execute any +submitted application. Applications executed in the same (session) cluster use, and consequently compete +for, the same resources. This has the advantage that you do not pay the resource overhead of spinning up +a full cluster for every submitted job. But, if one of the jobs misbehaves or brings down a TaskManager, +then all jobs running on that TaskManager will be affected by the failure. This, apart from a negative +impact on the job that caused the failure, implies a potential massive recovery process with all the +restarting jobs accessing the filesystem concurrently and making it unavailable to other services. +Additionally, having a single cluster running multiple jobs implies more load for the JobManager, which +is responsible for the book-keeping of all the jobs in the cluster. + + +#### Summary + +In *Session Mode*, the cluster lifecycle is independent of that of any job running on the cluster +and the resources are shared across all jobs. The *Per-Job* mode pays the price of spinning up a cluster +for every submitted job, but this comes with better isolation guarantees as the resources are not shared +across jobs. In this case, the lifecycle of the cluster is bound to that of the job. Finally, the +*Application Mode* creates a session cluster per application and executes the application's `main()` +method on the cluster. + + + +## Vendor Solutions + +A number of vendors offer managed or fully hosted Flink solutions. +None of these vendors are officially supported or endorsed by the Apache Flink PMC. +Please refer to vendor-maintained documentation on how to use these products. 
+ + + +#### AliCloud Realtime Compute + +[Website](https://www.alibabacloud.com/products/realtime-compute) + +Supported Environments: +{{< label AliCloud >}} + +#### Amazon EMR + +[Website](https://aws.amazon.com/emr/) + +Supported Environments: +{{< label AWS >}} + +#### Amazon Kinesis Data Analytics for Apache Flink + +[Website](https://docs.aws.amazon.com/kinesisanalytics/latest/java/what-is.html) + +Supported Environments: +{{< label AWS >}} + +#### Cloudera DataFlow + +[Website](https://www.cloudera.com/products/cdf.html) + +Supported Environment: +{{< label AWS >}} +{{< label Azure >}} +{{< label Google Cloud >}} +{{< label On-Premise >}} + +#### Eventador + +[Website](https://eventador.io) + +Supported Environment: +{{< label AWS >}} + +#### Huawei Cloud Stream Service + +[Website](https://www.huaweicloud.com/en-us/product/cs.html) + +Supported Environment: +{{< label Huawei Cloud >}} + +#### Ververica Platform + +[Website](https://www.ververica.com/platform-overview) + +Supported Environments: +{{< label AliCloud >}} +{{< label AWS >}} +{{< label Azure >}} +{{< label Google Cloud >}} +{{< label On-Premise >}} + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/repls/_index.md b/docs/content.zh/docs/deployment/repls/_index.md new file mode 100644 index 0000000000000..691e1a0f09e60 --- /dev/null +++ b/docs/content.zh/docs/deployment/repls/_index.md @@ -0,0 +1,23 @@ +--- +title: REPLs +bookCollapseSection: true +weight: 9 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/repls/python_shell.md b/docs/content.zh/docs/deployment/repls/python_shell.md new file mode 100644 index 0000000000000..f29e64787c4b9 --- /dev/null +++ b/docs/content.zh/docs/deployment/repls/python_shell.md @@ -0,0 +1,208 @@ +--- +title: "Python REPL" +weight: 7 +type: docs +aliases: + - /zh/deployment/repls/python_shell.html + - /zh/apis/python_shell.html + - /zh/ops/python_shell.html +--- + + +# Python REPL + +Flink附带了一个集成的交互式Python Shell。 +它既能够运行在本地启动的local模式,也能够运行在集群启动的cluster模式下。 +本地安装Flink,请看[本地安装]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})页面。 +您也可以从源码安装Flink,请看[从源码构建 Flink]({{< ref "docs/flinkDev/building" >}})页面。 + +注意 Python Shell会调用“python”命令。关于Python执行环境的要求,请参考Python Table API[环境安装]({{< ref "docs/dev/python/installation" >}})。 + +你可以通过PyPi安装PyFlink,然后使用Python Shell: + +```bash +# 安装 PyFlink +$ python -m pip install apache-flink +# 执行脚本 +$ pyflink-shell.sh local +``` + +关于如何在一个Cluster集群上运行Python shell,可以参考启动章节介绍。 + +## 使用 + +当前Python shell支持Table API的功能。 +在启动之后,Table Environment的相关内容将会被自动加载。 +可以通过变量"bt_env"来使用BatchTableEnvironment,通过变量"st_env"来使用StreamTableEnvironment。 + +### Table API + +下面是一个通过Python Shell 运行的简单示例: +{{< tabs "7207dd60-97bf-461b-b2a5-fcc3dea507c6" >}} +{{< tab "stream" >}} +```python +>>> import tempfile +>>> import os +>>> import shutil +>>> sink_path = tempfile.gettempdir() + '/streaming.csv' +>>> if os.path.exists(sink_path): +... if os.path.isfile(sink_path): +... os.remove(sink_path) +... else: +... shutil.rmtree(sink_path) +>>> s_env.set_parallelism(1) +>>> t = st_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c']) +>>> st_env.connect(FileSystem().path(sink_path))\ +... .with_format(OldCsv() +... .field_delimiter(',') +... .field("a", DataTypes.BIGINT()) +... .field("b", DataTypes.STRING()) +... .field("c", DataTypes.STRING()))\ +... .with_schema(Schema() +... .field("a", DataTypes.BIGINT()) +... .field("b", DataTypes.STRING()) +... .field("c", DataTypes.STRING()))\ +... 
.create_temporary_table("stream_sink") +>>> t.select("a + 1, b, c")\ +... .execute_insert("stream_sink").wait() +>>> # 如果作业运行在local模式, 你可以执行以下代码查看结果: +>>> with open(sink_path, 'r') as f: +... print(f.read()) +``` +{{< /tab >}} +{{< tab "batch" >}} +```python +>>> import tempfile +>>> import os +>>> import shutil +>>> sink_path = tempfile.gettempdir() + '/batch.csv' +>>> if os.path.exists(sink_path): +... if os.path.isfile(sink_path): +... os.remove(sink_path) +... else: +... shutil.rmtree(sink_path) +>>> b_env.set_parallelism(1) +>>> t = bt_env.from_elements([(1, 'hi', 'hello'), (2, 'hi', 'hello')], ['a', 'b', 'c']) +>>> bt_env.connect(FileSystem().path(sink_path))\ +... .with_format(OldCsv() +... .field_delimiter(',') +... .field("a", DataTypes.BIGINT()) +... .field("b", DataTypes.STRING()) +... .field("c", DataTypes.STRING()))\ +... .with_schema(Schema() +... .field("a", DataTypes.BIGINT()) +... .field("b", DataTypes.STRING()) +... .field("c", DataTypes.STRING()))\ +... .create_temporary_table("batch_sink") +>>> t.select("a + 1, b, c")\ +... .execute_insert("batch_sink").wait() +>>> # 如果作业运行在local模式, 你可以执行以下代码查看结果: +>>> with open(sink_path, 'r') as f: +... print(f.read()) +``` +{{< /tab >}} +{{< /tabs >}} + +## 启动 + +查看Python Shell提供的可选参数,可以使用: + +```bash +pyflink-shell.sh --help +``` + +### Local + +Python Shell运行在local模式下,只需要执行: + +```bash +pyflink-shell.sh local +``` + + +### Remote + +Python Shell运行在一个指定的JobManager上,通过关键字`remote`和对应的JobManager +的地址和端口号来进行指定: + +```bash +pyflink-shell.sh remote +``` + +### Yarn Python Shell cluster + +Python Shell可以运行在YARN集群之上。Python shell在Yarn上部署一个新的Flink集群,并进行连接。除了指定container数量,你也 +可以指定JobManager的内存,YARN应用的名字等参数。 +例如,在一个部署了两个TaskManager的Yarn集群上运行Python Shell: + +```bash +pyflink-shell.sh yarn -n 2 +``` + +关于所有可选的参数,可以查看本页面底部的完整说明。 + + +### Yarn Session + +如果你已经通过Flink Yarn Session部署了一个Flink集群,能够通过以下的命令连接到这个集群: + +```bash +pyflink-shell.sh yarn +``` + + +## 完整的参考 + +```bash +Flink Python Shell +使用: pyflink-shell.sh [local|remote|yarn] [options] ... + +命令: local [选项] +启动一个部署在local的Flink Python shell +使用: + -h,--help 查看所有可选的参数 +命令: remote [选项] +启动一个部署在remote集群的Flink Python shell + + JobManager的主机名 + + JobManager的端口号 + +使用: + -h,--help 查看所有可选的参数 + +命令: yarn [选项] +启动一个部署在Yarn集群的Flink Python Shell +使用: + -h,--help 查看所有可选的参数 + -jm,--jobManagerMemory 具有可选单元的JobManager + 的container的内存(默认值:MB) + -n,--container 需要分配的YARN container的 + 数量 (=TaskManager的数量) + -nm,--name 自定义YARN Application的名字 + -qu,--queue 指定YARN的queue + -s,--slots 每个TaskManager上slots的数量 + -tm,--taskManagerMemory 具有可选单元的每个TaskManager + 的container的内存(默认值:MB) +-h | --help + 打印输出使用文档 +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/repls/scala_shell.md b/docs/content.zh/docs/deployment/repls/scala_shell.md new file mode 100644 index 0000000000000..889dea1bbe366 --- /dev/null +++ b/docs/content.zh/docs/deployment/repls/scala_shell.md @@ -0,0 +1,313 @@ +--- +title: "Scala REPL" +weight: 8 +type: docs +aliases: + - /zh/deployment/repls/scala_shell.html + - /zh/apis/scala_shell.html + - /zh/ops/scala_shell.html +--- + + +# Scala REPL + +Flink comes with an integrated interactive Scala Shell. +It can be used in a local setup as well as in a cluster setup. + +To use the shell with an integrated Flink cluster just execute: + +```bash +bin/start-scala-shell.sh local +``` + +in the root directory of your binary Flink directory. To run the Shell on a +cluster, please see the Setup section below. 
+ +{{< hint warning >}} +The Scala REPL currently is currently only supported by the Scala 2.11 Flink distribution. +Please follow [Jira](https://issues.apache.org/jira/browse/FLINK-10911) for the status +of >= 2.12 support. +{{< /hint >}} + +## Usage + +The shell supports DataSet, DataStream, Table API and SQL. +Four different Environments are automatically prebound after startup. +Use "benv" and "senv" to access the Batch and Streaming ExecutionEnvironment respectively. +Use "btenv" and "stenv" to access BatchTableEnvironment and StreamTableEnvironment respectively. + +### DataSet API + +The following example will execute the wordcount program in the Scala shell: + +```scala +Scala-Flink> val text = benv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,") +Scala-Flink> val counts = text + .flatMap { _.toLowerCase.split("\\W+") } + .map { (_, 1) }.groupBy(0).sum(1) +Scala-Flink> counts.print() +``` + +The print() command will automatically send the specified tasks to the JobManager for execution and will show the result of the computation in the terminal. + +It is possible to write results to a file. However, in this case you need to call `execute`, to run your program: + +```scala +Scala-Flink> benv.execute("MyProgram") +``` + +### DataStream API + +Similar to the batch program above, we can execute a streaming program through the DataStream API: + +```scala +Scala-Flink> val textStreaming = senv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,") +Scala-Flink> val countsStreaming = textStreaming + .flatMap { _.toLowerCase.split("\\W+") } + .map { (_, 1) }.keyBy(_._1).sum(1) +Scala-Flink> countsStreaming.print() +Scala-Flink> senv.execute("Streaming Wordcount") +``` + +Note, that in the Streaming case, the print operation does not trigger execution directly. + +The Flink Shell comes with command history and auto-completion. + +### Table API + +The example below is a wordcount program using Table API: +{{< tabs "a5a84572-8c20-46a1-b0bc-7c3347a9ff43" >}} +{{< tab "stream" >}} +```scala +Scala-Flink> import org.apache.flink.table.functions.TableFunction +Scala-Flink> val textSource = stenv.fromDataStream( + senv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,"), + 'text) +Scala-Flink> class $Split extends TableFunction[String] { + def eval(s: String): Unit = { + s.toLowerCase.split("\\W+").foreach(collect) + } + } +Scala-Flink> val split = new $Split +Scala-Flink> textSource.join(split('text) as 'word). + groupBy('word).select('word, 'word.count as 'count). 
+ toRetractStream[(String, Long)].print +Scala-Flink> senv.execute("Table Wordcount") +``` +{{< /tab >}} +{{< tab "batch" >}} +```scala +Scala-Flink> import org.apache.flink.table.functions.TableFunction +Scala-Flink> val textSource = btenv.fromDataSet( + benv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,"), + 'text) +Scala-Flink> class $Split extends TableFunction[String] { + def eval(s: String): Unit = { + s.toLowerCase.split("\\W+").foreach(collect) + } + } +Scala-Flink> val split = new $Split +Scala-Flink> textSource.join(split('text) as 'word). + groupBy('word).select('word, 'word.count as 'count). + toDataSet[(String, Long)].print +``` +{{< /tab >}} +{{< /tabs >}} + +Note, that using $ as a prefix for the class name of TableFunction is a workaround of the issue that scala incorrectly generated inner class name. + +### SQL + +The following example is a wordcount program written in SQL: +{{< tabs "3b210000-4585-497a-8636-c7583d10ff42" >}} +{{< tab "stream" >}} +```scala +Scala-Flink> import org.apache.flink.table.functions.TableFunction +Scala-Flink> val textSource = stenv.fromDataStream( + senv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,"), + 'text) +Scala-Flink> stenv.createTemporaryView("text_source", textSource) +Scala-Flink> class $Split extends TableFunction[String] { + def eval(s: String): Unit = { + s.toLowerCase.split("\\W+").foreach(collect) + } + } +Scala-Flink> stenv.registerFunction("split", new $Split) +Scala-Flink> val result = stenv.sqlQuery("""SELECT T.word, count(T.word) AS `count` + FROM text_source + JOIN LATERAL table(split(text)) AS T(word) + ON TRUE + GROUP BY T.word""") +Scala-Flink> result.toRetractStream[(String, Long)].print +Scala-Flink> senv.execute("SQL Wordcount") +``` +{{< /tab >}} +{{< tab "batch" >}} +```scala +Scala-Flink> import org.apache.flink.table.functions.TableFunction +Scala-Flink> val textSource = btenv.fromDataSet( + benv.fromElements( + "To be, or not to be,--that is the question:--", + "Whether 'tis nobler in the mind to suffer", + "The slings and arrows of outrageous fortune", + "Or to take arms against a sea of troubles,"), + 'text) +Scala-Flink> btenv.createTemporaryView("text_source", textSource) +Scala-Flink> class $Split extends TableFunction[String] { + def eval(s: String): Unit = { + s.toLowerCase.split("\\W+").foreach(collect) + } + } +Scala-Flink> btenv.registerFunction("split", new $Split) +Scala-Flink> val result = btenv.sqlQuery("""SELECT T.word, count(T.word) AS `count` + FROM text_source + JOIN LATERAL table(split(text)) AS T(word) + ON TRUE + GROUP BY T.word""") +Scala-Flink> result.toDataSet[(String, Long)].print +``` +{{< /tab >}} +{{< /tabs >}} + +## Adding external dependencies + +It is possible to add external classpaths to the Scala-shell. These will be sent to the Jobmanager automatically alongside your shell program, when calling execute. + +Use the parameter `-a ` or `--addclasspath ` to load additional classes. 
+ +```bash +bin/start-scala-shell.sh [local | remote | yarn] --addclasspath +``` + + +## Setup + +To get an overview of what options the Scala Shell provides, please use + +```bash +bin/start-scala-shell.sh --help +``` + +### Local + +To use the shell with an integrated Flink cluster just execute: + +```bash +bin/start-scala-shell.sh local +``` + + +### Remote + +To use it with a running cluster start the scala shell with the keyword `remote` +and supply the host and port of the JobManager with: + +```bash +bin/start-scala-shell.sh remote +``` + +### Yarn Scala Shell cluster + +The shell can deploy a Flink cluster to YARN, which is used exclusively by the +shell. +The shell deploys a new Flink cluster on YARN and connects the +cluster. You can also specify options for YARN cluster such as memory for +JobManager, name of YARN application, etc. + +For example, to start a Yarn cluster for the Scala Shell with two TaskManagers +use the following: + +```bash +bin/start-scala-shell.sh yarn -n 2 +``` + +For all other options, see the full reference at the bottom. + + +### Yarn Session + +If you have previously deployed a Flink cluster using the Flink Yarn Session, +the Scala shell can connect with it using the following command: + +```bash +bin/start-scala-shell.sh yarn +``` + + +## Full Reference + +```bash +Flink Scala Shell +Usage: start-scala-shell.sh [local|remote|yarn] [options] ... + +Command: local [options] +Starts Flink scala shell with a local Flink cluster + -a | --addclasspath + Specifies additional jars to be used in Flink +Command: remote [options] +Starts Flink scala shell connecting to a remote cluster + + Remote host name as string + + Remote port as integer + + -a | --addclasspath + Specifies additional jars to be used in Flink +Command: yarn [options] +Starts Flink scala shell connecting to a yarn cluster + -jm arg | --jobManagerMemory arg + Memory for JobManager container with optional unit (default: MB) + -nm | --name + Set a custom name for the application on YARN + -qu | --queue + Specifies YARN queue + -s | --slots + Number of slots per TaskManager + -tm | --taskManagerMemory + Memory per TaskManager container with optional unit (default: MB) + -a | --addclasspath + Specifies additional jars to be used in Flink + --configDir + The configuration directory. + -h | --help + Prints this usage text +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/_index.md b/docs/content.zh/docs/deployment/resource-providers/_index.md new file mode 100644 index 0000000000000..625b33ad57645 --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/_index.md @@ -0,0 +1,23 @@ +--- +title: Resource Providers +bookCollapseSection: true +weight: 2 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/resource-providers/mesos.md b/docs/content.zh/docs/deployment/resource-providers/mesos.md new file mode 100644 index 0000000000000..be2a63c366366 --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/mesos.md @@ -0,0 +1,421 @@ +--- +title: Mesos +weight: 6 +type: docs +aliases: + - /zh/deployment/resource-providers/mesos.html + - /zh/ops/deployment/mesos.html +--- + + +# Flink on Mesos + +{{< hint warning >}} +Apache Mesos support was deprecated in Flink 1.13 and is subject to removal in the future (see +[FLINK-22352](https://issues.apache.org/jira/browse/FLINK-22352) for further details). 
+{{< /hint >}} + +## Getting Started + +This *Getting Started* section guides you through setting up a fully functional Flink Cluster on Mesos. + +### Introduction + +[Apache Mesos](http://mesos.apache.org/) is another resource provider supported by +Apache Flink. Flink utilizes the worker's provided by Mesos to run its TaskManagers. +Apache Flink provides the script `bin/mesos-appmaster.sh` to initiate the Flink +on Mesos cluster. + +### Preparation + +Flink on Mesos expects a Mesos cluster to be around. It also requires the Flink binaries being +deployed. Additionally, Hadoop needs to be installed on the very same machine. + +Flink provides `bin/mesos-appmaster.sh` to initiate a Flink on Mesos cluster. A Mesos application master +will be created (i.e. a JobManager process with Mesos support) which will utilize the Mesos workers to +run Flink's TaskManager processes. + +For `bin/mesos-appmaster.sh` to work, you have to set the two variables `HADOOP_CLASSPATH` and +`MESOS_NATIVE_JAVA_LIBRARY`: + +```bash +$ export HADOOP_CLASSPATH=$(hadoop classpath) +$ export MESOS_NATIVE_JAVA_LIBRARY=/path/to/lib/libmesos.so +``` + +`MESOS_NATIVE_JAVA_LIBRARY` needs to point to Mesos' native Java library. The library name `libmesos.so` +used above refers to Mesos' Linux library. Running Mesos on MacOS would require you to use +`libmesos.dylib` instead. + +### Starting a Flink Session on Mesos + +Connect to the machine which matches all the requirements listed in the [Preparation section](#preparation). +Change into Flink's home directory and call `bin/mesos-appmaster.sh`: + +```bash +# (0) set required environment variables +$ export HADOOP_CLASSPATH=$(hadoop classpath) +$ export MESOS_NATIVE_JAVA_LIBRARY=/path/to/lib/libmesos.so + +# (1) create Flink on Mesos cluster +$ ./bin/mesos-appmaster.sh \ + -Dmesos.master=:5050 \ + -Djobmanager.rpc.address= \ + -Dmesos.resourcemanager.framework.user= \ + -Dmesos.resourcemanager.tasks.cpus=6 + +# (2) execute Flink job passing the relevant configuration parameters +$ ./bin/flink run \ + --detached \ + --target remote \ + -Djobmanager.rpc.address= \ + -Dmesos.resourcemanager.framework.user= \ + -Dmesos.master=:5050 \ + examples/streaming/WindowJoin.jar +``` + +The commands above use a few placeholders that need to be substituted by settings of the actual +underlying cluster: +* `` refers to the Mesos master's IP address or hostname. +* `` refers to the host that executes `bin/mesos-appmaster.sh` which is starting + Flink's JobManager process. It's important to not use `localhost` or `127.0.0.1` as this parameter + is being shared with the Mesos cluster and the TaskManagers. +* `` refers to the user that owns the Mesos master's Flink installation directory (see Mesos' + documentation on [specifying a user](http://mesos.apache.org/documentation/latest/fetcher/#specifying-a-user-name) + for further details). + +The `run` action requires `--target` to be set to `remote`. Refer to the [CLI documentation]({{< ref "docs/deployment/cli" >}}) +for further details on that parameter. + +The Flink on Mesos cluster is now deployed in [Session Mode]({{< ref "docs/deployment/overview" >}}#session-mode). +Note that you can run multiple Flink jobs on a Session cluster. Each job needs to be submitted to the +cluster. TaskManagers are deployed on the Mesos workers as needed. Keep in mind that you can only run as +many jobs as the Mesos cluster allows in terms of resources provided by the Mesos workers. 
Play around +with Flink's parameters to find the right resource utilization for your needs. + +Check out [Flink's Mesos configuration]({{< ref "docs/deployment/config" >}}#mesos) to further influence +the resources Flink on Mesos is going to allocate. + +## Deployment Modes + +For production use, we recommend deploying Flink Applications in the +[Per-Job Mode]({{< ref "docs/deployment/overview" >}}#per-job-mode), as it provides a better isolation +for each job. + +### Application Mode + +Flink on Mesos does not support [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode). + +### Per-Job Cluster Mode + +A job which is executed in [Per-Job Cluster Mode]({{< ref "docs/deployment/overview" >}}#per-job-mode) spins +up a dedicated Flink cluster that is only used for that specific job. No extra job submission is +needed. `bin/mesos-appmaster-job.sh` is used as the startup script. It will start a Flink cluster +for a dedicated job which is passed as a JobGraph file. This file can be created by applying the +following code to your Job source code: +```java +final JobGraph jobGraph = env.getStreamGraph().getJobGraph(); +final String jobGraphFilename = "job.graph"; +File jobGraphFile = new File(jobGraphFilename); +try (FileOutputStream output = new FileOutputStream(jobGraphFile); + ObjectOutputStream obOutput = new ObjectOutputStream(output)){ + obOutput.writeObject(jobGraph); +} +``` + +Flink on Mesos Per-Job cluster can be started in the following way: +```bash +# (0) set required environment variables +$ export HADOOP_CLASSPATH=$(hadoop classpath) +$ export MESOS_NATIVE_JAVA_LIBRARY=/path/to/lib/libmesos.so + +# (1) create Per-Job Flink on Mesos cluster +$ ./bin/mesos-appmaster-job.sh \ + -Dmesos.master=:5050 \ + -Djobmanager.rpc.address= \ + -Dmesos.resourcemanager.framework.user= \ + -Dinternal.jobgraph-path= +``` + +`` refers to the path of the uploaded JobGraph file defining the job that shall be +executed on the Per-Job Flink cluster in the command above. The meaning of ``, +`` and `` are described in the +[Getting Started](#starting-a-flink-session-on-mesos) guide of this page. + +### Session Mode + +The [Getting Started](#starting-a-flink-session-on-mesos) guide at the top of this page describes +deploying Flink in Session Mode. + +## Flink on Mesos Reference + +### Deploying User Libraries + +User libraries can be passed to the Mesos workers by placing them in Flink's `lib/` folder. This way, +they will be picked by Mesos' Fetcher and copied over into the worker's sandbox folders. Alternatively, +Docker containerization can be used as described in [Installing Flink on the Workers](#installing-flink-on-the-workers). + +### Installing Flink on the Workers + +Flink on Mesos offers two ways to distribute the Flink and user binaries within the Mesos cluster: +1. **Using Mesos' Artifact Server**: The Artifact Server provides the resources which are moved by + [Mesos' Fetcher](http://mesos.apache.org/documentation/latest/fetcher/) into the Mesos worker's + [sandbox folders](http://mesos.apache.org/documentation/latest/sandbox/). It can be explicitly + specified by setting [mesos.resourcemanager.tasks.container.type]({{< ref "docs/deployment/config" >}}#mesos-resourcemanager-tasks-container-type) + to `mesos`. This is the default option and is used in the example commands of this page. +2. **Using Docker containerization**: This enables the user to provide user libraries and other + customizations as part of a Docker image. 
Docker utilization can be enabled by setting + [mesos.resourcemanager.tasks.container.type]({{< ref "docs/deployment/config" >}}#mesos-resourcemanager-tasks-container-type) + to `docker` and by providing the image name through [mesos.resourcemanager.tasks.container.image.name]({{< ref "docs/deployment/config" >}}#mesos-resourcemanager-tasks-container-image-name). + +### High Availability on Mesos + +You will need to run a service like Marathon or Apache Aurora which takes care of restarting the +JobManager process in case of node or process failures. In addition, Zookeeper needs to be configured +as described in the [High Availability section of the Flink docs]({{< ref "docs/deployment/ha/overview" >}}). + +#### Marathon + +Marathon needs to be set up to launch the `bin/mesos-appmaster.sh` script. In particular, it should +also adjust any configuration parameters for the Flink cluster. + +Here is an example configuration for Marathon: +```javascript +{ + "id": "flink", + "cmd": "/opt/flink-{{ site.version }}/bin/mesos-appmaster.sh -Djobmanager.rpc.address=$HOST -Dmesos.resourcemanager.framework.user= -Dmesos.master=:5050 -Dparallelism.default=2", + "user": "", + "cpus": 2, + "mem": 2048, + "instances": 1, + "env": { + "MESOS_NATIVE_JAVA_LIBRARY": "/usr/lib/libmesos.so" + }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/", + "port": 8081, + "gracePeriodSeconds": 300, + "intervalSeconds": 60, + "timeoutSeconds": 20, + "maxConsecutiveFailures": 3 + } + ] +} +``` + +Flink is installed into `/opt/flink-{{ site.version }}` having `` as the owner of the Flink +directory (notice that the user is used twice: once as a Marathon and another time as a Mesos +parameter) for the example configuration above to work. Additionally, we have the bundled Hadoop jar +saved in Flink's `lib/` folder for the sake of simplicity here. This way, we don't have to set +`HADOOP_CLASSPATH` as a environment variable next to `MESOS_NATIVE_JAVA_LIBRARY`. + +`` needs to be set to the hostname or IP of Mesos' master node. `$HOST` is a Marathon +environment variable referring to the hostname of the machine the script is executed on. `$HOST` should +not be replaced in the config above! + +The whole Flink cluster including the JobManager will be run as Mesos tasks in the Mesos cluster when +deploying Flink using Marathon. Flink's binaries have to be installed on all Mesos workers for the +above Marathon config to work. + +### Supported Hadoop versions + +Flink on Mesos is compiled against Hadoop 2.4.1, and all Hadoop versions >= 2.4.1 are supported, +including Hadoop 3.x. + +For providing Flink with the required Hadoop dependencies, we recommend setting the `HADOOP_CLASSPATH` +environment variable already introduced in the [Getting Started / Preparation](#preparation) section. + +If that is not possible, the dependencies can also be put into the `lib/` folder of Flink. + +Flink also offers pre-bundled Hadoop fat jars for placing them in the `lib/` folder, on the +[Downloads / Additional Components]({{site.download_url}}#additional-components) section of the website. +These pre-bundled fat jars are shaded to avoid dependency conflicts with common libraries. The Flink +community is not testing the Mesos integration against these pre-bundled jars. + +### Flink on Mesos Architecture + +The Flink on Mesos implementation consists of two components: The application master and the workers. +The workers are simple TaskManagers parameterized by the environment which is set up through the +application master. 
The most sophisticated component of the Flink on Mesos implementation is the +application master. The application master currently hosts the following components: +- **Mesos Scheduler**: The Scheduler is responsible for registering a framework with Mesos, requesting + resources, and launching worker nodes. The Scheduler continuously needs to report back to Mesos to + ensure the framework is in a healthy state. To verify the health of the cluster, the Scheduler + monitors the spawned workers, marks them as failed and restarts them if necessary. + + Flink's Mesos Scheduler itself is currently not highly available. However, it persists all necessary + information about its state (e.g. configuration, list of workers) in [ZooKeeper](#high-availability-on-mesos). + In the presence of a failure, it relies on an external system to bring up a new Scheduler (see the + [Marathon subsection](#marathon) for further details). The Scheduler will then register with Mesos + again and go through the reconciliation phase. In the reconciliation phase, the Scheduler receives + a list of running workers nodes. It matches these against the recovered information from ZooKeeper + and makes sure to bring back the cluster in the state before the failure. +- **Artifact Server**: The Artifact Server is responsible for providing resources to the worker nodes. + The resources can be anything from the Flink binaries to shared secrets or configuration files. + For instance, in non-containerized environments, the Artifact Server will provide the Flink binaries. + What files will be served depends on the configuration overlay used. + +Flink's Mesos startup scripts `bin/mesos-appmaster.sh` and `bin/mesos-appmaster-job.sh` provide a way +to configure and start the application master. The worker nodes inherit all further configuration. +They are deployed through `bin/mesos-taskmanager.sh`. The configuration inheritance is achieved using +configuration overlays. Configuration overlays provide a way to infer a configuration from environment +variables and config files which are shipped to the worker nodes. + +See [Mesos Architecture](http://mesos.apache.org/documentation/latest/architecture/) for a more details +on how frameworks are handled by Mesos. + +{{< top >}} + +## Appendix +The following resource files can be used to set up a local Mesos cluster running the Marathon framework +and having Flink 1.11.2 installed. 
+ +### Dockerfile + +```yaml +FROM mesosphere/mesos:1.7.1 + +# install Java 11 and wget +RUN apt update && \ + apt -y install wget && \ + wget -nv https://download.java.net/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz && \ + tar xzf openjdk-11.0.2_linux-x64_bin.tar.gz && \ + mv jdk-11* /usr/local/jdk-11.0.2 && \ + update-alternatives --install /usr/bin/java java /usr/local/jdk-11.0.2/bin/java 2048 && \ + update-alternatives --auto java +ENV JAVA_HOME=/usr/local/jdk-11.0.2 + +WORKDIR /opt + +# install Hadoop +RUN wget -nv https://apache.mirror.digionline.de/hadoop/common/hadoop-2.10.1/hadoop-2.10.1.tar.gz && \ + tar -xf hadoop-2.10.1.tar.gz +ENV HADOOP_CLASSPATH=/opt/hadoop-2.10.1/etc/hadoop:/opt/hadoop-2.10.1/share/hadoop/common/lib/*:/opt/hadoop-2.10.1/share/hadoop/common/*:/opt/hadoop-2.10.1/share/hadoop/hdfs:/opt/hadoop-2.10.1/share/hadoop/hdfs/lib/*:/opt/hadoop-2.10.1/share/hadoop/hdfs/*:/opt/hadoop-2.10.1/share/hadoop/yarn:/opt/hadoop-2.10.1/share/hadoop/yarn/lib/*:/opt/hadoop-2.10.1/share/hadoop/yarn/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/lib/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/*:/contrib/capacity-scheduler/*.jar + +# install Flink on Mesos +RUN wget -nv https://apache.mirror.digionline.de/flink/flink-1.11.2/flink-1.11.2-bin-scala_2.11.tgz && \ + tar -xf flink-1.11.2-bin-scala_2.11.tgz +ENV MESOS_NATIVE_JAVA_LIBRARY=/usr/lib/libmesos.so +``` + +### Docker Compose + +The `docker-compose.yml` provided below is based on the work done by +[Sean Bennet](https://github.com/sean-bennett112/mesos-docker/blob/master/fig.yml). + +Keep in mind that it requires the `Dockerfile` of the previous section to be found in the same +directory and the file being named `Dockerfile`. It might make sense to scale the worker nodes up to +have enough workers to run Flink on Mesos next +to the Marathon framework: +```bash +docker-compose up -d --scale worker=2 +``` + +```yaml +version: "3.8" +services: + zookeeper: + build: + context: . + dockerfile: Dockerfile + command: /usr/share/zookeeper/bin/zkServer.sh start-foreground + container_name: zookeeper + master: + build: + context: . + dockerfile: Dockerfile + command: mesos-master --registry=in_memory + container_name: master + environment: + - MESOS_ZK=zk://zookeeper:2181/mesos + - MESOS_LOG_DIR=/var/log/mesos + - MESOS_QUORUM=1 + - MESOS_WORK_DIR=/var/lib/mesos + depends_on: + - zookeeper + ports: + - "5050:5050" + - "8081:8081" + worker: + build: + context: . + dockerfile: Dockerfile + command: mesos-slave --launcher=posix + environment: + - MESOS_MASTER=zk://zookeeper:2181/mesos + - MESOS_WORK_DIR=/var/lib/mesos + - MESOS_LOG_DIR=/var/log/mesos + - MESOS_LOGGING_LEVEL=INFO + - MESOS_SYSTEMD_ENABLE_SUPPORT=false + depends_on: + - zookeeper + - master + ports: + - "8081" + marathon: + image: mesosphere/marathon:v1.11.24 + container_name: marathon + environment: + - MARATHON_MASTER=zk://zookeeper:2181/mesos + - MARATHON_ZK=zk://zookeeper:2181/marathon + - MARATHON_ZK_CONNECTION_TIMEOUT=60000 + ports: + - "8080:8080" + depends_on: + - zookeeper + - master +``` + +### Marathon configuration + +The following Marathon configuration can be applied through the Marathon UI: http://localhost:8080/ +It will start a Flink on Mesos cluster on any of the worker machines. Flink's default port `8081` is +forwarded to random ports due to the scaling of the worker nodes. Use `docker ps` to figure out the +host system's ports that to be able to access Flink's web interface. 
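+
+For example, the mapped ports can be looked up roughly as follows (a quick sketch; the worker
+container names depend on your Docker Compose project and are only illustrative):
+
+```bash
+# list all containers together with their published ports
+$ docker ps --format "table {{.Names}}\t{{.Ports}}"
+
+# or ask Docker which host port a given worker container maps to port 8081
+# (hypothetical container name; check the `docker ps` output for the real one)
+$ docker port flink_worker_1 8081
+```
+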
+ +```javascript +{ + "id": "flink", + "cmd": "/opt/flink-1.11.2/bin/mesos-appmaster.sh -Dmesos.resourcemanager.framework.user=root -Dmesos.master=master:5050 -Djobmanager.rpc.address=$HOST -Dparallelism.default=2", + "cpus": 2, + "mem": 4096, + "disk": 0, + "instances": 1, + "env": { + "HADOOP_CLASSPATH": "/opt/hadoop-2.10.1/etc/hadoop:/opt/hadoop-2.10.1/share/hadoop/common/lib/*:/opt/hadoop-2.10.1/share/hadoop/common/*:/opt/hadoop-2.10.1/share/hadoop/hdfs:/opt/hadoop-2.10.1/share/hadoop/hdfs/lib/*:/opt/hadoop-2.10.1/share/hadoop/hdfs/*:/opt/hadoop-2.10.1/share/hadoop/yarn:/opt/hadoop-2.10.1/share/hadoop/yarn/lib/*:/opt/hadoop-2.10.1/share/hadoop/yarn/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/lib/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/*:/opt/hadoop-2.10.1/etc/hadoop:/opt/hadoop-2.10.1/share/hadoop/common/lib/*:/opt/hadoop-2.10.1/share/hadoop/common/*:/opt/hadoop-2.10.1/share/hadoop/hdfs:/opt/hadoop-2.10.1/share/hadoop/hdfs/lib/*:/opt/hadoop-2.10.1/share/hadoop/hdfs/*:/opt/hadoop-2.10.1/share/hadoop/yarn:/opt/hadoop-2.10.1/share/hadoop/yarn/lib/*:/opt/hadoop-2.10.1/share/hadoop/yarn/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/lib/*:/opt/hadoop-2.10.1/share/hadoop/mapreduce/*:/contrib/capacity-scheduler/*.jar:/contrib/capacity-scheduler/*.jar" + }, + "healthChecks": [ + { + "protocol": "HTTP", + "path": "/", + "port": 8081, + "gracePeriodSeconds": 300, + "intervalSeconds": 60, + "timeoutSeconds": 20, + "maxConsecutiveFailures": 3 + } + ], + "user": "root" +} +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/native_kubernetes.md b/docs/content.zh/docs/deployment/resource-providers/native_kubernetes.md new file mode 100644 index 0000000000000..526198e32651e --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/native_kubernetes.md @@ -0,0 +1,574 @@ +--- +title: Native Kubernetes +weight: 3 +type: docs +aliases: + - /zh/deployment/resource-providers/native_kubernetes.html + - /zh/ops/deployment/native_kubernetes.html +--- + + +# Native Kubernetes + +This page describes how to deploy Flink natively on [Kubernetes](https://kubernetes.io). + +## Getting Started + +This *Getting Started* section guides you through setting up a fully functional Flink Cluster on Kubernetes. + +### Introduction + +Kubernetes is a popular container-orchestration system for automating computer application deployment, scaling, and management. +Flink's native Kubernetes integration allows you to directly deploy Flink on a running Kubernetes cluster. +Moreover, Flink is able to dynamically allocate and de-allocate TaskManagers depending on the required resources because it can directly talk to Kubernetes. + +### Preparation + +The *Getting Started* section assumes a running Kubernetes cluster fulfilling the following requirements: + +- Kubernetes >= 1.9. +- KubeConfig, which has access to list, create, delete pods and services, configurable via `~/.kube/config`. You can verify permissions by running `kubectl auth can-i pods`. +- Enabled Kubernetes DNS. +- `default` service account with [RBAC](#rbac) permissions to create, delete pods. + +If you have problems setting up a Kubernetes cluster, then take a look at [how to setup a Kubernetes cluster](https://kubernetes.io/docs/setup/). 
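+
+The permissions listed above can be checked individually with `kubectl auth can-i`, for example
+(a quick sketch; add `-n <your-namespace>` if you do not deploy into the default namespace):
+
+```bash
+$ kubectl auth can-i list pods
+$ kubectl auth can-i create pods
+$ kubectl auth can-i delete pods
+$ kubectl auth can-i create services
+$ kubectl auth can-i delete services
+```
+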
+ +### Starting a Flink Session on Kubernetes + +Once you have your Kubernetes cluster running and `kubectl` is configured to point to it, you can launch a Flink cluster in [Session Mode]({{< ref "docs/deployment/overview" >}}#session-mode) via + +```bash +# (1) Start Kubernetes session +$ ./bin/kubernetes-session.sh -Dkubernetes.cluster-id=my-first-flink-cluster + +# (2) Submit example job +$ ./bin/flink run \ + --target kubernetes-session \ + -Dkubernetes.cluster-id=my-first-flink-cluster \ + ./examples/streaming/TopSpeedWindowing.jar + +# (3) Stop Kubernetes session by deleting cluster deployment +$ kubectl delete deployment/my-first-flink-cluster + +``` + +{{< hint info >}} +When using [Minikube](https://minikube.sigs.k8s.io/docs/), you need to call `minikube tunnel` in order to [expose Flink's LoadBalancer service on Minikube](https://minikube.sigs.k8s.io/docs/handbook/accessing/#using-minikube-tunnel). +{{< /hint >}} + +Congratulations! You have successfully run a Flink application by deploying Flink on Kubernetes. + +{{< top >}} + +## Deployment Modes + +For production use, we recommend deploying Flink Applications in the [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode), as these modes provide a better isolation for the Applications. + +### Application Mode + +The [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode) requires that the user code is bundled together with the Flink image because it runs the user code's `main()` method on the cluster. +The Application Mode makes sure that all Flink components are properly cleaned up after the termination of the application. + +The Flink community provides a [base Docker image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#docker-hub-flink-images) which can be used to bundle the user code: + +```dockerfile +FROM flink +RUN mkdir -p $FLINK_HOME/usrlib +COPY /path/of/my-flink-job.jar $FLINK_HOME/usrlib/my-flink-job.jar +``` + +After creating and publishing the Docker image under `custom-image-name`, you can start an Application cluster with the following command: + +```bash +$ ./bin/flink run-application \ + --target kubernetes-application \ + -Dkubernetes.cluster-id=my-first-application-cluster \ + -Dkubernetes.container.image=custom-image-name \ + local:///opt/flink/usrlib/my-flink-job.jar +``` + +Note `local` is the only supported scheme in Application Mode. + +The `kubernetes.cluster-id` option specifies the cluster name and must be unique. +If you do not specify this option, then Flink will generate a random name. + +The `kubernetes.container.image` option specifies the image to start the pods with. + +Once the application cluster is deployed you can interact with it: + +```bash +# List running job on the cluster +$ ./bin/flink list --target kubernetes-application -Dkubernetes.cluster-id=my-first-application-cluster +# Cancel running job +$ ./bin/flink cancel --target kubernetes-application -Dkubernetes.cluster-id=my-first-application-cluster +``` + +You can override configurations set in `conf/flink-conf.yaml` by passing key-value pairs `-Dkey=value` to `bin/flink`. + +### Per-Job Cluster Mode + +Flink on Kubernetes does not support Per-Job Cluster Mode. + +### Session Mode + +You have seen the deployment of a Session cluster in the [Getting Started](#getting-started) guide at the top of this page. 
+ +The Session Mode can be executed in two modes: + +* **detached mode** (default): The `kubernetes-session.sh` deploys the Flink cluster on Kubernetes and then terminates. + +* **attached mode** (`-Dexecution.attached=true`): The `kubernetes-session.sh` stays alive and allows entering commands to control the running Flink cluster. + For example, `stop` stops the running Session cluster. + Type `help` to list all supported commands. + +In order to re-attach to a running Session cluster with the cluster id `my-first-flink-cluster` use the following command: + +```bash +$ ./bin/kubernetes-session.sh \ + -Dkubernetes.cluster-id=my-first-flink-cluster \ + -Dexecution.attached=true +``` + +You can override configurations set in `conf/flink-conf.yaml` by passing key-value pairs `-Dkey=value` to `bin/kubernetes-session.sh`. + +#### Stop a Running Session Cluster + +In order to stop a running Session Cluster with cluster id `my-first-flink-cluster` you can either [delete the Flink deployment](#manual-resource-cleanup) or use: + +```bash +$ echo 'stop' | ./bin/kubernetes-session.sh \ + -Dkubernetes.cluster-id=my-first-flink-cluster \ + -Dexecution.attached=true +``` + +{{< top >}} + +## Flink on Kubernetes Reference + +### Configuring Flink on Kubernetes + +The Kubernetes-specific configuration options are listed on the [configuration page]({{< ref "docs/deployment/config" >}}#kubernetes). + +Flink uses [Fabric8 Kubernetes client](https://github.com/fabric8io/kubernetes-client) to communicate with Kubernetes APIServer to create/delete Kubernetes resources(e.g. Deployment, Pod, ConfigMap, Service, etc.), as well as watch the Pods and ConfigMaps. +Except for the above Flink config options, some [expert options](https://github.com/fabric8io/kubernetes-client#configuring-the-client) of Fabric8 Kubernetes client could be configured via system properties or environment variables. + +For example, users could use the following Flink config options to set the concurrent max requests, which allows running more jobs in a session cluster when [Kubernetes HA Services]({{< ref "docs/deployment/ha/kubernetes_ha" >}}) are used. +Please note that, each Flink job will consume `3` concurrent requests. + +```yaml +containerized.master.env.KUBERNETES_MAX_CONCURRENT_REQUESTS: 200 +env.java.opts.jobmanager: "-Dkubernetes.max.concurrent.requests=200" +``` + +### Accessing Flink's Web UI + +Flink's Web UI and REST endpoint can be exposed in several ways via the [kubernetes.rest-service.exposed.type]({{< ref "docs/deployment/config" >}}#kubernetes-rest-service-exposed-type) configuration option. + +- **ClusterIP**: Exposes the service on a cluster-internal IP. + The Service is only reachable within the cluster. + If you want to access the JobManager UI or submit job to the existing session, you need to start a local proxy. + You can then use `localhost:8081` to submit a Flink job to the session or view the dashboard. + +```bash +$ kubectl port-forward service/ 8081 +``` + +- **NodePort**: Exposes the service on each Node’s IP at a static port (the `NodePort`). + `:` can be used to contact the JobManager service. + `NodeIP` can also be replaced with the Kubernetes ApiServer address. + You can find its address in your kube config file. + +- **LoadBalancer**: Exposes the service externally using a cloud provider’s load balancer. + Since the cloud provider and Kubernetes needs some time to prepare the load balancer, you may get a `NodePort` JobManager Web Interface in the client log. 
+ You can use `kubectl get services/-rest` to get EXTERNAL-IP and construct the load balancer JobManager Web Interface manually `http://:8081`. + +Please refer to the official documentation on [publishing services in Kubernetes](https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types) for more information. + +{{< hint warning >}} +Depending on your environment, starting a Flink cluster with `LoadBalancer` REST service exposed type might make the cluster accessible publicly (usually with the ability to execute arbitrary code). +{{< /hint >}} + +### Logging + +The Kubernetes integration exposes `conf/log4j-console.properties` and `conf/logback-console.xml` as a ConfigMap to the pods. +Changes to these files will be visible to a newly started cluster. + +#### Accessing the Logs + +By default, the JobManager and TaskManager will output the logs to the console and `/opt/flink/log` in each pod simultaneously. +The `STDOUT` and `STDERR` output will only be redirected to the console. +You can access them via + +```bash +$ kubectl logs +``` + +If the pod is running, you can also use `kubectl exec -it bash` to tunnel in and view the logs or debug the process. + +#### Accessing the Logs of the TaskManagers + +Flink will automatically de-allocate idling TaskManagers in order to not waste resources. +This behaviour can make it harder to access the logs of the respective pods. +You can increase the time before idling TaskManagers are released by configuring [resourcemanager.taskmanager-timeout]({{< ref "docs/deployment/config" >}}#resourcemanager-taskmanager-timeout) so that you have more time to inspect the log files. + +#### Changing the Log Level Dynamically + +If you have configured your logger to [detect configuration changes automatically]({{< ref "docs/deployment/advanced/logging" >}}), then you can dynamically adapt the log level by changing the respective ConfigMap (assuming that the cluster id is `my-first-flink-cluster`): + +```bash +$ kubectl edit cm flink-config-my-first-flink-cluster +``` + +### Using Plugins + +In order to use [plugins]({{< ref "docs/deployment/filesystems/plugins" >}}), you must copy them to the correct location in the Flink JobManager/TaskManager pod. +You can use the [built-in plugins]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#using-plugins) without mounting a volume or building a custom Docker image. +For example, use the following command to enable the S3 plugin for your Flink session cluster. + +```bash +$ ./bin/kubernetes-session.sh + -Dcontainerized.master.env.ENABLE_BUILT_IN_PLUGINS=flink-s3-fs-hadoop-{{< version >}}.jar \ + -Dcontainerized.taskmanager.env.ENABLE_BUILT_IN_PLUGINS=flink-s3-fs-hadoop-{{< version >}}.jar +``` + +### Custom Docker Image + +If you want to use a custom Docker image, then you can specify it via the configuration option `kubernetes.container.image`. +The Flink community provides a rich [Flink Docker image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}) which can be a good starting point. +See [how to customize Flink's Docker image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#customize-flink-image) for how to enable plugins, add dependencies and other options. + +### Using Secrets + +[Kubernetes Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) is an object that contains a small amount of sensitive data such as a password, a token, or a key. 
+Such information might otherwise be put in a pod specification or in an image. +Flink on Kubernetes can use Secrets in two ways: + +* Using Secrets as files from a pod; + +* Using Secrets as environment variables; + +#### Using Secrets as Files From a Pod + +The following command will mount the secret `mysecret` under the path `/path/to/secret` in the started pods: + +```bash +$ ./bin/kubernetes-session.sh -Dkubernetes.secrets=mysecret:/path/to/secret +``` + +The username and password of the secret `mysecret` can then be found stored in the files `/path/to/secret/username` and `/path/to/secret/password`. +For more details see the [official Kubernetes documentation](https://kubernetes.io/docs/concepts/configuration/secret/#using-secrets-as-files-from-a-pod). + +#### Using Secrets as Environment Variables + +The following command will expose the secret `mysecret` as environment variable in the started pods: + +```bash +$ ./bin/kubernetes-session.sh -Dkubernetes.env.secretKeyRef=\ + env:SECRET_USERNAME,secret:mysecret,key:username;\ + env:SECRET_PASSWORD,secret:mysecret,key:password +``` + +The env variable `SECRET_USERNAME` contains the username and the env variable `SECRET_PASSWORD` contains the password of the secret `mysecret`. +For more details see the [official Kubernetes documentation](https://kubernetes.io/docs/concepts/configuration/secret/#using-secrets-as-environment-variables). + +### High-Availability on Kubernetes + +For high availability on Kubernetes, you can use the [existing high availability services]({{< ref "docs/deployment/ha/overview" >}}). + +### Manual Resource Cleanup + +Flink uses [Kubernetes OwnerReference's](https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/) to clean up all cluster components. +All the Flink created resources, including `ConfigMap`, `Service`, and `Pod`, have the `OwnerReference` being set to `deployment/`. +When the deployment is deleted, all related resources will be deleted automatically. + +```bash +$ kubectl delete deployment/ +``` + +### Supported Kubernetes Versions + +Currently, all Kubernetes versions `>= 1.9` are supported. + +### Namespaces + +[Namespaces in Kubernetes](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/) divide cluster resources between multiple users via [resource quotas](https://kubernetes.io/docs/concepts/policy/resource-quotas/). +Flink on Kubernetes can use namespaces to launch Flink clusters. +The namespace can be configured via [kubernetes.namespace]({{< ref "docs/deployment/config" >}}#kubernetes-namespace). + +### RBAC + +Role-based access control ([RBAC](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)) is a method of regulating access to compute or network resources based on the roles of individual users within an enterprise. +Users can configure RBAC roles and service accounts used by JobManager to access the Kubernetes API server within the Kubernetes cluster. + +Every namespace has a default service account. However, the `default` service account may not have the permission to create or delete pods within the Kubernetes cluster. +Users may need to update the permission of the `default` service account or specify another service account that has the right role bound. 
+ +```bash +$ kubectl create clusterrolebinding flink-role-binding-default --clusterrole=edit --serviceaccount=default:default +``` + +If you do not want to use the `default` service account, use the following command to create a new `flink-service-account` service account and set the role binding. +Then use the config option `-Dkubernetes.service-account=flink-service-account` to make the JobManager pod use the `flink-service-account` service account to create/delete TaskManager pods and leader ConfigMaps. +Also this will allow the TaskManager to watch leader ConfigMaps to retrieve the address of JobManager and ResourceManager. + +```bash +$ kubectl create serviceaccount flink-service-account +$ kubectl create clusterrolebinding flink-role-binding-flink --clusterrole=edit --serviceaccount=default:flink-service-account +``` + +Please refer to the official Kubernetes documentation on [RBAC Authorization](https://kubernetes.io/docs/reference/access-authn-authz/rbac/) for more information. + +### Pod Template + +Flink allows users to define the JobManager and TaskManager pods via template files. This allows to support advanced features +that are not supported by Flink [Kubernetes config options]({{< ref "docs/deployment/config" >}}#kubernetes) directly. +Use [`kubernetes.pod-template-file`]({{< ref "docs/deployment/config" >}}#kubernetes-pod-template-file) +to specify a local file that contains the pod definition. It will be used to initialize the JobManager and TaskManager. +The main container should be defined with name `flink-main-container`. +Please refer to the [pod template example](#example-of-pod-template) for more information. + +#### Fields Overwritten by Flink + +Some fields of the pod template will be overwritten by Flink. +The mechanism for resolving effective field values can be categorized as follows: +* **Defined by Flink:** User cannot configure it. +* **Defined by the user:** User can freely specify this value. Flink framework won't set any additional values and the effective value derives from the config option and the template. + + Precedence order: First an explicit config option value is taken, then the value in pod template and at last the default value of a config option if nothing is specified. +* **Merged with Flink:** Flink will merge values for a setting with a user defined value (see precedence order for "Defined by the user"). Flink values have precedence in case of same name fields. + +Refer to the following tables for the full list of pod fields that will be overwritten. +All the fields defined in the pod template that are not listed in the tables will be unaffected. + +**Pod Metadata** + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Key | Category | Related Config Options | Description |
+| :-- | :------- | :--------------------- | :---------- |
+| name | Defined by Flink | - | The JobManager pod name will be overwritten with the deployment which is defined by [kubernetes.cluster-id]({{< ref "docs/deployment/config" >}}#kubernetes-cluster-id). The TaskManager pod names will be overwritten with the pattern `<clusterID>-<attempt>-<index>` which is generated by the Flink ResourceManager. |
+| namespace | Defined by the user | [kubernetes.namespace]({{< ref "docs/deployment/config" >}}#kubernetes-namespace) | Both the JobManager deployment and the TaskManager pods will be created in the user-specified namespace. |
+| ownerReferences | Defined by Flink | - | The owner reference of the JobManager and TaskManager pods will always be set to the JobManager deployment. Please use [kubernetes.jobmanager.owner.reference]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-owner-reference) to control when the deployment is deleted. |
+| annotations | Defined by the user | [kubernetes.jobmanager.annotations]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-annotations), [kubernetes.taskmanager.annotations]({{< ref "docs/deployment/config" >}}#kubernetes-taskmanager-annotations) | Flink will add additional annotations specified by the Flink configuration options. |
+| labels | Merged with Flink | [kubernetes.jobmanager.labels]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-labels), [kubernetes.taskmanager.labels]({{< ref "docs/deployment/config" >}}#kubernetes-taskmanager-labels) | Flink will add some internal labels to the user-defined values. |
+
+**Pod Spec**
+
+| Key | Category | Related Config Options | Description |
+| :-- | :------- | :--------------------- | :---------- |
+| imagePullSecrets | Defined by the user | [kubernetes.container.image.pull-secrets]({{< ref "docs/deployment/config" >}}#kubernetes-container-image-pull-secrets) | Flink will add additional pull secrets specified by the Flink configuration options. |
+| nodeSelector | Defined by the user | [kubernetes.jobmanager.node-selector]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-node-selector), [kubernetes.taskmanager.node-selector]({{< ref "docs/deployment/config" >}}#kubernetes-taskmanager-node-selector) | Flink will add additional node selectors specified by the Flink configuration options. |
+| tolerations | Defined by the user | [kubernetes.jobmanager.tolerations]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-tolerations), [kubernetes.taskmanager.tolerations]({{< ref "docs/deployment/config" >}}#kubernetes-taskmanager-tolerations) | Flink will add additional tolerations specified by the Flink configuration options. |
+| restartPolicy | Defined by Flink | - | "always" for the JobManager pod and "never" for the TaskManager pods. The JobManager pod will always be restarted by the deployment, while the TaskManager pods should not be restarted. |
+| serviceAccount | Defined by the user | [kubernetes.service-account]({{< ref "docs/deployment/config" >}}#kubernetes-service-account) | The JobManager and TaskManager pods will be created with the user-defined service account. |
+| volumes | Merged with Flink | - | Flink will add some internal ConfigMap volumes (e.g. flink-config-volume, hadoop-config-volume) which are necessary for shipping the Flink configuration and the Hadoop configuration. |
+
+**Main Container Spec**
+
+| Key | Category | Related Config Options | Description |
+| :-- | :------- | :--------------------- | :---------- |
+| env | Merged with Flink | [containerized.master.env.{ENV_NAME}]({{< ref "docs/deployment/config" >}}#forwarding-environment-variables), [containerized.taskmanager.env.{ENV_NAME}]({{< ref "docs/deployment/config" >}}#forwarding-environment-variables) | Flink will add some internal environment variables to the user defined values. |
+| image | Defined by the user | [kubernetes.container.image]({{< ref "docs/deployment/config" >}}#kubernetes-container-image) | The container image will be resolved with respect to the defined precedence order for user defined values. |
+| imagePullPolicy | Defined by the user | [kubernetes.container.image.pull-policy]({{< ref "docs/deployment/config" >}}#kubernetes-container-image-pull-policy) | The container image pull policy will be resolved with respect to the defined precedence order for user defined values. |
+| name | Defined by Flink | - | The container name will be overwritten by Flink with "flink-main-container". |
+| resources | Defined by the user | Memory: [jobmanager.memory.process.size]({{< ref "docs/deployment/config" >}}#jobmanager-memory-process-size), [taskmanager.memory.process.size]({{< ref "docs/deployment/config" >}}#taskmanager-memory-process-size); CPU: [kubernetes.jobmanager.cpu]({{< ref "docs/deployment/config" >}}#kubernetes-jobmanager-cpu), [kubernetes.taskmanager.cpu]({{< ref "docs/deployment/config" >}}#kubernetes-taskmanager-cpu) | The memory and cpu resources (including requests and limits) will be overwritten by the Flink configuration options. All other resources (e.g. ephemeral-storage) will be retained. |
+| containerPorts | Merged with Flink | - | Flink will add some internal container ports (e.g. rest, jobmanager-rpc, blob, taskmanager-rpc). |
+| volumeMounts | Merged with Flink | - | Flink will add some internal volume mounts (e.g. flink-config-volume, hadoop-config-volume) which are necessary for shipping the Flink configuration and the Hadoop configuration. |
+
    + +#### Example of Pod Template +`pod-template.yaml` +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: jobmanager-pod-template +spec: + initContainers: + - name: artifacts-fetcher + image: artifacts-fetcher:latest + # Use wget or other tools to get user jars from remote storage + command: [ 'wget', 'https://path/of/StateMachineExample.jar', '-O', '/flink-artifact/myjob.jar' ] + volumeMounts: + - mountPath: /flink-artifact + name: flink-artifact + containers: + # Do not change the main container name + - name: flink-main-container + resources: + requests: + ephemeral-storage: 2048Mi + limits: + ephemeral-storage: 2048Mi + volumeMounts: + - mountPath: /opt/flink/volumes/hostpath + name: flink-volume-hostpath + - mountPath: /opt/flink/artifacts + name: flink-artifact + - mountPath: /opt/flink/log + name: flink-logs + # Use sidecar container to push logs to remote storage or do some other debugging things + - name: sidecar-log-collector + image: sidecar-log-collector:latest + command: [ 'command-to-upload', '/remote/path/of/flink-logs/' ] + volumeMounts: + - mountPath: /flink-logs + name: flink-logs + volumes: + - name: flink-volume-hostpath + hostPath: + path: /tmp + type: Directory + - name: flink-artifact + emptyDir: { } + - name: flink-logs + emptyDir: { } +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/standalone/_index.md b/docs/content.zh/docs/deployment/resource-providers/standalone/_index.md new file mode 100644 index 0000000000000..32605a375e1b1 --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/standalone/_index.md @@ -0,0 +1,23 @@ +--- +title: Standalone +bookCollapseSection: true +weight: 2 +--- + diff --git a/docs/content.zh/docs/deployment/resource-providers/standalone/docker.md b/docs/content.zh/docs/deployment/resource-providers/standalone/docker.md new file mode 100644 index 0000000000000..82fc95b135cb2 --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/standalone/docker.md @@ -0,0 +1,640 @@ +--- +title: Docker 设置 +weight: 4 +type: docs +aliases: + - /zh/deployment/resource-providers/standalone/docker.html + - /zh/ops/deployment/docker.html +--- + + +# Docker Setup + +## Getting Started + +This *Getting Started* section guides you through the local setup (on one machine, but in separate containers) of a Flink cluster using Docker containers. + +### Introduction + +[Docker](https://www.docker.com) is a popular container runtime. +There are official Docker images for Apache Flink available [on Docker Hub](https://hub.docker.com/_/flink). +You can use the Docker images to deploy a *Session* or *Application cluster* on Docker. This page focuses on the setup of Flink on Docker, Docker Swarm and Docker Compose. + +Deployment into managed containerized environments, such as [standalone Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}) or [native Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}), are described on separate pages. + + +### Starting a Session Cluster on Docker + +A *Flink Session cluster* can be used to run multiple jobs. Each job needs to be submitted to the cluster after the cluster has been deployed. +To deploy a *Flink Session cluster* with Docker, you need to start a JobManager container. 
To enable communication between the containers, we first set a required Flink configuration property and create a network: + +```sh +$ FLINK_PROPERTIES="jobmanager.rpc.address: jobmanager" +$ docker network create flink-network +``` + +Then we launch the JobManager: + +```sh +$ docker run \ + --rm \ + --name=jobmanager \ + --network flink-network \ + --publish 8081:8081 \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} jobmanager +``` + +and one or more TaskManager containers: + +```sh +$ docker run \ + --rm \ + --name=taskmanager \ + --network flink-network \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} taskmanager +``` + +The web interface is now available at [localhost:8081](http://localhost:8081). + + +Submission of a job is now possible like this (assuming you have a local distribution of Flink available): + +```sh +$ ./bin/flink run ./examples/streaming/TopSpeedWindowing.jar +``` + +To shut down the cluster, either terminate (e.g. with `CTRL-C`) the JobManager and TaskManager processes, or use `docker ps` to identify and `docker stop` to terminate the containers. + +## Deployment Modes + +The Flink image contains a regular Flink distribution with its default configuration and a standard entry point script. +You can run its entry point in the following modes: +* [JobManager]({{< ref "docs/concepts/glossary" >}}#flink-jobmanager) for [a Session cluster](#starting-a-session-cluster-on-docker) +* [JobManager]({{< ref "docs/concepts/glossary" >}}#flink-jobmanager) for [a Application cluster](#application-mode-on-docker) +* [TaskManager]({{< ref "docs/concepts/glossary" >}}#flink-taskmanager) for any cluster + +This allows you to deploy a standalone cluster (Session or Application Mode) in any containerised environment, for example: +* manually in a local Docker setup, +* [in a Kubernetes cluster]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}), +* [with Docker Compose](#flink-with-docker-compose), +* [with Docker swarm](#flink-with-docker-swarm). + +Note [The native Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}) also runs the same image by default +and deploys TaskManagers on demand so that you do not have to do it manually. + +The next chapters describe how to start a single Flink Docker container for various purposes. + +Once you've started Flink on Docker, you can access the Flink Webfrontend on [localhost:8081](http://localhost:8081/#/overview) or submit jobs like this `./bin/flink run ./examples/streaming/TopSpeedWindowing.jar`. + +We recommend using [Docker Compose](#flink-with-docker-compose) or [Docker Swarm](#flink-with-docker-swarm) for deploying Flink in Session Mode to ease system configuration. + + +### Application Mode on Docker + +A *Flink Application cluster* is a dedicated cluster which runs a single job. +In this case, you deploy the cluster with the job as one step, thus, there is no extra job submission needed. + +The *job artifacts* are included into the class path of Flink's JVM process within the container and consist of: +* your job jar, which you would normally submit to a *Session cluster* and +* all other necessary dependencies or resources, not included into Flink. 
+ +To deploy a cluster for a single job with Docker, you need to +* make *job artifacts* available locally in all containers under `/opt/flink/usrlib`, +* start a JobManager container in the *Application cluster* mode +* start the required number of TaskManager containers. + +To make the **job artifacts available** locally in the container, you can + +* **either mount a volume** (or multiple volumes) with the artifacts to `/opt/flink/usrlib` when you start + the JobManager and TaskManagers: + + ```sh + $ FLINK_PROPERTIES="jobmanager.rpc.address: jobmanager" + $ docker network create flink-network + + $ docker run \ + --mount type=bind,src=/host/path/to/job/artifacts1,target=/opt/flink/usrlib/artifacts1 \ + --mount type=bind,src=/host/path/to/job/artifacts2,target=/opt/flink/usrlib/artifacts2 \ + --rm \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + --name=jobmanager \ + --network flink-network \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} standalone-job \ + --job-classname com.job.ClassName \ + [--job-id ] \ + [--fromSavepoint /path/to/savepoint [--allowNonRestoredState]] \ + [job arguments] + + $ docker run \ + --mount type=bind,src=/host/path/to/job/artifacts1,target=/opt/flink/usrlib/artifacts1 \ + --mount type=bind,src=/host/path/to/job/artifacts2,target=/opt/flink/usrlib/artifacts2 \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} taskmanager + ``` + +* **or extend the Flink image** by writing a custom `Dockerfile`, build it and use it for starting the JobManager and TaskManagers: + + + ```dockerfile + FROM flink + ADD /host/path/to/job/artifacts/1 /opt/flink/usrlib/artifacts/1 + ADD /host/path/to/job/artifacts/2 /opt/flink/usrlib/artifacts/2 + ``` + + ```sh + $ docker build --tag flink_with_job_artifacts . + $ docker run \ + flink_with_job_artifacts standalone-job \ + --job-classname com.job.ClassName \ + [--job-id ] \ + [--fromSavepoint /path/to/savepoint [--allowNonRestoredState]] \ + [job arguments] + + $ docker run flink_with_job_artifacts taskmanager + ``` + +The `standalone-job` argument starts a JobManager container in the Application Mode. + +#### JobManager additional command line arguments + +You can provide the following additional command line arguments to the cluster entrypoint: + +* `--job-classname `: Class name of the job to run. + + By default, Flink scans its class path for a JAR with a Main-Class or program-class manifest entry and chooses it as the job class. + Use this command line argument to manually set the job class. + This argument is required in case that no or more than one JAR with such a manifest entry is available on the class path. + +* `--job-id ` (optional): Manually set a Flink job ID for the job (default: 00000000000000000000000000000000) + +* `--fromSavepoint /path/to/savepoint` (optional): Restore from a savepoint + + In order to resume from a savepoint, you also need to pass the savepoint path. + Note that `/path/to/savepoint` needs to be accessible in all Docker containers of the cluster + (e.g., storing it on a DFS or from the mounted volume or adding it to the image). + +* `--allowNonRestoredState` (optional): Skip broken savepoint state + + Additionally you can specify this argument to allow that savepoint state is skipped which cannot be restored. 
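+
+Putting these arguments together, a complete invocation could look like the following sketch
+(the image name comes from the build example above; the job ID and savepoint path are purely hypothetical):
+
+```sh
+$ docker run flink_with_job_artifacts standalone-job \
+    --job-classname com.job.ClassName \
+    --job-id 00000000000000000000000000000001 \
+    --fromSavepoint s3://my-bucket/savepoints/savepoint-123abc \
+    --allowNonRestoredState
+```
+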
+ +If the main function of the user job main class accepts arguments, you can also pass them at the end of the `docker run` command. + +### Per-Job Mode on Docker + +[Per-Job Mode]({{< ref "docs/deployment/overview" >}}#per-job-mode) is not supported by Flink on Docker. + +### Session Mode on Docker + +Local deployment in the Session Mode has already been described in the [Getting Started](#starting-a-session-cluster-on-docker) section above. + + +{{< top >}} + +## Flink on Docker Reference + +### Image hosting + +There are two distribution channels for the Flink Docker images: +1. [Official Flink images on Docker Hub (reviewed and build by Docker)](https://hub.docker.com/_/flink/) +2. [Flink images on Docker Hub `apache/flink` (managed by the Flink developers)](https://hub.docker.com/r/apache/flink) + +We recommend using the official images on Docker Hub, as they are reviewed by Docker. The images on `apache/flink` are provided in case of delays in the review process by Docker. + +Launching an image named `flink:latest` will pull the latest image from Docker Hub. In order to use the images hosted in `apache/flink`, replace `flink` by `apache/flink`. Any of the image tags (starting from Flink 1.11.3) are avaialble on `apache/flink` as well. + +### Image tags + +The [Flink Docker repository](https://hub.docker.com/_/flink/) is hosted on Docker Hub and serves images of Flink version 1.2.1 and later. +The source for these images can be found in the [Apache flink-docker](https://github.com/apache/flink-docker) repository. + +Images for each supported combination of Flink and Scala versions are available, and +[tag aliases](https://hub.docker.com/_/flink?tab=tags) are provided for convenience. + +For example, you can use the following aliases: + +* `flink:latest` → `flink:-scala_` +* `flink:1.11` → `flink:1.11.-scala_2.11` + +Note It is recommended to always use an explicit version tag of the docker image that specifies both the needed Flink and Scala +versions (for example `flink:1.11-scala_2.12`). +This will avoid some class conflicts that can occur if the Flink and/or Scala versions used in the application are different +from the versions provided by the docker image. + +Note Prior to Flink 1.5 version, Hadoop dependencies were always bundled with Flink. +You can see that certain tags include the version of Hadoop, e.g. (e.g. `-hadoop28`). +Beginning with Flink 1.5, image tags that omit the Hadoop version correspond to Hadoop-free releases of Flink +that do not include a bundled Hadoop distribution. + + +### Passing configuration via environment variables + +When you run Flink image, you can also change its configuration options by setting the environment variable `FLINK_PROPERTIES`: + +```sh +$ FLINK_PROPERTIES="jobmanager.rpc.address: host +taskmanager.numberOfTaskSlots: 3 +blob.server.port: 6124 +" +$ docker run --env FLINK_PROPERTIES=${FLINK_PROPERTIES} flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} +``` + +The [`jobmanager.rpc.address`]({{< ref "docs/deployment/config" >}}#jobmanager-rpc-address) option must be configured, others are optional to set. + +The environment variable `FLINK_PROPERTIES` should contain a list of Flink cluster configuration options separated by new line, +the same way as in the `flink-conf.yaml`. `FLINK_PROPERTIES` takes precedence over configurations in `flink-conf.yaml`. 
+ +### Provide custom configuration + +The configuration files (`flink-conf.yaml`, logging, hosts etc) are located in the `/opt/flink/conf` directory in the Flink image. +To provide a custom location for the Flink configuration files, you can + +* **either mount a volume** with the custom configuration files to this path `/opt/flink/conf` when you run the Flink image: + + ```sh + $ docker run \ + --mount type=bind,src=/host/path/to/custom/conf,target=/opt/flink/conf \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + ``` + +* or add them to your **custom Flink image**, build and run it: + + + ```dockerfile + FROM flink + ADD /host/path/to/flink-conf.yaml /opt/flink/conf/flink-conf.yaml + ADD /host/path/to/log4j.properties /opt/flink/conf/log4j.properties + ``` + +{{< hint info >}} +The mounted volume must contain all necessary configuration files. +The `flink-conf.yaml` file must have write permission so that the Docker entry point script can modify it in certain cases. +{{< /hint >}} + +### Using filesystem plugins + +As described in the [plugins]({{< ref "docs/deployment/filesystems/plugins" >}}) documentation page: In order to use plugins they must be +copied to the correct location in the Flink installation in the Docker container for them to work. + +If you want to enable plugins provided with Flink (in the `opt/` directory of the Flink distribution), you can pass the environment variable `ENABLE_BUILT_IN_PLUGINS` when you run the Flink image. +The `ENABLE_BUILT_IN_PLUGINS` should contain a list of plugin jar file names separated by `;`. A valid plugin name is for example `flink-s3-fs-hadoop-{{site.version}}.jar` + +```sh + $ docker run \ + --env ENABLE_BUILT_IN_PLUGINS=flink-plugin1.jar;flink-plugin2.jar \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} +``` + +There are also more [advanced ways](#advanced-customization) for customizing the Flink image. + +### Enabling Python + +To build a custom image which has Python and PyFlink prepared, you can refer to the following Dockerfile: +```Dockerfile +{{< stable >}} +FROM flink:{{< version >}} +{{< /stable >}} +{{< unstable >}} +FROM flink:latest +{{< /unstable >}} + +# install python3 and pip3 +RUN apt-get update -y && \ +apt-get install -y python3.7 python3-pip python3.7-dev && rm -rf /var/lib/apt/lists/* +RUN ln -s /usr/bin/python3 /usr/bin/python + +# install Python Flink +{{< stable >}} +RUN pip3 install apache-flink[=={{site.version}}] +{{< /stable >}} +{{< unstable >}} +RUN pip3 install apache-flink +{{< /unstable >}} +``` + +Build the image named as **pyflink:latest**: + +```bash +$ docker build --tag pyflink:latest . +``` + +### Switch memory allocator + +Flink introduced `jemalloc` as default memory allocator to resolve memory fragmentation problem (please refer to [FLINK-19125](https://issues.apache.org/jira/browse/FLINK-19125)). 
+ +You could switch back to use `glibc` as the memory allocator to restore the old behavior or if any unexpected memory consumption or problem observed +(and please report the issue via JIRA or mailing list if you found any), by setting environment variable `DISABLE_JEMALLOC` as true: + +```sh + $ docker run \ + --env DISABLE_JEMALLOC=true \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} +``` + +### Advanced customization + +There are several ways in which you can further customize the Flink image: + +* install custom software (e.g. python) +* enable (symlink) optional libraries or plugins from `/opt/flink/opt` into `/opt/flink/lib` or `/opt/flink/plugins` +* add other libraries to `/opt/flink/lib` (e.g. Hadoop) +* add other plugins to `/opt/flink/plugins` + +You can customize the Flink image in several ways: + +* **override the container entry point** with a custom script where you can run any bootstrap actions. + At the end you can call the standard `/docker-entrypoint.sh` script of the Flink image with the same arguments + as described in [supported deployment modes](#deployment-modes). + + The following example creates a custom entry point script which enables more libraries and plugins. + The custom script, custom library and plugin are provided from a mounted volume. + Then it runs the standard entry point script of the Flink image: + + ```sh + # create custom_lib.jar + # create custom_plugin.jar + + $ echo " + # enable an optional library + ln -fs /opt/flink/opt/flink-queryable-state-runtime-*.jar /opt/flink/lib/ + # enable a custom library + ln -fs /mnt/custom_lib.jar /opt/flink/lib/ + + mkdir -p /opt/flink/plugins/flink-s3-fs-hadoop + # enable an optional plugin + ln -fs /opt/flink/opt/flink-s3-fs-hadoop-*.jar /opt/flink/plugins/flink-s3-fs-hadoop/ + + mkdir -p /opt/flink/plugins/custom_plugin + # enable a custom plugin + ln -fs /mnt/custom_plugin.jar /opt/flink/plugins/custom_plugin/ + + /docker-entrypoint.sh + " > custom_entry_point_script.sh + + $ chmod 755 custom_entry_point_script.sh + + $ docker run \ + --mount type=bind,src=$(pwd),target=/mnt + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} /mnt/custom_entry_point_script.sh + ``` + +* **extend the Flink image** by writing a custom `Dockerfile` and build a custom image: + + + ```dockerfile + FROM flink + + RUN set -ex; apt-get update; apt-get -y install python + + ADD /host/path/to/flink-conf.yaml /container/local/path/to/custom/conf/flink-conf.yaml + ADD /host/path/to/log4j.properties /container/local/path/to/custom/conf/log4j.properties + + RUN ln -fs /opt/flink/opt/flink-queryable-state-runtime-*.jar /opt/flink/lib/. + + RUN mkdir -p /opt/flink/plugins/flink-s3-fs-hadoop + RUN ln -fs /opt/flink/opt/flink-s3-fs-hadoop-*.jar /opt/flink/plugins/flink-s3-fs-hadoop/. + + ENV VAR_NAME value + ``` + + **Commands for building**: + + ```sh + $ docker build --tag custom_flink_image . + # optional push to your docker image registry if you have it, + # e.g. to distribute the custom image to your cluster + $ docker push custom_flink_image + ``` + + +### Flink with Docker Compose + +[Docker Compose](https://docs.docker.com/compose/) is a way to run a group of Docker containers locally. +The next sections show examples of configuration files to run Flink. 
+ +#### Usage + +* Create the `yaml` files with the container configuration, check examples for: + * [Application cluster](#app-cluster-yml) + * [Session cluster](#session-cluster-yml) + + See also [the Flink Docker image tags](#image-tags) and [how to customize the Flink Docker image](#advanced-customization) + for usage in the configuration files. + +* Launch a cluster in the foreground (use `-d` for background) + + ```sh + $ docker-compose up + ``` + +* Scale the cluster up or down to `N` TaskManagers + + ```sh + $ docker-compose scale taskmanager= + ``` + +* Access the JobManager container + + ```sh + $ docker exec -it $(docker ps --filter name=jobmanager --format={{.ID}}) /bin/sh + ``` + +* Kill the cluster + + ```sh + $ docker-compose kill + ``` + +* Access Web UI + + When the cluster is running, you can visit the web UI at [http://localhost:8081](http://localhost:8081). + You can also use the web UI to submit a job to a *Session cluster*. + +* To submit a job to a *Session cluster* via the command line, you can either + + * use [Flink CLI]({{< ref "docs/deployment/cli" >}}) on the host if it is installed: + + ```sh + $ ./bin/flink run --detached --class ${JOB_CLASS_NAME} /job.jar + ``` + + * or copy the JAR to the JobManager container and submit the job using the [CLI]({{< ref "docs/deployment/cli" >}}) from there, for example: + + ```sh + $ JOB_CLASS_NAME="com.job.ClassName" + $ JM_CONTAINER=$(docker ps --filter name=jobmanager --format={{.ID}})) + $ docker cp path/to/jar "${JM_CONTAINER}":/job.jar + $ docker exec -t -i "${JM_CONTAINER}" flink run -d -c ${JOB_CLASS_NAME} /job.jar + ``` + +Here, we provide the
    docker-compose.yml for *Application Cluster*. + +Note: For the Application Mode cluster, the artifacts must be available in the Flink containers, check details [here](#application-mode-on-docker). +See also [how to specify the JobManager arguments](#jobmanager-additional-command-line-arguments) +in the `command` for the `jobmanager` service. + +```yaml +version: "2.2" +services: + jobmanager: + image: flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + ports: + - "8081:8081" + command: standalone-job --job-classname com.job.ClassName [--job-id ] [--fromSavepoint /path/to/savepoint [--allowNonRestoredState]] [job arguments] + volumes: + - /host/path/to/job/artifacts:/opt/flink/usrlib + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + parallelism.default: 2 + + taskmanager: + image: flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + depends_on: + - jobmanager + command: taskmanager + scale: 1 + volumes: + - /host/path/to/job/artifacts:/opt/flink/usrlib + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + taskmanager.numberOfTaskSlots: 2 + parallelism.default: 2 +``` + + +As well as the configuration file for *Session Cluster*: + + +```yaml +version: "2.2" +services: + jobmanager: + image: flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + ports: + - "8081:8081" + command: jobmanager + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + + taskmanager: + image: flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + depends_on: + - jobmanager + command: taskmanager + scale: 1 + environment: + - | + FLINK_PROPERTIES= + jobmanager.rpc.address: jobmanager + taskmanager.numberOfTaskSlots: 2 +``` + + +### Flink with Docker Swarm + +The [Docker swarm](https://docs.docker.com/engine/swarm) is a container orchestration tool, that +allows you to manage multiple containers deployed across multiple host machines. + +The following chapters contain examples of how to configure and start JobManager and TaskManager containers. +You can adjust them accordingly to start a cluster. +See also [the Flink Docker image tags](#image-tags) and [how to customize the Flink Docker image](#advanced-customization) for usage in the provided scripts. + +The port `8081` is exposed for the Flink Web UI access. +If you run the swarm locally, you can visit the web UI at [http://localhost:8081](http://localhost:8081) after starting the cluster. 
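+
+If no swarm is running yet, a minimal single-node swarm for local testing can be created first
+(a sketch; in a multi-node setup the other machines would join via `docker swarm join` instead):
+
+```sh
+# initialize a single-node swarm on the current machine
+$ docker swarm init
+
+# verify that the node is ready to schedule services
+$ docker node ls
+```
+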
+ +#### Session Cluster with Docker Swarm + +```sh +$ FLINK_PROPERTIES="jobmanager.rpc.address: flink-session-jobmanager +taskmanager.numberOfTaskSlots: 2 +" + +# Create overlay network +$ docker network create -d overlay flink-session + +# Create the JobManager service +$ docker service create \ + --name flink-session-jobmanager \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + --publish 8081:8081 \ + --network flink-session \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} \ + jobmanager + +# Create the TaskManager service (scale this out as needed) +$ docker service create \ + --name flink-session-taskmanager \ + --replicas 2 \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + --network flink-session \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} \ + taskmanager +``` + +#### Application Cluster with Docker Swarm + +```sh +$ FLINK_PROPERTIES="jobmanager.rpc.address: flink-jobmanager +taskmanager.numberOfTaskSlots: 2 +" + +# Create overlay network +$ docker network create -d overlay flink-job + +# Create the JobManager service +$ docker service create \ + --name flink-jobmanager \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + --mount type=bind,source=/host/path/to/job/artifacts,target=/opt/flink/usrlib \ + --publish 8081:8081 \ + --network flink-job \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} \ + standalone-job \ + --job-classname com.job.ClassName \ + [--job-id ] \ + [--fromSavepoint /path/to/savepoint [--allowNonRestoredState]] \ + [job arguments] + +# Create the TaskManager service (scale this out as needed) +$ docker service create \ + --name flink-job-taskmanager \ + --replicas 2 \ + --env FLINK_PROPERTIES="${FLINK_PROPERTIES}" \ + --mount type=bind,source=/host/path/to/job/artifacts,target=/opt/flink/usrlib \ + --network flink-job \ + flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} \ + taskmanager +``` + +The *job artifacts* must be available in the JobManager container, as outlined [here](#application-mode-on-docker). +See also [how to specify the JobManager arguments](#jobmanager-additional-command-line-arguments) to pass them +to the `flink-jobmanager` container. + +The example assumes that you run the swarm locally and expects the *job artifacts* to be in `/host/path/to/job/artifacts`. +It also mounts the host path with the artifacts as a volume to the container's path `/opt/flink/usrlib`. + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md b/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md new file mode 100644 index 0000000000000..b9f51ce71a23a --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/standalone/kubernetes.md @@ -0,0 +1,769 @@ +--- +title: Kubernetes 设置 +weight: 5 +type: docs +aliases: + - /zh/deployment/resource-providers/standalone/kubernetes.html + - /zh/ops/deployment/kubernetes.html +--- + + +# Kubernetes Setup + +## Getting Started + +This *Getting Started* guide describes how to deploy a *Session cluster* on [Kubernetes](https://kubernetes.io). + +### Introduction + +This page describes deploying a [standalone]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}) Flink cluster on top of Kubernetes, using Flink's standalone deployment. 
+We generally recommend new users to deploy Flink on Kubernetes using [native Kubernetes deployments]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}). + +### Preparation + +This guide expects a Kubernetes environment to be present. You can ensure that your Kubernetes setup is working by running a command like `kubectl get nodes`, which lists all connected Kubelets. + +If you want to run Kubernetes locally, we recommend using [MiniKube](https://minikube.sigs.k8s.io/docs/start/). + +{{< hint info >}} +If using MiniKube please make sure to execute `minikube ssh 'sudo ip link set docker0 promisc on'` before deploying a Flink cluster. Otherwise Flink components are not able to reference themselves through a Kubernetes service. +{{< /hint >}} + +### Starting a Kubernetes Cluster (Session Mode) + +A *Flink Session cluster* is executed as a long-running Kubernetes Deployment. You can run multiple Flink jobs on a *Session cluster*. +Each job needs to be submitted to the cluster after the cluster has been deployed. + +A *Flink Session cluster* deployment in Kubernetes has at least three components: + +* a *Deployment* which runs a [JobManager]({{< ref "docs/concepts/glossary" >}}#flink-jobmanager) +* a *Deployment* for a pool of [TaskManagers]({{< ref "docs/concepts/glossary" >}}#flink-taskmanager) +* a *Service* exposing the *JobManager's* REST and UI ports + +Using the file contents provided in the [the common resource definitions](#common-cluster-resource-definitions), create the following files, and create the respective components with the `kubectl` command: + +```sh + # Configuration and service definition + $ kubectl create -f flink-configuration-configmap.yaml + $ kubectl create -f jobmanager-service.yaml + # Create the deployments for the cluster + $ kubectl create -f jobmanager-session-deployment.yaml + $ kubectl create -f taskmanager-session-deployment.yaml +``` + +Next, we set up a port forward to access the Flink UI and submit jobs: + +1. Run `kubectl port-forward ${flink-jobmanager-pod} 8081:8081` to forward your jobmanager's web ui port to local 8081. +2. Navigate to [http://localhost:8081](http://localhost:8081) in your browser. +3. Moreover, you could use the following command below to submit jobs to the cluster: +```bash +$ ./bin/flink run -m localhost:8081 ./examples/streaming/TopSpeedWindowing.jar +``` + + + +You can tear down the cluster using the following commands: + +```sh + $ kubectl delete -f jobmanager-service.yaml + $ kubectl delete -f flink-configuration-configmap.yaml + $ kubectl delete -f taskmanager-session-deployment.yaml + $ kubectl delete -f jobmanager-session-deployment.yaml +``` + + +{{< top >}} + +## Deployment Modes + +### Deploy Application Cluster + +A *Flink Application cluster* is a dedicated cluster which runs a single application, which needs to be available at deployment time. + +A basic *Flink Application cluster* deployment in Kubernetes has three components: + +* an *Application* which runs a *JobManager* +* a *Deployment* for a pool of *TaskManagers* +* a *Service* exposing the *JobManager's* REST and UI ports + +Check [the Application cluster specific resource definitions](#application-cluster-resource-definitions) and adjust them accordingly: + +The `args` attribute in the `jobmanager-job.yaml` has to specify the main class of the user job. 
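+
+For illustration only, such an `args` entry could look like the following sketch; the class name and the trailing program arguments are hypothetical placeholders and must be replaced with those of your own job:
+
+```yaml
+# Illustrative only: start the JobManager in Application Mode and run the given main class.
+# The values after the class name are arguments passed to the job's main() method.
+args: ["standalone-job", "--job-classname", "org.example.StreamingJob", "--input", "/opt/flink/usrlib/input.csv"]
+```
+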
+See also [how to specify the JobManager arguments]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#jobmanager-additional-command-line-arguments) to understand +how to pass other `args` to the Flink image in the `jobmanager-job.yaml`. + +The *job artifacts* should be available from the `job-artifacts-volume` in [the resource definition examples](#application-cluster-resource-definitions). +The definition examples mount the volume as a local directory of the host assuming that you create the components in a minikube cluster. +If you do not use a minikube cluster, you can use any other type of volume, available in your Kubernetes cluster, to supply the *job artifacts*. +Alternatively, you can build [a custom image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#advanced-customization) which already contains the artifacts instead. + +After creating [the common cluster components](#common-cluster-resource-definitions), use [the Application cluster specific resource definitions](#application-cluster-resource-definitions) to launch the cluster with the `kubectl` command: + +```sh + $ kubectl create -f jobmanager-job.yaml + $ kubectl create -f taskmanager-job-deployment.yaml +``` + +To terminate the single application cluster, these components can be deleted along with [the common ones](#common-cluster-resource-definitions) +with the `kubectl` command: + +```sh + $ kubectl delete -f taskmanager-job-deployment.yaml + $ kubectl delete -f jobmanager-job.yaml +``` + +### Per-Job Cluster Mode +Flink on Standalone Kubernetes does not support the Per-Job Cluster Mode. + +### Session Mode + +Deployment of a Session cluster is explained in the [Getting Started](#getting-started) guide at the top of this page. + +{{< top >}} + +## Flink on Standalone Kubernetes Reference + +### Configuration + +All configuration options are listed on the [configuration page]({{< ref "docs/deployment/config" >}}). Configuration options can be added to the `flink-conf.yaml` section of the `flink-configuration-configmap.yaml` config map. + +### Accessing Flink in Kubernetes + +You can then access the Flink UI and submit jobs via different ways: +* `kubectl proxy`: + + 1. Run `kubectl proxy` in a terminal. + 2. Navigate to [http://localhost:8001/api/v1/namespaces/default/services/flink-jobmanager:webui/proxy](http://localhost:8001/api/v1/namespaces/default/services/flink-jobmanager:webui/proxy) in your browser. + +* `kubectl port-forward`: + 1. Run `kubectl port-forward ${flink-jobmanager-pod} 8081:8081` to forward your jobmanager's web ui port to local 8081. + 2. Navigate to [http://localhost:8081](http://localhost:8081) in your browser. + 3. Moreover, you can use the following command below to submit jobs to the cluster: + ```bash + $ ./bin/flink run -m localhost:8081 ./examples/streaming/TopSpeedWindowing.jar + ``` + +* Create a `NodePort` service on the rest service of jobmanager: + 1. Run `kubectl create -f jobmanager-rest-service.yaml` to create the `NodePort` service on jobmanager. The example of `jobmanager-rest-service.yaml` can be found in [appendix](#common-cluster-resource-definitions). + 2. Run `kubectl get svc flink-jobmanager-rest` to know the `node-port` of this service and navigate to [http://<public-node-ip>:<node-port>](http://:) in your browser. + 3. If you use minikube, you can get its public ip by running `minikube ip`. + 4. 
Similarly to the `port-forward` solution, you can also use the following command below to submit jobs to the cluster: + + ```bash + $ ./bin/flink run -m : ./examples/streaming/TopSpeedWindowing.jar + ``` + +### Debugging and Log Access + +Many common errors are easy to detect by checking Flink's log files. If you have access to Flink's web user interface, you can access the JobManager and TaskManager logs from there. + +If there are problems starting Flink, you can also use Kubernetes utilities to access the logs. Use `kubectl get pods` to see all running pods. +For the quickstart example from above, you should see three pods: +``` +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +flink-jobmanager-589967dcfc-m49xv 1/1 Running 3 3m32s +flink-taskmanager-64847444ff-7rdl4 1/1 Running 3 3m28s +flink-taskmanager-64847444ff-nnd6m 1/1 Running 3 3m28s +``` + +You can now access the logs by running `kubectl logs flink-jobmanager-589967dcfc-m49xv` + +### High-Availability with Standalone Kubernetes + +For high availability on Kubernetes, you can use the [existing high availability services]({{< ref "docs/deployment/ha/overview" >}}). + +#### Kubernetes High-Availability Services + +Session Mode and Application Mode clusters support using the [Kubernetes high availability service]({{< ref "docs/deployment/ha/kubernetes_ha" >}}). +You need to add the following Flink config options to [flink-configuration-configmap.yaml](#common-cluster-resource-definitions). + +Note The filesystem which corresponds to the scheme of your configured HA storage directory must be available to the runtime. Refer to [custom Flink image]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#advanced-customization) and [enable plugins]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#using-filesystem-plugins) for more information. + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flink-config + labels: + app: flink +data: + flink-conf.yaml: |+ + ... + kubernetes.cluster-id: + high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory + high-availability.storageDir: hdfs:///flink/recovery + restart-strategy: fixed-delay + restart-strategy.fixed-delay.attempts: 10 + ... +``` + +Moreover, you have to start the JobManager and TaskManager pods with a service account which has the permissions to create, edit, delete ConfigMaps. +See [how to configure service accounts for pods](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/) for more information. + +When High-Availability is enabled, Flink will use its own HA-services for service discovery. +Therefore, JobManager pods should be started with their IP address instead of a Kubernetes service as its `jobmanager.rpc.address`. +Refer to the [appendix](#appendix) for full configuration. + +#### Standby JobManagers + +Usually, it is enough to only start a single JobManager pod, because Kubernetes will restart it once the pod crashes. +If you want to achieve faster recovery, configure the `replicas` in `jobmanager-session-deployment-ha.yaml` or `parallelism` in `jobmanager-application-ha.yaml` to a value greater than `1` to start standby JobManagers. + +### Enabling Queryable State + +You can access the queryable state of TaskManager if you create a `NodePort` service for it: + 1. Run `kubectl create -f taskmanager-query-state-service.yaml` to create the `NodePort` service for the `taskmanager` pod. 
The example of `taskmanager-query-state-service.yaml` can be found in [appendix](#common-cluster-resource-definitions). + 2. Run `kubectl get svc flink-taskmanager-query-state` to get the `` of this service. Then you can create the [QueryableStateClient(<public-node-ip>, <node-port>]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}#querying-state) to submit state queries. + +### Using Standalone Kubernetes with Reactive Mode + +[Reactive Mode]({{< ref "docs/deployment/elastic_scaling" >}}#reactive-mode) allows to run Flink in a mode, where the *Application Cluster* is always adjusting the job parallelism to the available resources. In combination with Kubernetes, the replica count of the TaskManager deployment determines the available resources. Increasing the replica count will scale up the job, reducing it will trigger a scale down. This can also be done automatically by using a [Horizontal Pod Autoscaler](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/). + +To use Reactive Mode on Kubernetes, follow the same steps as for [deploying a job using an Application Cluster](#deploy-application-cluster). But instead of `flink-configuration-configmap.yaml` use this config map: `flink-reactive-mode-configuration-configmap.yaml`. It contains the `scheduler-mode: reactive` setting for Flink. + +Once you have deployed the *Application Cluster*, you can scale your job up or down by changing the replica count in the `flink-taskmanager` deployment. + + +{{< top >}} + +## Appendix + +### Common cluster resource definitions + +`flink-configuration-configmap.yaml` +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flink-config + labels: + app: flink +data: + flink-conf.yaml: |+ + jobmanager.rpc.address: flink-jobmanager + taskmanager.numberOfTaskSlots: 2 + blob.server.port: 6124 + jobmanager.rpc.port: 6123 + taskmanager.rpc.port: 6122 + queryable-state.proxy.ports: 6125 + jobmanager.memory.process.size: 1600m + taskmanager.memory.process.size: 1728m + parallelism.default: 2 + log4j-console.properties: |+ + # This affects logging for both user code and Flink + rootLogger.level = INFO + rootLogger.appenderRef.console.ref = ConsoleAppender + rootLogger.appenderRef.rolling.ref = RollingFileAppender + + # Uncomment this if you want to _only_ change Flink's logging + #logger.flink.name = org.apache.flink + #logger.flink.level = INFO + + # The following lines keep the log level of common libraries/connectors on + # log level INFO. The root logger does not override this. You have to manually + # change the log levels here. 
+ logger.akka.name = akka + logger.akka.level = INFO + logger.kafka.name= org.apache.kafka + logger.kafka.level = INFO + logger.hadoop.name = org.apache.hadoop + logger.hadoop.level = INFO + logger.zookeeper.name = org.apache.zookeeper + logger.zookeeper.level = INFO + + # Log all infos to the console + appender.console.name = ConsoleAppender + appender.console.type = CONSOLE + appender.console.layout.type = PatternLayout + appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + + # Log all infos in the given rolling file + appender.rolling.name = RollingFileAppender + appender.rolling.type = RollingFile + appender.rolling.append = false + appender.rolling.fileName = ${sys:log.file} + appender.rolling.filePattern = ${sys:log.file}.%i + appender.rolling.layout.type = PatternLayout + appender.rolling.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + appender.rolling.policies.type = Policies + appender.rolling.policies.size.type = SizeBasedTriggeringPolicy + appender.rolling.policies.size.size=100MB + appender.rolling.strategy.type = DefaultRolloverStrategy + appender.rolling.strategy.max = 10 + + # Suppress the irrelevant (wrong) warnings from the Netty channel handler + logger.netty.name = org.apache.flink.shaded.akka.org.jboss.netty.channel.DefaultChannelPipeline + logger.netty.level = OFF +``` + + +`flink-reactive-mode-configuration-configmap.yaml` + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: flink-config + labels: + app: flink +data: + flink-conf.yaml: |+ + jobmanager.rpc.address: flink-jobmanager + taskmanager.numberOfTaskSlots: 2 + blob.server.port: 6124 + jobmanager.rpc.port: 6123 + taskmanager.rpc.port: 6122 + queryable-state.proxy.ports: 6125 + jobmanager.memory.process.size: 1600m + taskmanager.memory.process.size: 1728m + parallelism.default: 2 + scheduler-mode: reactive + execution.checkpointing.interval: 10s + log4j-console.properties: |+ + # This affects logging for both user code and Flink + rootLogger.level = INFO + rootLogger.appenderRef.console.ref = ConsoleAppender + rootLogger.appenderRef.rolling.ref = RollingFileAppender + + # Uncomment this if you want to _only_ change Flink's logging + #logger.flink.name = org.apache.flink + #logger.flink.level = INFO + + # The following lines keep the log level of common libraries/connectors on + # log level INFO. The root logger does not override this. You have to manually + # change the log levels here. 
+ logger.akka.name = akka + logger.akka.level = INFO + logger.kafka.name= org.apache.kafka + logger.kafka.level = INFO + logger.hadoop.name = org.apache.hadoop + logger.hadoop.level = INFO + logger.zookeeper.name = org.apache.zookeeper + logger.zookeeper.level = INFO + + # Log all infos to the console + appender.console.name = ConsoleAppender + appender.console.type = CONSOLE + appender.console.layout.type = PatternLayout + appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + + # Log all infos in the given rolling file + appender.rolling.name = RollingFileAppender + appender.rolling.type = RollingFile + appender.rolling.append = false + appender.rolling.fileName = ${sys:log.file} + appender.rolling.filePattern = ${sys:log.file}.%i + appender.rolling.layout.type = PatternLayout + appender.rolling.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + appender.rolling.policies.type = Policies + appender.rolling.policies.size.type = SizeBasedTriggeringPolicy + appender.rolling.policies.size.size=100MB + appender.rolling.strategy.type = DefaultRolloverStrategy + appender.rolling.strategy.max = 10 + + # Suppress the irrelevant (wrong) warnings from the Netty channel handler + logger.netty.name = org.apache.flink.shaded.akka.org.jboss.netty.channel.DefaultChannelPipeline + logger.netty.level = OFF +``` + +`jobmanager-service.yaml` Optional service, which is only necessary for non-HA mode. +```yaml +apiVersion: v1 +kind: Service +metadata: + name: flink-jobmanager +spec: + type: ClusterIP + ports: + - name: rpc + port: 6123 + - name: blob-server + port: 6124 + - name: webui + port: 8081 + selector: + app: flink + component: jobmanager +``` + +`jobmanager-rest-service.yaml`. Optional service, that exposes the jobmanager `rest` port as public Kubernetes node's port. +```yaml +apiVersion: v1 +kind: Service +metadata: + name: flink-jobmanager-rest +spec: + type: NodePort + ports: + - name: rest + port: 8081 + targetPort: 8081 + nodePort: 30081 + selector: + app: flink + component: jobmanager +``` + +`taskmanager-query-state-service.yaml`. Optional service, that exposes the TaskManager port to access the queryable state as a public Kubernetes node's port. 
+```yaml +apiVersion: v1 +kind: Service +metadata: + name: flink-taskmanager-query-state +spec: + type: NodePort + ports: + - name: query-state + port: 6125 + targetPort: 6125 + nodePort: 30025 + selector: + app: flink + component: taskmanager +``` + +### Session cluster resource definitions + +`jobmanager-session-deployment-non-ha.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-jobmanager +spec: + replicas: 1 + selector: + matchLabels: + app: flink + component: jobmanager + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + args: ["jobmanager"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties +``` + +`jobmanager-session-deployment-ha.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-jobmanager +spec: + replicas: 1 # Set the value to greater than 1 to start standby JobManagers + selector: + matchLabels: + app: flink + component: jobmanager + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. 
+ args: ["jobmanager", "$(POD_IP)"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties +``` + +`taskmanager-session-deployment.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-taskmanager +spec: + replicas: 2 + selector: + matchLabels: + app: flink + component: taskmanager + template: + metadata: + labels: + app: flink + component: taskmanager + spec: + containers: + - name: taskmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + args: ["taskmanager"] + ports: + - containerPort: 6122 + name: rpc + - containerPort: 6125 + name: query-state + livenessProbe: + tcpSocket: + port: 6122 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf/ + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties +``` + +### Application cluster resource definitions + +`jobmanager-application-non-ha.yaml` +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: flink-jobmanager +spec: + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + restartPolicy: OnFailure + containers: + - name: jobmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + args: ["standalone-job", "--job-classname", "com.job.ClassName", , ] # optional arguments: ["--job-id", "", "--fromSavepoint", "/path/to/savepoint", "--allowNonRestoredState"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + - name: job-artifacts-volume + mountPath: /opt/flink/usrlib + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties + - name: job-artifacts-volume + hostPath: + path: /host/path/to/job/artifacts +``` + +`jobmanager-application-ha.yaml` +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: flink-jobmanager +spec: + parallelism: 1 # Set the value to greater than 1 to start standby JobManagers + template: + metadata: + labels: + app: flink + component: jobmanager + spec: + restartPolicy: OnFailure + containers: + - name: jobmanager + image: 
apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + # The following args overwrite the value of jobmanager.rpc.address configured in the configuration config map to POD_IP. + args: ["standalone-job", "--host", "$(POD_IP)", "--job-classname", "com.job.ClassName", , ] # optional arguments: ["--job-id", "", "--fromSavepoint", "/path/to/savepoint", "--allowNonRestoredState"] + ports: + - containerPort: 6123 + name: rpc + - containerPort: 6124 + name: blob-server + - containerPort: 8081 + name: webui + livenessProbe: + tcpSocket: + port: 6123 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf + - name: job-artifacts-volume + mountPath: /opt/flink/usrlib + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + serviceAccountName: flink-service-account # Service account which has the permissions to create, edit, delete ConfigMaps + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties + - name: job-artifacts-volume + hostPath: + path: /host/path/to/job/artifacts +``` + +`taskmanager-job-deployment.yaml` +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: flink-taskmanager +spec: + replicas: 2 + selector: + matchLabels: + app: flink + component: taskmanager + template: + metadata: + labels: + app: flink + component: taskmanager + spec: + containers: + - name: taskmanager + image: apache/flink:{{< stable >}}{{< version >}}-scala{{< scala_version >}}{{< /stable >}}{{< unstable >}}latest{{< /unstable >}} + env: + args: ["taskmanager"] + ports: + - containerPort: 6122 + name: rpc + - containerPort: 6125 + name: query-state + livenessProbe: + tcpSocket: + port: 6122 + initialDelaySeconds: 30 + periodSeconds: 60 + volumeMounts: + - name: flink-config-volume + mountPath: /opt/flink/conf/ + - name: job-artifacts-volume + mountPath: /opt/flink/usrlib + securityContext: + runAsUser: 9999 # refers to user _flink_ from official flink image, change if necessary + volumes: + - name: flink-config-volume + configMap: + name: flink-config + items: + - key: flink-conf.yaml + path: flink-conf.yaml + - key: log4j-console.properties + path: log4j-console.properties + - name: job-artifacts-volume + hostPath: + path: /host/path/to/job/artifacts +``` + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/standalone/overview.md b/docs/content.zh/docs/deployment/resource-providers/standalone/overview.md new file mode 100644 index 0000000000000..b44607184900d --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/standalone/overview.md @@ -0,0 +1,241 @@ +--- +title: "概览" +weight: 2 +type: docs +aliases: + - /zh/deployment/resource-providers/standalone/ + - /zh/ops/deployment/cluster_setup.html + - /zh/apis/local_execution.html + - /zh/getting-started/tutorials/local_setup.html + - /zh/quickstart/setup_quickstart.html + - /zh/tutorials/flink_on_windows.html + - /zh/tutorials/local_setup.html + - /zh/getting-started/tutorials/flink_on_windows.html + - /zh/start/flink_on_windows.html +--- + + +# Standalone + +本页面提供了关于如何在*静态*(但可能异构)集群上以*完全分布式方式*运行 Flink 的说明。 + + + +## 需求 + + + +### 软件需求 + +Flink 运行在所有*类 UNIX 环境*下,例如 **Linux**,**Mac OS X** 和 
**Cygwin** (Windows),集群由**一个 master 节点**以及**一个或多个 worker 节点**构成。在配置系统之前,请确保**在每个节点上**安装有以下软件: + +- **Java 1.8.x** 或更高版本, +- **ssh** (必须运行 sshd 以执行用于管理 Flink 各组件的脚本) + +如果集群不满足软件要求,那么你需要安装/更新这些软件。 + +使集群中所有节点使用**免密码 SSH** 以及拥有**相同的目录结构**可以让你使用脚本来控制一切。 + +{{< top >}} + + + +### `JAVA_HOME` 配置 + +Flink 需要 master 和所有 worker 节点设置 `JAVA_HOME` 环境变量,并指向你的 Java 安装目录。 + +你可以在 `conf/flink-conf.yaml` 文件中通过 `env.java.home` 配置项来设置此变量。 + +{{< top >}} + + + +## Flink 设置 + +前往 [下载页面]({{ site.zh_download_url }}) 获取可运行的软件包。 + +在下载完最新的发布版本后,复制压缩文件到 master 节点并解压: + +```bash +tar xzf flink-*.tgz +cd flink-* +``` + + + +### 配置 Flink + +在解压完文件后,你需要编辑 *conf/flink-conf.yaml* 文件来为集群配置 Flink。 + +设置 `jobmanager.rpc.address` 配置项指向 master 节点。你也应该通过设置 `jobmanager.memory.process.size` 和 `taskmanager.memory.process.size` 配置项来定义 Flink 允许在每个节点上分配的最大内存值。 + +这些值的单位是 MB。如果一些 worker 节点上有你想分配到 Flink 系统的多余内存,你可以在这些特定节点的 *conf/flink-conf.yaml* 文件中重写 `taskmanager.memory.process.size` 或 `taskmanager.memory.flink.size` 的默认值。 + +最后,你必须提供集群上会被用作 worker 节点的所有节点列表,也就是运行 TaskManager 的节点。编辑文件 *conf/workers* 并输入每个 worker 节点的 IP 或主机名。 + +以下例子展示了三个节点(IP 地址从 _10.0.0.1_ 到 _10.0.0.3_,主机名为 _master_、_worker1_、_worker2_)的设置,以及配置文件(在所有机器上都需要在相同路径访问)的内容:
+
+`/path/to/flink/conf/flink-conf.yaml`:
+
+    jobmanager.rpc.address: 10.0.0.1
+
+`/path/to/flink/conf/workers`:
+
+    10.0.0.2
+    10.0.0.3
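+
+作为补充,下面是一个仅作示意的同步脚本草稿,用于把本机的 Flink 目录复制到上例中的两个 worker 节点(这里假设 Flink 位于 `/opt/flink`,实际路径与主机请按你的环境调整):
+
+```bash
+#!/usr/bin/env bash
+# 将本机的 Flink 目录(包含 conf/ 下的配置)同步到所有 worker 节点的相同路径
+FLINK_HOME=/opt/flink
+for host in 10.0.0.2 10.0.0.3; do
+  rsync -az --delete "${FLINK_HOME}/" "${host}:${FLINK_HOME}/"
+done
+```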
    + +Flink 目录必须放在所有 worker 节点的相同目录下。你可以使用共享的 NFS 目录,或将 Flink 目录复制到每个 worker 节点上。 + +请参考 [配置参数页面]({{< ref "docs/deployment/config.zh" >}}) 获取更多细节以及额外的配置项。 + +特别地, + +* 每个 JobManager 的可用内存值(`jobmanager.memory.process.size`), +* 每个 TaskManager 的可用内存值 (`taskmanager.memory.process.size`,并检查 [内存调优指南]({{< ref "docs/deployment/memory/mem_tuning.zh" >}}#configure-memory-for-standalone-deployment)), +* 每台机器的可用 CPU 数(`taskmanager.numberOfTaskSlots`), +* 集群中所有 CPU 数(`parallelism.default`)和 +* 临时目录(`io.tmp.dirs`) + +的值都是非常重要的配置项。 + +{{< top >}} + + + +### 启动 Flink + +下面的脚本在本地节点启动了一个 JobManager 并通过 SSH 连接到 *workers* 文件中所有的 worker 节点,在每个节点上启动 TaskManager。现在你的 Flink 系统已经启动并运行着。可以通过配置的 RPC 端口向本地节点上的 JobManager 提交作业。 + +假定你在 master 节点并且在 Flink 目录下: + +```bash +bin/start-cluster.sh +``` + +为了关闭 Flink,这里同样有一个 `stop-cluster.sh` 脚本。 + +{{< top >}} + + + +### 为集群添加 JobManager/TaskManager 实例 + +你可以使用 `bin/jobmanager.sh` 和 `bin/taskmanager.sh` 脚本为正在运行的集群添加 JobManager 和 TaskManager 实例。 + + + +#### 添加 JobManager + +```bash +bin/jobmanager.sh ((start|start-foreground) [host] [webui-port])|stop|stop-all +``` + + + +#### 添加 TaskManager + +```bash +bin/taskmanager.sh start|start-foreground|stop|stop-all +``` + +确保在你想启动/关闭相应实例的主机上执行这些脚本。 + +## High-Availability with Standalone + +In order to enable HA for a standalone cluster, you have to use the [ZooKeeper HA services]({{< ref "docs/deployment/ha/zookeeper_ha.zh" >}}). + +Additionally, you have to configure your cluster to start multiple JobManagers. + +### Masters File (masters) + +In order to start an HA-cluster configure the *masters* file in `conf/masters`: + +- **masters file**: The *masters file* contains all hosts, on which JobManagers are started, and the ports to which the web user interface binds. + +
    +jobManagerAddress1:webUIPort1
    +[...]
    +jobManagerAddressX:webUIPortX
    +  
    + +By default, the job manager will pick a *random port* for inter process communication. You can change this via the [high-availability.jobmanager.port]({{< ref "docs/deployment/config.zh" >}}#high-availability-jobmanager-port) key. This key accepts single ports (e.g. `50010`), ranges (`50000-50025`), or a combination of both (`50010,50011,50020-50025,50050-50075`). + +### Example: Standalone Cluster with 2 JobManagers + +1. **Configure high availability mode and ZooKeeper quorum** in `conf/flink-conf.yaml`: + +
    +high-availability: zookeeper
    +high-availability.zookeeper.quorum: localhost:2181
    +high-availability.zookeeper.path.root: /flink
    +high-availability.cluster-id: /cluster_one # important: customize per cluster
    +high-availability.storageDir: hdfs:///flink/recovery
    + +2. **Configure masters** in `conf/masters`: + +
    +localhost:8081
    +localhost:8082
    + +3. **Configure ZooKeeper server** in `conf/zoo.cfg` (currently it's only possible to run a single ZooKeeper server per machine): + +
    server.0=localhost:2888:3888
    + +4. **Start ZooKeeper quorum**: + +
    +$ bin/start-zookeeper-quorum.sh
    +Starting zookeeper daemon on host localhost.
    + +5. **Start an HA-cluster**: + +
    +$ bin/start-cluster.sh
    +Starting HA cluster with 2 masters and 1 peers in ZooKeeper quorum.
    +Starting standalonesession daemon on host localhost.
    +Starting standalonesession daemon on host localhost.
    +Starting taskexecutor daemon on host localhost.
    + +6. **Stop ZooKeeper quorum and cluster**: + +
    +$ bin/stop-cluster.sh
    +Stopping taskexecutor daemon (pid: 7647) on localhost.
    +Stopping standalonesession daemon (pid: 7495) on host localhost.
    +Stopping standalonesession daemon (pid: 7349) on host localhost.
    +$ bin/stop-zookeeper-quorum.sh
    +Stopping zookeeper daemon (pid: 7101) on host localhost.
    + + + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/resource-providers/yarn.md b/docs/content.zh/docs/deployment/resource-providers/yarn.md new file mode 100644 index 0000000000000..91d7c126e7889 --- /dev/null +++ b/docs/content.zh/docs/deployment/resource-providers/yarn.md @@ -0,0 +1,250 @@ +--- +title: YARN +weight: 5 +type: docs +aliases: + - /zh/deployment/resource-providers/yarn.html + - /zh/ops/deployment/yarn_setup.html +--- + + +# Apache Hadoop YARN + +## Getting Started + +This *Getting Started* section guides you through setting up a fully functional Flink Cluster on YARN. + +### Introduction + +[Apache Hadoop YARN](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) is a resource provider popular with many data processing frameworks. +Flink services are submitted to YARN's ResourceManager, which spawns containers on machines managed by YARN NodeManagers. Flink deploys its JobManager and TaskManager instances into such containers. + +Flink can dynamically allocate and de-allocate TaskManager resources depending on the number of processing slots required by the job(s) running on the JobManager. + +### Preparation + +This *Getting Started* section assumes a functional YARN environment, starting from version 2.4.1. YARN environments are provided most conveniently through services such as Amazon EMR, Google Cloud DataProc or products like Cloudera. [Manually setting up a YARN environment locally](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html) or [on a cluster](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html) is not recommended for following through this *Getting Started* tutorial. + +- Make sure your YARN cluster is ready for accepting Flink applications by running `yarn top`. It should show no error messages. +- Download a recent Flink distribution from the [download page]({{ site.download_url }}) and unpack it. +- **Important** Make sure that the `HADOOP_CLASSPATH` environment variable is set up (it can be checked by running `echo $HADOOP_CLASSPATH`). If not, set it up using + +```bash +export HADOOP_CLASSPATH=`hadoop classpath` +``` + +### Starting a Flink Session on YARN + +Once you've made sure that the `HADOOP_CLASSPATH` environment variable is set, you can launch a Flink on YARN session, and submit an example job: + +```bash + +# we assume to be in the root directory of +# the unzipped Flink distribution + +# (0) export HADOOP_CLASSPATH +export HADOOP_CLASSPATH=`hadoop classpath` + +# (1) Start YARN Session +./bin/yarn-session.sh --detached + +# (2) You can now access the Flink Web Interface through the +# URL printed in the last lines of the command output, or through +# the YARN ResourceManager web UI. + +# (3) Submit example job +./bin/flink run ./examples/streaming/TopSpeedWindowing.jar + +# (4) Stop YARN session (replace the application id based +# on the output of the yarn-session.sh command) +echo "stop" | ./bin/yarn-session.sh -id application_XXXXX_XXX +``` + +Congratulations! You have successfully run a Flink application by deploying Flink on YARN. + +{{< top >}} + +## Deployment Modes Supported by Flink on YARN + +For production use, we recommend deploying Flink Applications in the [Per-job or Application Mode]({{< ref "docs/deployment/overview" >}}#deployment-modes), as these modes provide a better isolation for the Applications. 
+ +### Application Mode + +Application Mode will launch a Flink cluster on YARN, where the main() method of the application jar gets executed on the JobManager in YARN. +The cluster will shut down as soon as the application has finished. You can manually stop the cluster using `yarn application -kill ` or by cancelling the Flink job. + +```bash +./bin/flink run-application -t yarn-application ./examples/streaming/TopSpeedWindowing.jar +``` + + +Once an Application Mode cluster is deployed, you can interact with it for operations like cancelling or taking a savepoint. + +```bash +# List running job on the cluster +./bin/flink list -t yarn-application -Dyarn.application.id=application_XXXX_YY +# Cancel running job +./bin/flink cancel -t yarn-application -Dyarn.application.id=application_XXXX_YY +``` + +Note that cancelling your job on an Application Cluster will stop the cluster. + +To unlock the full potential of the application mode, consider using it with the `yarn.provided.lib.dirs` configuration option +and pre-uploading your application jar to a location accessible by all nodes in your cluster. In this case, the +command could look like: + +```bash +./bin/flink run-application -t yarn-application \ + -Dyarn.provided.lib.dirs="hdfs://myhdfs/my-remote-flink-dist-dir" \ + hdfs://myhdfs/jars/my-application.jar +``` + +The above allows the job submission to be extra lightweight, as the needed Flink jars and the application jar +are picked up from the specified remote locations rather than being shipped to the cluster by the +client. + +### Per-Job Cluster Mode + +The Per-Job Cluster mode will launch a Flink cluster on YARN, then run the provided application jar locally and finally submit the JobGraph to the JobManager on YARN. If you pass the `--detached` argument, the client will stop once the submission is accepted. + +The YARN cluster will stop once the job has stopped. + +```bash +./bin/flink run -t yarn-per-job --detached ./examples/streaming/TopSpeedWindowing.jar +``` + +Once a Per-Job Cluster is deployed, you can interact with it for operations like cancelling or taking a savepoint. + +```bash +# List running job on the cluster +./bin/flink list -t yarn-per-job -Dyarn.application.id=application_XXXX_YY +# Cancel running job +./bin/flink cancel -t yarn-per-job -Dyarn.application.id=application_XXXX_YY +``` + +Note that cancelling your job on a Per-Job Cluster will stop the cluster. + + +### Session Mode + +We describe deployment with the Session Mode in the [Getting Started](#getting-started) guide at the top of the page. + +The Session Mode has two operation modes: +- **attached mode** (default): The `yarn-session.sh` client submits the Flink cluster to YARN, but the client keeps running, tracking the state of the cluster. If the cluster fails, the client will show the error. If the client gets terminated, it will signal the cluster to shut down as well. +- **detached mode** (`-d` or `--detached`): The `yarn-session.sh` client submits the Flink cluster to YARN, then the client returns. Another invocation of the client or of the YARN tools is needed to stop the Flink cluster. + +The session mode will create a hidden YARN properties file in `/tmp/.yarn-properties-`, which will be picked up for cluster discovery by the command line interface when submitting a job. + +You can also **manually specify the target YARN cluster** in the command line interface when submitting a Flink job.
Here's an example: + +```bash +./bin/flink run -t yarn-session \ + -Dyarn.application.id=application_XXXX_YY \ + ./examples/streaming/TopSpeedWindowing.jar +``` + +You can **re-attach to a YARN session** using the following command: + +``` +./bin/yarn-session.sh -id application_XXXX_YY +``` + +Besides passing [configuration]({{< ref "docs/deployment/config" >}}) via the `conf/flink-conf.yaml` file, you can also pass any configuration at submission time to the `./bin/yarn-session.sh` client using `-Dkey=value` arguments. + +The YARN session client also has a few "shortcut arguments" for commonly used settings. They can be listed with `./bin/yarn-session.sh -h`. + +{{< top >}} + +## Flink on YARN Reference + +### Configuring Flink on YARN + +The YARN-specific configurations are listed on the [configuration page]({{< ref "docs/deployment/config" >}}#yarn). + +The following configuration parameters are managed by Flink on YARN, as they might get overwritten by the framework at runtime: +- `jobmanager.rpc.address` (dynamically set to the address of the JobManager container by Flink on YARN) +- `io.tmp.dirs` (If not set, Flink sets the temporary directories defined by YARN) +- `high-availability.cluster-id` (automatically generated ID to distinguish multiple clusters in the HA service) + +If you need to pass additional Hadoop configuration files to Flink, you can do so via the `HADOOP_CONF_DIR` environment variable, which accepts a directory name containing Hadoop configuration files. By default, all required Hadoop configuration files are loaded from the classpath via the `HADOOP_CLASSPATH` environment variable. + +### Resource Allocation Behavior + +A JobManager running on YARN will request additional TaskManagers if it cannot run all submitted jobs with the existing resources. In particular when running in Session Mode, the JobManager will, if needed, allocate additional TaskManagers as additional jobs are submitted. Unused TaskManagers are freed up again after a timeout. + +The memory configurations for JobManager and TaskManager processes will be respected by the YARN implementation. The number of reported VCores is by default equal to the number of configured slots per TaskManager. The [yarn.containers.vcores]({{< ref "docs/deployment/config" >}}#yarn-containers-vcores) option allows overwriting the number of vcores with a custom value. In order for this parameter to work, you should enable CPU scheduling in your YARN cluster. + +Failed containers (including the JobManager) are replaced by YARN. The maximum number of JobManager container restarts is configured via [yarn.application-attempts]({{< ref "docs/deployment/config" >}}#yarn-application-attempts) (default 1). The YARN Application will fail once all attempts are exhausted. + +### High-Availability on YARN + +High-Availability on YARN is achieved through a combination of YARN and a [high availability service]({{< ref "docs/deployment/ha/overview" >}}). + +Once an HA service is configured, it will persist JobManager metadata and perform leader elections. + +YARN takes care of restarting failed JobManagers. The maximum number of JobManager restarts is defined through two configuration parameters. First, Flink's [yarn.application-attempts]({{< ref "docs/deployment/config" >}}#yarn-application-attempts) configuration defaults to 2. This value is limited by YARN's [yarn.resourcemanager.am.max-attempts](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-common/yarn-default.xml), which also defaults to 2.
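+
+As an illustration, a minimal `flink-conf.yaml` sketch combining the options discussed above could look as follows; the ZooKeeper hosts, the storage path and the attempt count are placeholders, and `high-availability.cluster-id` is intentionally left out (see the note that follows):
+
+```yaml
+# Hypothetical HA configuration for a Flink cluster on YARN
+high-availability: zookeeper
+high-availability.zookeeper.quorum: zk-host-1:2181,zk-host-2:2181,zk-host-3:2181
+high-availability.storageDir: hdfs:///flink/recovery
+# Allow the JobManager (ApplicationMaster) to be restarted up to 4 times;
+# this also requires a matching yarn.resourcemanager.am.max-attempts on the YARN side.
+yarn.application-attempts: 4
+```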
+ +Note that Flink manages the `high-availability.cluster-id` configuration parameter when deploying on YARN. +Flink sets it by default to the YARN application id. +**You should not overwrite this parameter when deploying an HA cluster on YARN**. +The cluster ID is used to distinguish multiple HA clusters in the HA backend (for example Zookeeper). +Overwriting this configuration parameter can lead to multiple YARN clusters affecting each other. + +#### Container Shutdown Behaviour + +- **YARN 2.3.0 < version < 2.4.0**. All containers are restarted if the application master fails. +- **YARN 2.4.0 < version < 2.6.0**. TaskManager containers are kept alive across application master failures. This has the advantage that the startup time is faster and that the user does not have to wait for obtaining the container resources again. +- **YARN 2.6.0 <= version**: Sets the attempt failure validity interval to Flink's Akka timeout value. The attempt failure validity interval means that an application is only killed after the system has seen the maximum number of application attempts during one interval. This prevents a long-lasting job from depleting its application attempts. + +{{< hint danger >}} +Hadoop YARN 2.4.0 has a major bug (fixed in 2.5.0) preventing container restarts from a restarted Application Master/Job Manager container. See FLINK-4142 for details. We recommend using at least Hadoop 2.5.0 for high availability setups on YARN.

    +{{< /hint >}} + +### Supported Hadoop versions. + +Flink on YARN is compiled against Hadoop 2.4.1, and all Hadoop versions `>= 2.4.1` are supported, including Hadoop 3.x. + +For providing Flink with the required Hadoop dependencies, we recommend setting the `HADOOP_CLASSPATH` environment variable already introduced in the [Getting Started / Preparation](#preparation) section. + +If that is not possible, the dependencies can also be put into the `lib/` folder of Flink. + +Flink also offers pre-bundled Hadoop fat jars for placing them in the `lib/` folder, on the [Downloads / Additional Components]({{site.download_url}}#additional-components) section of the website. These pre-bundled fat jars are shaded to avoid dependency conflicts with common libraries. The Flink community is not testing the YARN integration against these pre-bundled jars. + +### Running Flink on YARN behind Firewalls + +Some YARN clusters use firewalls for controlling the network traffic between the cluster and the rest of the network. +In those setups, Flink jobs can only be submitted to a YARN session from within the cluster's network (behind the firewall). +If this is not feasible for production use, Flink allows to configure a port range for its REST endpoint, used for the client-cluster communication. With this range configured, users can also submit jobs to Flink crossing the firewall. + +The configuration parameter for specifying the REST endpoint port is [rest.bind-port]({{< ref "docs/deployment/config" >}}#rest-bind-port). This configuration option accepts single ports (for example: "50010"), ranges ("50000-50025"), or a combination of both. + +### User jars & Classpath + +By default Flink will include the user jars into the system classpath when running a single job. This behavior can be controlled with the [yarn.per-job-cluster.include-user-jar]({{< ref "docs/deployment/config" >}}#yarn-per-job-cluster-include-user-jar) parameter. + +When setting this to `DISABLED` Flink will include the jar in the user classpath instead. + +The user-jars position in the classpath can be controlled by setting the parameter to one of the following: + +- `ORDER`: (default) Adds the jar to the system classpath based on the lexicographic order. +- `FIRST`: Adds the jar to the beginning of the system classpath. +- `LAST`: Adds the jar to the end of the system classpath. + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/security/_index.md b/docs/content.zh/docs/deployment/security/_index.md new file mode 100644 index 0000000000000..d098fefc680fe --- /dev/null +++ b/docs/content.zh/docs/deployment/security/_index.md @@ -0,0 +1,23 @@ +--- +title: Security +bookCollapseSection: true +weight: 8 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/deployment/security/security-kerberos.md b/docs/content.zh/docs/deployment/security/security-kerberos.md new file mode 100644 index 0000000000000..936220d3dcfea --- /dev/null +++ b/docs/content.zh/docs/deployment/security/security-kerberos.md @@ -0,0 +1,129 @@ +--- +title: Kerberos +weight: 3 +type: docs +aliases: + - /zh/deployment/security/security-kerberos.html + - /zh/ops/security-kerberos.html +--- + + +# Kerberos Authentication Setup and Configuration + +This document briefly describes how Flink security works in the context of various deployment mechanisms (Standalone, native Kubernetes, YARN, or Mesos), +filesystems, connectors, and state backends. + +## Objective +The primary goals of the Flink Kerberos security infrastructure are: + +1. 
to enable secure data access for jobs within a cluster via connectors (e.g. Kafka) +2. to authenticate to ZooKeeper (if configured to use SASL) +3. to authenticate to Hadoop components (e.g. HDFS, HBase) + +In a production deployment scenario, streaming jobs are understood to run for long periods of time (days/weeks/months) and be able to authenticate to secure +data sources throughout the life of the job. Kerberos keytabs do not expire in that timeframe, unlike a Hadoop delegation token +or ticket cache entry. + +The current implementation supports running Flink clusters (JobManager / TaskManager / jobs) with either a configured keytab credential +or with Hadoop delegation tokens. Keep in mind that all jobs share the credential configured for a given cluster. To use a different keytab +for a certain job, simply launch a separate Flink cluster with a different configuration. Numerous Flink clusters may run side-by-side in a Kubernetes, YARN +or Mesos environment. + +## How Flink Security works +In concept, a Flink program may use first- or third-party connectors (Kafka, HDFS, Cassandra, Flume, Kinesis etc.) necessitating arbitrary authentication methods (Kerberos, SSL/TLS, username/password, etc.). While satisfying the security requirements for all connectors is an ongoing effort, +Flink provides first-class support for Kerberos authentication only. The following services and connectors are supported for Kerberos authentication: + +- Kafka (0.9+) +- HDFS +- HBase +- ZooKeeper + +Note that it is possible to enable the use of Kerberos independently for each service or connector. For example, the user may enable +Hadoop security without necessitating the use of Kerberos for ZooKeeper, or vice versa. The shared element is the configuration of +Kerberos credentials, which is then explicitly used by each component. + +The internal architecture is based on security modules (implementing `org.apache.flink.runtime.security.modules.SecurityModule`) which +are installed at startup. The following sections describes each security module. + +### Hadoop Security Module +This module uses the Hadoop `UserGroupInformation` (UGI) class to establish a process-wide *login user* context. The login user is +then used for all interactions with Hadoop, including HDFS, HBase, and YARN. + +If Hadoop security is enabled (in `core-site.xml`), the login user will have whatever Kerberos credential is configured. Otherwise, +the login user conveys only the user identity of the OS account that launched the cluster. + +### JAAS Security Module +This module provides a dynamic JAAS configuration to the cluster, making available the configured Kerberos credential to ZooKeeper, +Kafka, and other such components that rely on JAAS. + +Note that the user may also provide a static JAAS configuration file using the mechanisms described in the [Java SE Documentation](http://docs.oracle.com/javase/7/docs/technotes/guides/security/jgss/tutorials/LoginConfigFile.html). Static entries override any +dynamic entries provided by this module. + +### ZooKeeper Security Module +This module configures certain process-wide ZooKeeper security-related settings, namely the ZooKeeper service name (default: `zookeeper`) +and the JAAS login context name (default: `Client`). + +## Deployment Modes +Here is some information specific to each deployment mode. + +### Standalone Mode + +Steps to run a secure Flink cluster in standalone/cluster mode: + +1. 
Add security-related configuration options to the Flink configuration file (on all cluster nodes) (see [here]({{< ref "docs/deployment/config" >}}#auth-with-external-systems)). +2. Ensure that the keytab file exists at the path indicated by `security.kerberos.login.keytab` on all cluster nodes. +3. Deploy Flink cluster as normal. + +### Native Kubernetes, YARN and Mesos Mode + +Steps to run a secure Flink cluster in native Kubernetes, YARN and Mesos mode: + +1. Add security-related configuration options to the Flink configuration file on the client (see [here]({{< ref "docs/deployment/config" >}}#auth-with-external-systems)). +2. Ensure that the keytab file exists at the path as indicated by `security.kerberos.login.keytab` on the client node. +3. Deploy Flink cluster as normal. + +In YARN, Mesos and native Kubernetes mode, the keytab is automatically copied from the client to the Flink containers. + +To enable Kerberos authentication, the Kerberos configuration file is also required. This file can be either fetched from the cluster environment or uploaded by Flink. In the latter case, you need to configure the `security.kerberos.krb5-conf.path` to indicate the path of the Kerberos configuration file and Flink will copy this file to its containers/pods. + +Note that the property `java.security.krb5.conf`, which was available in Mesos mode previously, has been deprecated. Despite it's still taking effect for backward compatibility, please be aware this property can be removed in future releases. + +For more information, see YARN security documentation. + +#### Using `kinit` (YARN only) + +In YARN mode, it is possible to deploy a secure Flink cluster without a keytab, using only the ticket cache (as managed by `kinit`). +This avoids the complexity of generating a keytab and avoids entrusting the cluster manager with it. In this scenario, the Flink CLI acquires Hadoop delegation tokens (for HDFS and for HBase). +The main drawback is that the cluster is necessarily short-lived since the generated delegation tokens will expire (typically within a week). + +Steps to run a secure Flink cluster using `kinit`: + +1. Add security-related configuration options to the Flink configuration file on the client (see [here]({{< ref "docs/deployment/config" >}}#auth-with-external-systems)). +2. Login using the `kinit` command. +3. Deploy Flink cluster as normal. + +## Further Details + +### Ticket Renewal +Each component that uses Kerberos is independently responsible for renewing the Kerberos ticket-granting-ticket (TGT). +Hadoop, ZooKeeper, and Kafka all renew the TGT automatically when provided a keytab. In the delegation token scenario, +YARN itself renews the token (up to its maximum lifespan). + +{{< top >}} diff --git a/docs/content.zh/docs/deployment/security/security-ssl.md b/docs/content.zh/docs/deployment/security/security-ssl.md new file mode 100644 index 0000000000000..744149fa2b081 --- /dev/null +++ b/docs/content.zh/docs/deployment/security/security-ssl.md @@ -0,0 +1,323 @@ +--- +title: "SSL 设置" +weight: 2 +type: docs +aliases: + - /zh/deployment/security/security-ssl.html + - /zh/ops/security-ssl.html +--- + + +# SSL 设置 + +This page provides instructions on how to enable TLS/SSL authentication and encryption for network communication with and between Flink processes. 
+**NOTE: TLS/SSL authentication is not enabled by default.** + +## Internal and External Connectivity + +When securing network connections between processes through authentication and encryption, Apache Flink differentiates between *internal* and *external* connectivity. +*Internal Connectivity* refers to all connections made between Flink processes. These connections run Flink custom protocols. Users never connect directly to internal connectivity endpoints. +*External / REST Connectivity* refers to all connections made from the outside to Flink processes. This includes the web UI and REST commands to +start and control running Flink jobs/applications, including the communication of the Flink CLI with the JobManager / Dispatcher. + +For more flexibility, security for internal and external connectivity can be enabled and configured separately. + +{{< img src="/fig/ssl_internal_external.svg" alt="Internal and External Connectivity" width=75% >}} + +### Internal Connectivity + +Internal connectivity includes: + + - Control messages: RPC between JobManager / TaskManager / Dispatcher / ResourceManager + - The data plane: The connections between TaskManagers to exchange data during shuffles, broadcasts, redistribution, etc. + - The Blob Service (distribution of libraries and other artifacts). + +All internal connections are SSL authenticated and encrypted. The connections use **mutual authentication**, meaning both server +and client side of each connection need to present the certificate to each other. The certificate acts effectively as a shared +secret when a dedicated CA is used to exclusively sign an internal cert. +The certificate for internal communication is not needed by any other party to interact with Flink, and can be simply +added to the container images, or attached to the YARN deployment. + + - The easiest way to realize this setup is by generating a dedicated public/private key pair and self-signed certificate + for the Flink deployment. The key- and truststore are identical and contain only that key pair / certificate. + An example is [shown below](#example-ssl-setup-standalone-and-kubernetes). + + - In an environment where operators are constrained to use a firm-wide internal CA (and cannot generate self-signed certificates), + the recommendation is to still have a dedicated key pair / certificate for the Flink deployment, signed by that CA. + However, the TrustStore must then also contain the CA's public certificate to accept the deployment's certificate + during the SSL handshake (requirement in JDK TrustStore implementation). + + **NOTE:** Because of that, it is critical that you specify the fingerprint of the deployment certificate + (`security.ssl.internal.cert.fingerprint`), when it is not self-signed, to pin that certificate as the only trusted + certificate and prevent the TrustStore from trusting all certificates signed by that CA. + +*Note: Because internal connections are mutually authenticated with shared certificates, Flink can skip hostname verification. +This makes container-based setups easier.* + +### External / REST Connectivity + +All external connectivity is exposed via an HTTP/REST endpoint, used for example by the web UI and the CLI: + + - Communication with the *Dispatcher* to submit jobs (session clusters) + - Communication with the *JobMaster* to inspect and modify a running job/application + +The REST endpoints can be configured to require SSL connections.
The server will, however, accept connections from any client by default, meaning the REST endpoint does not authenticate the client. + +Simple mutual authentication may be enabled by configuration if authentication of connections to the REST endpoint is required, but we recommend to deploy a "side car proxy": +Bind the REST endpoint to the loopback interface (or the pod-local interface in Kubernetes) and start a REST proxy that authenticates and forwards the requests to Flink. +Examples for proxies that Flink users have deployed are [Envoy Proxy](https://www.envoyproxy.io/) or +[NGINX with MOD_AUTH](http://nginx.org/en/docs/http/ngx_http_auth_request_module.html). + +The rationale behind delegating authentication to a proxy is that such proxies offer a wide variety of authentication options and thus better integration into existing infrastructures. + + +### Queryable State + +Connections to the queryable state endpoints is currently not authenticated or encrypted. + + +## Configuring SSL + +SSL can be enabled separately for *internal* and *external* connectivity: + + - **security.ssl.internal.enabled**: Enable SSL for all *internal* connections. + - **security.ssl.rest.enabled**: Enable SSL for *REST / external* connections. + +*Note: For backwards compatibility, the **security.ssl.enabled** option still exists and enables SSL for both internal and REST endpoints.* + +For internal connectivity, you can optionally disable security for different connection types separately. +When `security.ssl.internal.enabled` is set to `true`, you can set the following parameters to `false` to disable SSL for that particular connection type: + + - `taskmanager.data.ssl.enabled`: Data communication between TaskManagers + - `blob.service.ssl.enabled`: Transport of BLOBs from JobManager to TaskManager + - `akka.ssl.enabled`: Akka-based RPC connections between JobManager / TaskManager / ResourceManager + +### Keystores and Truststores + +The SSL configuration requires to configure a **keystore** and a **truststore**. The *keystore* contains the public certificate +(public key) and the private key, while the truststore contains the trusted certificates or the trusted authorities. Both stores +need to be set up such that the truststore trusts the keystore's certificate. + +#### Internal Connectivity + +Because internal communication is mutually authenticated between server and client side, keystore and truststore typically refer to a dedicated +certificate that acts as a shared secret. In such a setup, the certificate can use wild card hostnames or addresses. +WHen using self-signed certificates, it is even possible to use the same file as keystore and truststore. + +```yaml +security.ssl.internal.keystore: /path/to/file.keystore +security.ssl.internal.keystore-password: keystore_password +security.ssl.internal.key-password: key_password +security.ssl.internal.truststore: /path/to/file.truststore +security.ssl.internal.truststore-password: truststore_password +``` + +When using a certificate that is not self-signed, but signed by a CA, you need to use certificate pinning to allow only a +a specific certificate to be trusted when establishing the connectivity. + +```yaml +security.ssl.internal.cert.fingerprint: 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 +``` + +#### REST Endpoints (external connectivity) + +For REST endpoints, by default the keystore is used by the server endpoint, and the truststore is used by the REST clients (including the CLI client) +to accept the server's certificate. 
In the case where the REST keystore has a self-signed certificate, the truststore must trust that certificate directly. +If the REST endpoint uses a certificate that is signed through a proper certification hierarchy, the roots of that hierarchy should +be in the trust store. + +If mutual authentication is enabled, the keystore and the truststore are used by both, the server endpoint and the REST clients as with internal connectivity. + +```yaml +security.ssl.rest.keystore: /path/to/file.keystore +security.ssl.rest.keystore-password: keystore_password +security.ssl.rest.key-password: key_password +security.ssl.rest.truststore: /path/to/file.truststore +security.ssl.rest.truststore-password: truststore_password +security.ssl.rest.authentication-enabled: false +``` + +### Cipher suites + +{{< hint warning >}} +The [IETF RFC 7525](https://tools.ietf.org/html/rfc7525) recommends to use a specific set of cipher suites for strong security. +Because these cipher suites were not available on many setups out of the box, Flink's default value is set to a slightly +weaker but more compatible cipher suite. +We recommend that SSL setups update to the stronger cipher suites, if possible, by adding the below entry to the Flink configuration: + + +```yaml +security.ssl.algorithms: TLS_DHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_DHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 +``` + +If these cipher suites are not supported on your setup, you will see that Flink processes will not be able to connect to each other. + +{{< /hint >}} + +### Complete List of SSL Options + +{{< generated/security_configuration >}} + +## Creating and Deploying Keystores and Truststores + +Keys, Certificates, and the Keystores and Truststores can be generatedd using the [keytool utility](https://docs.oracle.com/javase/8/docs/technotes/tools/unix/keytool.html). +You need to have an appropriate Java Keystore and Truststore accessible from each node in the Flink cluster. + + - For standalone setups, this means copying the files to each node, or adding them to a shared mounted directory. + - For container based setups, add the keystore and truststore files to the container images. + - For Yarn/Mesos setups, the cluster deployment phase can automatically distribute the keystore and truststore files. + +For the externally facing REST endpoint, the common name or subject alternative names in the certificate should match the node's hostname and IP address. + + +## Example SSL Setup Standalone and Kubernetes + +**Internal Connectivity** + +Execute the following keytool commands to create a key pair in a keystore: + +```bash +$ keytool -genkeypair \ + -alias flink.internal \ + -keystore internal.keystore \ + -dname "CN=flink.internal" \ + -storepass internal_store_password \ + -keyalg RSA \ + -keysize 4096 \ + -storetype PKCS12 +``` + +The single key/certificate in the keystore is used the same way by the server and client endpoints (mutual authentication). +The key pair acts as the shared secret for internal security, and we can directly use it as keystore and truststore. 
+ +```yaml +security.ssl.internal.enabled: true +security.ssl.internal.keystore: /path/to/flink/conf/internal.keystore +security.ssl.internal.truststore: /path/to/flink/conf/internal.keystore +security.ssl.internal.keystore-password: internal_store_password +security.ssl.internal.truststore-password: internal_store_password +security.ssl.internal.key-password: internal_store_password +``` + +**REST Endpoint** + +The REST endpoint may receive connections from external processes, including tools that are not part of Flink (for example curl request to the REST API). +Setting up a proper certificate that is signed though a CA hierarchy may make sense for the REST endpoint. + +However, as mentioned above, the REST endpoint does not authenticate clients and thus typically needs to be secured via a proxy anyways. + +**REST Endpoint (simple self signed certificate)** + +This example shows how to create a simple keystore / truststore pair. The truststore does not contain the primary key and can +be shared with other applications. In this example, *myhost.company.org / ip:10.0.2.15* is the node (or service) for the JobManager. + +```bash +$ keytool -genkeypair -alias flink.rest -keystore rest.keystore -dname "CN=myhost.company.org" -ext "SAN=dns:myhost.company.org,ip:10.0.2.15" -storepass rest_keystore_password -keyalg RSA -keysize 4096 -storetype PKCS12 + +$ keytool -exportcert -keystore rest.keystore -alias flink.rest -storepass rest_keystore_password -file flink.cer + +$ keytool -importcert -keystore rest.truststore -alias flink.rest -storepass rest_truststore_password -file flink.cer -noprompt +``` + +```yaml +security.ssl.rest.enabled: true +security.ssl.rest.keystore: /path/to/flink/conf/rest.keystore +security.ssl.rest.truststore: /path/to/flink/conf/rest.truststore +security.ssl.rest.keystore-password: rest_keystore_password +security.ssl.rest.truststore-password: rest_truststore_password +security.ssl.rest.key-password: rest_keystore_password +``` + +**REST Endpoint (with a self signed CA)** + +Execute the following keytool commands to create a truststore with a self signed CA. + +```bash +$ keytool -genkeypair -alias ca -keystore ca.keystore -dname "CN=Sample CA" -storepass ca_keystore_password -keyalg RSA -keysize 4096 -ext "bc=ca:true" -storetype PKCS12 + +$ keytool -exportcert -keystore ca.keystore -alias ca -storepass ca_keystore_password -file ca.cer + +$ keytool -importcert -keystore ca.truststore -alias ca -storepass ca_truststore_password -file ca.cer -noprompt +``` + +Now create a keystore for the REST endpoint with a certificate signed by the above CA. +Let *flink.company.org / ip:10.0.2.15* be the hostname of the JobManager. 
+ +```bash +$ keytool -genkeypair -alias flink.rest -keystore rest.signed.keystore -dname "CN=flink.company.org" -ext "SAN=dns:flink.company.org" -storepass rest_keystore_password -keyalg RSA -keysize 4096 -storetype PKCS12 + +$ keytool -certreq -alias flink.rest -keystore rest.signed.keystore -storepass rest_keystore_password -file rest.csr + +$ keytool -gencert -alias ca -keystore ca.keystore -storepass ca_keystore_password -ext "SAN=dns:flink.company.org,ip:10.0.2.15" -infile rest.csr -outfile rest.cer + +$ keytool -importcert -keystore rest.signed.keystore -storepass rest_keystore_password -file ca.cer -alias ca -noprompt + +$ keytool -importcert -keystore rest.signed.keystore -storepass rest_keystore_password -file rest.cer -alias flink.rest -noprompt +``` + +Now add the following configuration to your `flink-conf.yaml`: + +```yaml +security.ssl.rest.enabled: true +security.ssl.rest.keystore: /path/to/flink/conf/rest.signed.keystore +security.ssl.rest.truststore: /path/to/flink/conf/ca.truststore +security.ssl.rest.keystore-password: rest_keystore_password +security.ssl.rest.key-password: rest_keystore_password +security.ssl.rest.truststore-password: ca_truststore_password +``` + +**Tips to query REST Endpoint with curl utility** + +You can convert the keystore into the `PEM` format using `openssl`: + +```bash +$ openssl pkcs12 -passin pass:rest_keystore_password -in rest.keystore -out rest.pem -nodes +``` + +Then you can query REST Endpoint with `curl`: + +```bash +$ curl --cacert rest.pem flink_url +``` + +If mutual SSL is enabled: + +```bash +$ curl --cacert rest.pem --cert rest.pem flink_url +``` + +## Tips for YARN / Mesos Deployment + +For YARN and Mesos, you can use the tools of Yarn and Mesos to help: + + - Configuring security for internal communication is exactly the same as in the example above. + + - To secure the REST endpoint, you need to issue the REST endpoint's certificate such that it is valid for all hosts + that the JobManager may get deployed to. This can be done with a wild card DNS name, or by adding multiple DNS names. + + - The easiest way to deploy keystores and truststore is by YARN client's *ship files* option (`-yt`). + Copy the keystore and truststore files into a local directory (say `deploy-keys/`) and start the YARN session as + follows: `flink run -m yarn-cluster -yt deploy-keys/ flinkapp.jar` + + - When deployed using YARN, Flink's web dashboard is accessible through YARN proxy's Tracking URL. + To ensure that the YARN proxy is able to access Flink's HTTPS URL, you need to configure YARN proxy to accept Flink's SSL certificates. + For that, add the custom CA certificate into Java's default truststore on the YARN Proxy node. 
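+
+If you use certificate pinning (`security.ssl.internal.cert.fingerprint`), you also need the fingerprint of the deployment certificate in the colon-separated hex form shown earlier on this page. `keytool -list -v` prints certificate fingerprints; alternatively, the following plain-Java sketch (not part of Flink, shown only as an illustration; the keystore path, password, and alias are the ones assumed in the standalone example above) computes a SHA-1 fingerprint from a PKCS12 keystore:
+
+```java
+import java.io.FileInputStream;
+import java.security.KeyStore;
+import java.security.MessageDigest;
+import java.security.cert.Certificate;
+
+public class PrintCertFingerprint {
+    public static void main(String[] args) throws Exception {
+        // load the PKCS12 keystore created with keytool (path, password, and alias are assumptions)
+        KeyStore keyStore = KeyStore.getInstance("PKCS12");
+        try (FileInputStream in = new FileInputStream("internal.keystore")) {
+            keyStore.load(in, "internal_store_password".toCharArray());
+        }
+
+        // hash the DER-encoded certificate and print it as colon-separated hex
+        Certificate cert = keyStore.getCertificate("flink.internal");
+        byte[] digest = MessageDigest.getInstance("SHA-1").digest(cert.getEncoded());
+        StringBuilder fingerprint = new StringBuilder();
+        for (byte b : digest) {
+            if (fingerprint.length() > 0) {
+                fingerprint.append(':');
+            }
+            fingerprint.append(String.format("%02X", b & 0xFF));
+        }
+        System.out.println(fingerprint);
+    }
+}
+```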
+ +{{< top >}} diff --git a/docs/content.zh/docs/dev/_index.md b/docs/content.zh/docs/dev/_index.md new file mode 100644 index 0000000000000..db2c9ee264ce8 --- /dev/null +++ b/docs/content.zh/docs/dev/_index.md @@ -0,0 +1,26 @@ +--- +title: 应用开发 +icon: +bold: true +sectionBreak: true +bookCollapseSection: true +weight: 4 +--- + diff --git a/docs/content.zh/docs/dev/dataset/_index.md b/docs/content.zh/docs/dev/dataset/_index.md new file mode 100644 index 0000000000000..44d728e62aa39 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/_index.md @@ -0,0 +1,23 @@ +--- +title: DataSet API +bookCollapseSection: true +weight: 2 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/dataset/cluster_execution.md b/docs/content.zh/docs/dev/dataset/cluster_execution.md new file mode 100644 index 0000000000000..bcd56576c8278 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/cluster_execution.md @@ -0,0 +1,88 @@ +--- +title: "集群执行" +weight: 13 +type: docs +aliases: + - /zh/dev/cluster_execution.html +--- + + +# 集群执行 + + + +Flink 程序可以分布式运行在多机器集群上。有两种方式可以将程序提交到集群上执行: + + + +## 命令行界面(Interface) + +命令行界面使你可以将打包的程序(JARs)提交到集群(或单机设置)。 + +有关详细信息,请参阅[命令行界面]({{< ref "docs/deployment/cli" >}})文档。 + + + +## 远程环境(Remote Environment) + +远程环境使你可以直接在集群上执行 Flink Java 程序。远程环境指向你要执行程序的集群。 + + + +### Maven Dependency + +如果将程序作为 Maven 项目开发,则必须添加 `flink-clients` 模块的依赖: + +```xml + + org.apache.flink + flink-clients{{< scala_version >}} + {{< version >}} + +``` + + + +### 示例 + +下面演示了 `RemoteEnvironment` 的用法: + +```java +public static void main(String[] args) throws Exception { + ExecutionEnvironment env = ExecutionEnvironment + .createRemoteEnvironment("flink-jobmanager", 8081, "/home/user/udfs.jar"); + + DataSet data = env.readTextFile("hdfs://path/to/file"); + + data + .filter(new FilterFunction() { + public boolean filter(String value) { + return value.startsWith("http://"); + } + }) + .writeAsText("hdfs://path/to/result"); + + env.execute(); +} +``` + +请注意,该程序包含用户自定义代码,因此需要一个带有附加代码类的 JAR 文件。远程环境的构造函数使用 JAR 文件的路径进行构造。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/dataset/examples.md b/docs/content.zh/docs/dev/dataset/examples.md new file mode 100644 index 0000000000000..b2a572e89c9f7 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/examples.md @@ -0,0 +1,422 @@ +--- +title: Batch 示例 +weight: 21 +type: docs +aliases: + - /zh/dev/batch/examples.html +--- + + +# Batch 示例 + +以下示例展示了 Flink 从简单的WordCount到图算法的应用。示例代码展示了 [Flink's DataSet API]({{< ref "docs/dev/dataset/overview" >}}) 的使用。 + +完整的源代码可以在 Flink 源代码库的 {{< gh_link file="flink-examples/flink-examples-batch" file="flink-examples-batch" >}} 模块找到。 + + + + +## 运行一个示例 + +在开始运行一个示例前,我们假设你已经有了 Flink 的运行示例。导航栏中的“快速开始(Quickstart)”和“安装(Setup)” 标签页提供了启动 Flink 的不同方法。 + +最简单的方法就是执行 `./bin/start-cluster.sh`,从而启动一个只有一个 JobManager 和 TaskManager 的本地 Flink 集群。 + +每个 Flink 的 binary release 都会包含一个`examples`(示例)目录,其中可以找到这个页面上每个示例的 jar 包文件。 + +可以通过执行以下命令来运行WordCount 示例: + +```bash +./bin/flink run ./examples/batch/WordCount.jar +``` + +其他的示例也可以通过类似的方式执行。 + +注意很多示例在不传递执行参数的情况下都会使用内置数据。如果需要利用 WordCount 程序计算真实数据,你需要传递存储数据的文件路径。 + +```bash +./bin/flink run ./examples/batch/WordCount.jar --input /path/to/some/text/data --output /path/to/result +``` + +注意非本地文件系统需要一个对应前缀,例如 `hdfs://`。 + + +## Word Count +WordCount 是大数据系统中的 “Hello World”。他可以计算一个文本集合中不同单词的出现频次。这个算法分两步进行: 第一步,把所有文本切割成单独的单词。第二步,把单词分组并分别统计。 + +{{< tabs "0eb78629-fd10-4bd0-ac00-526363473635" >}} +{{< tab "Java" >}} + +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + 
+DataSet text = env.readTextFile("/path/to/file"); + +DataSet> counts = + // 把每一行文本切割成二元组,每个二元组为: (word,1) + text.flatMap(new Tokenizer()) + // 根据二元组的第“0”位分组,然后对第“1”位求和 + .groupBy(0) + .sum(1); + +counts.writeAsCsv(outputPath, "\n", " "); + +// 自定义函数 +public static class Tokenizer implements FlatMapFunction> { + + @Override + public void flatMap(String value, Collector> out) { + // 统一大小写并把每一行切割为单词 + String[] tokens = value.toLowerCase().split("\\W+"); + + // 消费二元组 + for (String token : tokens) { + if (token.length() > 0) { + out.collect(new Tuple2(token, 1)); + } + } + } +} +``` + +{{< gh_link file="/flink-examples/flink-examples-batch/src/main/java/org/apache/flink/examples/java/wordcount/WordCount.java" name="WordCount 示例" >}}增加如下执行参数: `--input --output `即可实现上述算法。 任何文本文件都可作为测试数据使用。 + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val env = ExecutionEnvironment.getExecutionEnvironment + +// 获取输入数据 +val text = env.readTextFile("/path/to/file") + +val counts = text.flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } + .map { (_, 1) } + .groupBy(0) + .sum(1) + +counts.writeAsCsv(outputPath, "\n", " ") +``` + +{{< gh_link file="/flink-examples/flink-examples-batch/src/main/scala/org/apache/flink/examples/scala/wordcount/WordCount.scala" name="WordCount 示例" >}}增加如下执行参数: `--input --output `即可实现上述算法。 任何文本文件都可作为测试数据使用。 + + +{{< /tab >}} +{{< /tabs >}} + +## Page Rank + +PageRank算法可以计算互联网中一个网页的重要性,这个重要性通过由一个页面指向其他页面的链接定义。PageRank 算法是一个重复执行相同运算的迭代图算法。在每一次迭代中,每个页面把他当前的 rank 值分发给他所有的邻居节点,并且通过他收到邻居节点的 rank 值更新自身的 rank 值。PageRank 算法因 Google 搜索引擎的使用而流行,它根据网页的重要性来对搜索结果进行排名。 + +在这个简单的示例中,PageRank 算法由一个[批量迭代](iterations.html)和一些固定次数的迭代实现。 + +{{< tabs "cb5cd992-34f8-4dea-bb6f-c5ec27a97a00" >}} +{{< tab "Java" >}} + +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +// 通过解析一个CSV文件来获取每个页面原始的rank值 +DataSet> pagesWithRanks = env.readCsvFile(pagesInputPath) + .types(Long.class, Double.class) + +// 链接被编码为邻接表: (page-id, Array(neighbor-ids)) +DataSet> pageLinkLists = getLinksDataSet(env); + +// 设置迭代数据集合 +IterativeDataSet> iteration = pagesWithRanks.iterate(maxIterations); + +DataSet> newRanks = iteration + // 为每个页面匹配其对应的出边,并发送rank值 + .join(pageLinkLists).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch()) + // 收集并计算新的rank值 + .groupBy(0).sum(1) + // 施加阻尼系数 + .map(new Dampener(DAMPENING_FACTOR, numPages)); + +DataSet> finalPageRanks = iteration.closeWith( + newRanks, + newRanks.join(iteration).where(0).equalTo(0) + // 结束条件 + .filter(new EpsilonFilter())); + +finalPageRanks.writeAsCsv(outputPath, "\n", " "); + +// 自定义函数 + +public static final class JoinVertexWithEdgesMatch + implements FlatJoinFunction, Tuple2, + Tuple2> { + + @Override + public void join( page, Tuple2 adj, + Collector> out) { + Long[] neighbors = adj.f1; + double rank = page.f1; + double rankToDistribute = rank / ((double) neigbors.length); + + for (int i = 0; i < neighbors.length; i++) { + out.collect(new Tuple2(neighbors[i], rankToDistribute)); + } + } +} + +public static final class Dampener implements MapFunction, Tuple2> { + private final double dampening, randomJump; + + public Dampener(double dampening, double numVertices) { + this.dampening = dampening; + this.randomJump = (1 - dampening) / numVertices; + } + + @Override + public Tuple2 map(Tuple2 value) { + value.f1 = (value.f1 * dampening) + randomJump; + return value; + } +} + +public static final class EpsilonFilter + implements FilterFunction, Tuple2>> { + + @Override + public boolean filter(Tuple2, Tuple2> value) { + return 
Math.abs(value.f0.f1 - value.f1.f1) > EPSILON; + } +} +``` + +{{< gh_link file="/flink-examples/flink-examples-batch/src/main/java/org/apache/flink/examples/java/graph/PageRank.java" name="PageRank代码" >}}实现了以上示例。 +他需要以下参数来运行: `--pages --links --output --numPages --iterations `。 + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// 自定义类型 +case class Link(sourceId: Long, targetId: Long) +case class Page(pageId: Long, rank: Double) +case class AdjacencyList(sourceId: Long, targetIds: Array[Long]) + +// 初始化执行环境 +val env = ExecutionEnvironment.getExecutionEnvironment + +// 通过解析一个CSV文件来获取每个页面原始的rank值 +val pages = env.readCsvFile[Page](pagesInputPath) + +// 链接被编码为邻接表: (page-id, Array(neighbor-ids)) +val links = env.readCsvFile[Link](linksInputPath) + +// 将原始rank值赋给每个页面 +val pagesWithRanks = pages.map(p => Page(p, 1.0 / numPages)) + +// 通过输入链接建立邻接表 +val adjacencyLists = links + // initialize lists + .map(e => AdjacencyList(e.sourceId, Array(e.targetId))) + // concatenate lists + .groupBy("sourceId").reduce { + (l1, l2) => AdjacencyList(l1.sourceId, l1.targetIds ++ l2.targetIds) + } + +// 开始迭代 +val finalRanks = pagesWithRanks.iterateWithTermination(maxIterations) { + currentRanks => + val newRanks = currentRanks + // 发送rank值给目标页面 + .join(adjacencyLists).where("pageId").equalTo("sourceId") { + (page, adjacent, out: Collector[Page]) => + for (targetId <- adjacent.targetIds) { + out.collect(Page(targetId, page.rank / adjacent.targetIds.length)) + } + } + // 收集rank值并求和更新 + .groupBy("pageId").aggregate(SUM, "rank") + // 施加阻尼系数 + .map { p => + Page(p.pageId, (p.rank * DAMPENING_FACTOR) + ((1 - DAMPENING_FACTOR) / numPages)) + } + + // 如果没有明显的rank更新则停止迭代 + val termination = currentRanks.join(newRanks).where("pageId").equalTo("pageId") { + (current, next, out: Collector[Int]) => + // check for significant update + if (math.abs(current.rank - next.rank) > EPSILON) out.collect(1) + } + + (newRanks, termination) +} + +val result = finalRanks + +// 输出结果 +result.writeAsCsv(outputPath, "\n", " ") +``` + +{{< gh_link file="/flink-examples/flink-examples-batch/src/main/scala/org/apache/flink/examples/scala/graph/PageRankBasic.scala" name="PageRank代码" >}} 实现了以上示例。 +他需要以下参数来执行: `--pages --links --output --numPages --iterations `。 +{{< /tab >}} +{{< /tabs >}} + +输入文件是纯文本文件,并且必须存为以下格式: +- 页面被表示为一个长整型(long)ID并由换行符分割 + * 例如 `"1\n2\n12\n42\n63\n"` 给出了ID为 1, 2, 12, 42和63的五个页面。 +- 链接由空格分割的两个页面ID来表示。每个链接由换行符来分割。 + * 例如 `"1 2\n2 12\n1 12\n42 63\n"` 表示了以下四个有向链接: (1)->(2), (2)->(12), (1)->(12) 和 (42)->(63). 
+ +这个简单的实现版本要求每个页面至少有一个入链接和一个出链接(一个页面可以指向自己)。 + +## Connected Components(连通组件算法) + +Connected Components 通过给相连的顶点相同的组件ID来标示出一个较大的图中的连通部分。类似PageRank,Connected Components 也是一个迭代算法。在每一次迭代中,每个顶点把他现在的组件ID传播给所有邻居顶点。当一个顶点接收到的组件ID小于他自身的组件ID时,这个顶点也更新其组件ID为这个新组件ID。 + + +这个代码实现使用了[增量迭代](iterations.html): 没有改变其组件 ID 的顶点不会参与下一轮迭代。这种方法会带来更好的性能,因为后面的迭代可以只处理少量的需要计算的顶点。 + +{{< tabs "33947399-ae91-4965-acd9-44b267d3d53a" >}} +{{< tab "Java" >}} + +```java +// 读取顶点和边的数据 +DataSet vertices = getVertexDataSet(env); +DataSet> edges = getEdgeDataSet(env).flatMap(new UndirectEdge()); + +// 分配初始的组件ID(等于每个顶点的ID) +DataSet> verticesWithInitialId = vertices.map(new DuplicateValue()); + +// 开始一个增量迭代 +DeltaIteration, Tuple2> iteration = + verticesWithInitialId.iterateDelta(verticesWithInitialId, maxIterations, 0); + +// 应用迭代计算逻辑: +DataSet> changes = iteration.getWorkset() + // 链接相应的边 + .join(edges).where(0).equalTo(0).with(new NeighborWithComponentIDJoin()) + // 选出最小的邻居组件ID + .groupBy(0).aggregate(Aggregations.MIN, 1) + // 如果邻居的组件ID更小则进行更新 + .join(iteration.getSolutionSet()).where(0).equalTo(0) + .flatMap(new ComponentIdFilter()); + +// 停止增量迭代 (增量和新的数据集是相同的) +DataSet> result = iteration.closeWith(changes, changes); + +// 输出结果 +result.writeAsCsv(outputPath, "\n", " "); + +// 自定义函数 + +public static final class DuplicateValue implements MapFunction> { + + @Override + public Tuple2 map(T vertex) { + return new Tuple2(vertex, vertex); + } +} + +public static final class UndirectEdge + implements FlatMapFunction, Tuple2> { + Tuple2 invertedEdge = new Tuple2(); + + @Override + public void flatMap(Tuple2 edge, Collector> out) { + invertedEdge.f0 = edge.f1; + invertedEdge.f1 = edge.f0; + out.collect(edge); + out.collect(invertedEdge); + } +} + +public static final class NeighborWithComponentIDJoin + implements JoinFunction, Tuple2, Tuple2> { + + @Override + public Tuple2 join(Tuple2 vertexWithComponent, Tuple2 edge) { + return new Tuple2(edge.f1, vertexWithComponent.f1); + } +} + +public static final class ComponentIdFilter + implements FlatMapFunction, Tuple2>, + Tuple2> { + + @Override + public void flatMap(Tuple2, Tuple2> value, + Collector> out) { + if (value.f0.f1 < value.f1.f1) { + out.collect(value.f0); + } + } +} +``` + +{{< gh_link file="/flink-examples/flink-examples-batch/src/main/java/org/apache/flink/examples/java/graph/ConnectedComponents.java" file="ConnectedComponents代码" >}} 实现了以上示例。他需要以下参数来运行: `--vertices --edges --output --iterations `。 + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// 初始化运行环境 +val env = ExecutionEnvironment.getExecutionEnvironment + +// 读顶点和边的数据 +// 分配初始的组件ID(等于每个顶点的ID) +val vertices = getVerticesDataSet(env).map { id => (id, id) } + +// 通过发出每条输入边自身和他的反向边得到无向边 +val edges = getEdgesDataSet(env).flatMap { edge => Seq(edge, (edge._2, edge._1)) } + +// 开始增量迭代 +val verticesWithComponents = vertices.iterateDelta(vertices, maxIterations, Array(0)) { + (s, ws) => + + // 开始迭代逻辑: 链接相应的边 + val allNeighbors = ws.join(edges).where(0).equalTo(0) { (vertex, edge) => + (edge._2, vertex._2) + } + + // 选择组件ID最小的邻居节点 + val minNeighbors = allNeighbors.groupBy(0).min(1) + + // 如果邻居的ID更小则更新 + val updatedComponents = minNeighbors.join(s).where(0).equalTo(0) { + (newVertex, oldVertex, out: Collector[(Long, Long)]) => + if (newVertex._2 < oldVertex._2) out.collect(newVertex) + } + + // 增量和新的数据集是一致的 + (updatedComponents, updatedComponents) +} + +verticesWithComponents.writeAsCsv(outputPath, "\n", " ") + +``` + +{{< gh_link 
file="/flink-examples/flink-examples-batch/src/main/scala/org/apache/flink/examples/scala/graph/ConnectedComponents.scala" name="ConnectedComponents代码" >}} 实现了以上示例。他需要以下参数来运行: `--vertices --edges --output --iterations `。 +{{< /tab >}} +{{< /tabs >}} + +输入文件是纯文本文件并且必须被存储为如下格式: +- 顶点被表示为 ID,并且由换行符分隔。 + * 例如 `"1\n2\n12\n42\n63\n"` 表示 (1), (2), (12), (42) 和 (63)五个顶点。 +- 边被表示为空格分隔的顶点对。边由换行符分隔: + * 例如 `"1 2\n2 12\n1 12\n42 63\n"` 表示四条无向边: (1)-(2), (2)-(12), (1)-(12), and (42)-(63)。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/dataset/hadoop_compatibility.md b/docs/content.zh/docs/dev/dataset/hadoop_compatibility.md new file mode 100644 index 0000000000000..ee6ad86d0ef38 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/hadoop_compatibility.md @@ -0,0 +1,261 @@ +--- +title: "Hadoop 兼容" +is_beta: true +weight: 8 +type: docs +aliases: + - /zh/dev/batch/hadoop_compatibility.html +--- + + +# Hadoop 兼容 + +Flink is compatible with Apache Hadoop MapReduce interfaces and therefore allows +reusing code that was implemented for Hadoop MapReduce. + +You can: + +- use Hadoop's `Writable` [data types]({{< ref "docs/dev/serialization/types_serialization" >}}#supported-data-types) in Flink programs. +- use any Hadoop `InputFormat` as a [DataSource](index.html#data-sources). +- use any Hadoop `OutputFormat` as a [DataSink](index.html#data-sinks). +- use a Hadoop `Mapper` as [FlatMapFunction](dataset_transformations.html#flatmap). +- use a Hadoop `Reducer` as [GroupReduceFunction](dataset_transformations.html#groupreduce-on-grouped-dataset). + +This document shows how to use existing Hadoop MapReduce code with Flink. Please refer to the +[Connecting to other systems]({{< ref "docs/deployment/filesystems/overview" >}}#hadoop-file-system-hdfs-and-its-other-implementations) guide for reading from Hadoop supported file systems. + + + +### Project Configuration + +Support for Hadoop input/output formats is part of the `flink-java` and +`flink-scala` Maven modules that are always required when writing Flink jobs. +The code is located in `org.apache.flink.api.java.hadoop` and +`org.apache.flink.api.scala.hadoop` in an additional sub-package for the +`mapred` and `mapreduce` API. + +Support for Hadoop Mappers and Reducers is contained in the `flink-hadoop-compatibility` +Maven module. +This code resides in the `org.apache.flink.hadoopcompatibility` +package. + +Add the following dependency to your `pom.xml` if you want to reuse Mappers +and Reducers. + +```xml + + org.apache.flink + flink-hadoop-compatibility{{< scala_version >}} + {{< version >}} + +``` + +If you want to run your Flink application locally (e.g. from your IDE), you also need to add +a `hadoop-client` dependency such as: + +```xml + + org.apache.hadoop + hadoop-client + 2.8.3 + provided + +``` + +### Using Hadoop InputFormats + +To use Hadoop `InputFormats` with Flink the format must first be wrapped +using either `readHadoopFile` or `createHadoopInput` of the +`HadoopInputs` utility class. +The former is used for input formats derived +from `FileInputFormat` while the latter has to be used for general purpose +input formats. +The resulting `InputFormat` can be used to create a data source by using +`ExecutionEnvironmen#createInput`. + +The resulting `DataSet` contains 2-tuples where the first field +is the key and the second field is the value retrieved from the Hadoop +InputFormat. + +The following example shows how to use Hadoop's `TextInputFormat`. 
+ +{{< tabs "dcdf7b32-ac7a-4349-9bee-07060970ab52" >}} +{{< tab "Java" >}} + +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +DataSet> input = + env.createInput(HadoopInputs.readHadoopFile(new TextInputFormat(), + LongWritable.class, Text.class, textPath)); + +// Do something with the data. +[...] +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val env = ExecutionEnvironment.getExecutionEnvironment + +val input: DataSet[(LongWritable, Text)] = + env.createInput(HadoopInputs.readHadoopFile( + new TextInputFormat, classOf[LongWritable], classOf[Text], textPath)) + +// Do something with the data. +[...] +``` + +{{< /tab >}} +{{< /tabs >}} + +### Using Hadoop OutputFormats + +Flink provides a compatibility wrapper for Hadoop `OutputFormats`. Any class +that implements `org.apache.hadoop.mapred.OutputFormat` or extends +`org.apache.hadoop.mapreduce.OutputFormat` is supported. +The OutputFormat wrapper expects its input data to be a DataSet containing +2-tuples of key and value. These are to be processed by the Hadoop OutputFormat. + +The following example shows how to use Hadoop's `TextOutputFormat`. + +{{< tabs "db1f6300-4cfd-412a-8c3b-e82c34f9d308" >}} +{{< tab "Java" >}} + +```java +// Obtain the result we want to emit +DataSet> hadoopResult = [...] + +// Set up the Hadoop TextOutputFormat. +HadoopOutputFormat hadoopOF = + // create the Flink wrapper. + new HadoopOutputFormat( + // set the Hadoop OutputFormat and specify the job. + new TextOutputFormat(), job + ); +hadoopOF.getConfiguration().set("mapreduce.output.textoutputformat.separator", " "); +TextOutputFormat.setOutputPath(job, new Path(outputPath)); + +// Emit data using the Hadoop TextOutputFormat. +hadoopResult.output(hadoopOF); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// Obtain your result to emit. +val hadoopResult: DataSet[(Text, IntWritable)] = [...] + +val hadoopOF = new HadoopOutputFormat[Text,IntWritable]( + new TextOutputFormat[Text, IntWritable], + new JobConf) + +hadoopOF.getJobConf.set("mapred.textoutputformat.separator", " ") +FileOutputFormat.setOutputPath(hadoopOF.getJobConf, new Path(resultPath)) + +hadoopResult.output(hadoopOF) + + +``` + +{{< /tab >}} +{{< /tabs >}} + +### Using Hadoop Mappers and Reducers + +Hadoop Mappers are semantically equivalent to Flink's [FlatMapFunctions](dataset_transformations.html#flatmap) and Hadoop Reducers are equivalent to Flink's [GroupReduceFunctions](dataset_transformations.html#groupreduce-on-grouped-dataset). Flink provides wrappers for implementations of Hadoop MapReduce's `Mapper` and `Reducer` interfaces, i.e., you can reuse your Hadoop Mappers and Reducers in regular Flink programs. At the moment, only the Mapper and Reduce interfaces of Hadoop's mapred API (`org.apache.hadoop.mapred`) are supported. + +The wrappers take a `DataSet>` as input and produce a `DataSet>` as output where `KEYIN` and `KEYOUT` are the keys and `VALUEIN` and `VALUEOUT` are the values of the Hadoop key-value pairs that are processed by the Hadoop functions. For Reducers, Flink offers a wrapper for a GroupReduceFunction with (`HadoopReduceCombineFunction`) and without a Combiner (`HadoopReduceFunction`). The wrappers accept an optional `JobConf` object to configure the Hadoop Mapper or Reducer. 
+ +Flink's function wrappers are + +- `org.apache.flink.hadoopcompatibility.mapred.HadoopMapFunction`, +- `org.apache.flink.hadoopcompatibility.mapred.HadoopReduceFunction`, and +- `org.apache.flink.hadoopcompatibility.mapred.HadoopReduceCombineFunction`. + +and can be used as regular Flink [FlatMapFunctions](dataset_transformations.html#flatmap) or [GroupReduceFunctions](dataset_transformations.html#groupreduce-on-grouped-dataset). + +The following example shows how to use Hadoop `Mapper` and `Reducer` functions. + +```java +// Obtain data to process somehow. +DataSet> text = [...] + +DataSet> result = text + // use Hadoop Mapper (Tokenizer) as MapFunction + .flatMap(new HadoopMapFunction( + new Tokenizer() + )) + .groupBy(0) + // use Hadoop Reducer (Counter) as Reduce- and CombineFunction + .reduceGroup(new HadoopReduceCombineFunction( + new Counter(), new Counter() + )); +``` + +**Please note:** The Reducer wrapper works on groups as defined by Flink's [groupBy()](dataset_transformations.html#transformations-on-grouped-dataset) operation. It does not consider any custom partitioners, sort or grouping comparators you might have set in the `JobConf`. + +### Complete Hadoop WordCount Example + +The following example shows a complete WordCount implementation using Hadoop data types, Input- and OutputFormats, and Mapper and Reducer implementations. + +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +// Set up the Hadoop TextInputFormat. +Job job = Job.getInstance(); +HadoopInputFormat hadoopIF = + new HadoopInputFormat( + new TextInputFormat(), LongWritable.class, Text.class, job + ); +TextInputFormat.addInputPath(job, new Path(inputPath)); + +// Read data using the Hadoop TextInputFormat. +DataSet> text = env.createInput(hadoopIF); + +DataSet> result = text + // use Hadoop Mapper (Tokenizer) as MapFunction + .flatMap(new HadoopMapFunction( + new Tokenizer() + )) + .groupBy(0) + // use Hadoop Reducer (Counter) as Reduce- and CombineFunction + .reduceGroup(new HadoopReduceCombineFunction( + new Counter(), new Counter() + )); + +// Set up the Hadoop TextOutputFormat. +HadoopOutputFormat hadoopOF = + new HadoopOutputFormat( + new TextOutputFormat(), job + ); +hadoopOF.getConfiguration().set("mapreduce.output.textoutputformat.separator", " "); +TextOutputFormat.setOutputPath(job, new Path(outputPath)); + +// Emit data using the Hadoop TextOutputFormat. +result.output(hadoopOF); + +// Execute Program +env.execute("Hadoop WordCount"); +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/dataset/iterations.md b/docs/content.zh/docs/dev/dataset/iterations.md new file mode 100644 index 0000000000000..aa56a0b78d6c3 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/iterations.md @@ -0,0 +1,217 @@ +--- +title: "迭代" +weight: 3 +type: docs +aliases: + - /zh/dev/batch/iterations.html +--- + + +# 迭代 + +Iterative algorithms occur in many domains of data analysis, such as *machine learning* or *graph analysis*. Such algorithms are crucial in order to realize the promise of Big Data to extract meaningful information out of your data. With increasing interest to run these kinds of algorithms on very large data sets, there is a need to execute iterations in a massively parallel fashion. + +Flink programs implement iterative algorithms by defining a **step function** and embedding it into a special iteration operator. There are two variants of this operator: **Iterate** and **Delta Iterate**. 
Both operators repeatedly invoke the step function on the current iteration state until a certain termination condition is reached. + +Here, we provide background on both operator variants and outline their usage. The [programming guide](index.html) explains how to implement the operators in both Scala and Java. We also support both **vertex-centric and gather-sum-apply iterations** through Flink's graph processing API, [Gelly]({{< ref "docs/libs/gelly/overview" >}}). + +The following table provides an overview of both operators: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+|                      | Iterate                                                                | Delta Iterate                                                                           |
+|----------------------|------------------------------------------------------------------------|-----------------------------------------------------------------------------------------|
+| **Iteration Input**  | Partial Solution                                                       | Workset and Solution Set                                                                |
+| **Step Function**    | Arbitrary Data Flows                                                   | Arbitrary Data Flows                                                                    |
+| **State Update**     | Next partial solution                                                  | Next workset, changes to solution set                                                   |
+| **Iteration Result** | Last partial solution                                                  | Solution set state after last iteration                                                 |
+| **Termination**      | Maximum number of iterations (default), custom aggregator convergence | Maximum number of iterations or empty workset (default), custom aggregator convergence |
    + + + + +Iterate Operator +---------------- + +The **iterate operator** covers the *simple form of iterations*: in each iteration, the **step function** consumes the **entire input** (the *result of the previous iteration*, or the *initial data set*), and computes the **next version of the partial solution** (e.g. `map`, `reduce`, `join`, etc.). + +

+*(Figure: Iterate Operator)*

    + + 1. **Iteration Input**: Initial input for the *first iteration* from a *data source* or *previous operators*. + 2. **Step Function**: The step function will be executed in each iteration. It is an arbitrary data flow consisting of operators like `map`, `reduce`, `join`, etc. and depends on your specific task at hand. + 3. **Next Partial Solution**: In each iteration, the output of the step function will be fed back into the *next iteration*. + 4. **Iteration Result**: Output of the *last iteration* is written to a *data sink* or used as input to the *following operators*. + +There are multiple options to specify **termination conditions** for an iteration: + + - **Maximum number of iterations**: Without any further conditions, the iteration will be executed this many times. + - **Custom aggregator convergence**: Iterations allow to specify *custom aggregators* and *convergence criteria* like sum aggregate the number of emitted records (aggregator) and terminate if this number is zero (convergence criterion). + +You can also think about the iterate operator in pseudo-code: + +```java +IterationState state = getInitialState(); + +while (!terminationCriterion()) { + state = step(state); +} + +setFinalState(state); +``` + +
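+
+As a minimal sketch (not a complete program; imports are omitted as in the other snippets on this page), a bulk iteration can be expressed with the Java DataSet API roughly as follows. The step function here is a single map operator that increments every element, matching the *Incrementing Numbers* example below; the concrete input values and iteration count are only for illustration:
+
+```java
+ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+
+// iteration input: the initial partial solution, iterated at most 10 times
+IterativeDataSet<Integer> initial = env.fromElements(1, 2, 3, 4, 5).iterate(10);
+
+// step function: an arbitrary data flow, here a single map operator
+DataSet<Integer> iterationBody = initial.map(new MapFunction<Integer, Integer>() {
+    @Override
+    public Integer map(Integer value) {
+        return value + 1;
+    }
+});
+
+// next partial solution: feed the step function's output back and obtain the final result
+DataSet<Integer> result = initial.closeWith(iterationBody);
+result.print();
+```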
+{{< hint info >}}
+See the Programming Guide for details and code examples.
+{{< /hint >}}
    + +### Example: Incrementing Numbers + +In the following example, we **iteratively increment a set numbers**: + +

+*(Figure: Iterate Operator Example)*

    + + 1. **Iteration Input**: The initial input is read from a data source and consists of five single-field records (integers `1` to `5`). + 2. **Step function**: The step function is a single `map` operator, which increments the integer field from `i` to `i+1`. It will be applied to every record of the input. + 3. **Next Partial Solution**: The output of the step function will be the output of the map operator, i.e. records with incremented integers. + 4. **Iteration Result**: After ten iterations, the initial numbers will have been incremented ten times, resulting in integers `11` to `15`. + +```plain +// 1st 2nd 10th +map(1) -> 2 map(2) -> 3 ... map(10) -> 11 +map(2) -> 3 map(3) -> 4 ... map(11) -> 12 +map(3) -> 4 map(4) -> 5 ... map(12) -> 13 +map(4) -> 5 map(5) -> 6 ... map(13) -> 14 +map(5) -> 6 map(6) -> 7 ... map(14) -> 15 +``` + +Note that **1**, **2**, and **4** can be arbitrary data flows. + + +Delta Iterate Operator +---------------------- + +The **delta iterate operator** covers the case of **incremental iterations**. Incremental iterations **selectively modify elements** of their **solution** and evolve the solution rather than fully recompute it. + +Where applicable, this leads to **more efficient algorithms**, because not every element in the solution set changes in each iteration. This allows to **focus on the hot parts** of the solution and leave the **cold parts untouched**. Frequently, the majority of the solution cools down comparatively fast and the later iterations operate only on a small subset of the data. + +

+*(Figure: Delta Iterate Operator)*

    + + 1. **Iteration Input**: The initial workset and solution set are read from *data sources* or *previous operators* as input to the first iteration. + 2. **Step Function**: The step function will be executed in each iteration. It is an arbitrary data flow consisting of operators like `map`, `reduce`, `join`, etc. and depends on your specific task at hand. + 3. **Next Workset/Update Solution Set**: The *next workset* drives the iterative computation and will be fed back into the *next iteration*. Furthermore, the solution set will be updated and implicitly forwarded (it is not required to be rebuild). Both data sets can be updated by different operators of the step function. + 4. **Iteration Result**: After the *last iteration*, the *solution set* is written to a *data sink* or used as input to the *following operators*. + +The default **termination condition** for delta iterations is specified by the **empty workset convergence criterion** and a **maximum number of iterations**. The iteration will terminate when a produced *next workset* is empty or when the maximum number of iterations is reached. It is also possible to specify a **custom aggregator** and **convergence criterion**. + +You can also think about the iterate operator in pseudo-code: + +```java +IterationState workset = getInitialState(); +IterationState solution = getInitialSolution(); + +while (!terminationCriterion()) { + (delta, workset) = step(workset, solution); + + solution.update(delta) +} + +setFinalState(solution); +``` + +
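+
+As a minimal sketch of the corresponding DataSet API shape (not a complete program; `initialSolutionSet`, `initialWorkset`, and the `ComputeUpdates` join function are placeholders for your own data sets and update logic), a delta iteration over `Tuple2<Long, Long>` records keyed on field 0 could look roughly like this:
+
+```java
+int maxIterations = 10;
+
+// open the delta iteration; the solution set is keyed on tuple field 0
+DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
+        initialSolutionSet.iterateDelta(initialWorkset, maxIterations, 0);
+
+// step function: derive candidate updates from the current workset and solution set
+DataSet<Tuple2<Long, Long>> candidateUpdates = iteration.getWorkset()
+        .join(iteration.getSolutionSet())
+        .where(0).equalTo(0)
+        .with(new ComputeUpdates());   // placeholder JoinFunction producing changed records
+
+// close the iteration: the first argument updates the solution set,
+// the second one becomes the workset of the next superstep
+DataSet<Tuple2<Long, Long>> result = iteration.closeWith(candidateUpdates, candidateUpdates);
+```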
+{{< hint info >}}
+See the programming guide for details and code examples.
+{{< /hint >}}
    + +### Example: Propagate Minimum in Graph + +In the following example, every vertex has an **ID** and a **coloring**. Each vertex will propagate its vertex ID to neighboring vertices. The **goal** is to *assign the minimum ID to every vertex in a subgraph*. If a received ID is smaller then the current one, it changes to the color of the vertex with the received ID. One application of this can be found in *community analysis* or *connected components* computation. + +

+*(Figure: Delta Iterate Operator Example)*

    + +The **initial input** is set as **both workset and solution set.** In the above figure, the colors visualize the **evolution of the solution set**. With each iteration, the color of the minimum ID is spreading in the respective subgraph. At the same time, the amount of work (exchanged and compared vertex IDs) decreases with each iteration. This corresponds to the **decreasing size of the workset**, which goes from all seven vertices to zero after three iterations, at which time the iteration terminates. The **important observation** is that *the lower subgraph converges before the upper half* does and the delta iteration is able to capture this with the workset abstraction. + +In the upper subgraph **ID 1** (*orange*) is the **minimum ID**. In the **first iteration**, it will get propagated to vertex 2, which will subsequently change its color to orange. Vertices 3 and 4 will receive **ID 2** (in *yellow*) as their current minimum ID and change to yellow. Because the color of *vertex 1* didn't change in the first iteration, it can be skipped it in the next workset. + +In the lower subgraph **ID 5** (*cyan*) is the **minimum ID**. All vertices of the lower subgraph will receive it in the first iteration. Again, we can skip the unchanged vertices (*vertex 5*) for the next workset. + +In the **2nd iteration**, the workset size has already decreased from seven to five elements (vertices 2, 3, 4, 6, and 7). These are part of the iteration and further propagate their current minimum IDs. After this iteration, the lower subgraph has already converged (**cold part** of the graph), as it has no elements in the workset, whereas the upper half needs a further iteration (**hot part** of the graph) for the two remaining workset elements (vertices 3 and 4). + +The iteration **terminates**, when the workset is empty after the **3rd iteration**. + + + +Superstep Synchronization +------------------------- + +We referred to each execution of the step function of an iteration operator as *a single iteration*. In parallel setups, **multiple instances of the step function are evaluated in parallel** on different partitions of the iteration state. In many settings, one evaluation of the step function on all parallel instances forms a so called **superstep**, which is also the granularity of synchronization. Therefore, *all* parallel tasks of an iteration need to complete the superstep, before a next superstep will be initialized. **Termination criteria** will also be evaluated at superstep barriers. + +

+*(Figure: Supersteps)*

    + + diff --git a/docs/content.zh/docs/dev/dataset/local_execution.md b/docs/content.zh/docs/dev/dataset/local_execution.md new file mode 100644 index 0000000000000..55ae80d25da1b --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/local_execution.md @@ -0,0 +1,127 @@ +--- +title: "本地执行" +weight: 9 +type: docs +aliases: + - /zh/dev/local_execution.html +--- + + +# 本地执行 + +Flink can run on a single machine, even in a single Java Virtual Machine. This allows users to test and debug Flink programs locally. This section gives an overview of the local execution mechanisms. + +The local environments and executors allow you to run Flink programs in a local Java Virtual Machine, or with within any JVM as part of existing programs. Most examples can be launched locally by simply hitting the "Run" button of your IDE. + +There are two different kinds of local execution supported in Flink. The `LocalExecutionEnvironment` is starting the full Flink runtime, including a JobManager and a TaskManager. These include memory management and all the internal algorithms that are executed in the cluster mode. + +The `CollectionEnvironment` is executing the Flink program on Java collections. This mode will not start the full Flink runtime, so the execution is very low-overhead and lightweight. For example a `DataSet.map()`-transformation will be executed by applying the `map()` function to all elements in a Java list. + +## Debugging + +If you are running Flink programs locally, you can also debug your program like any other Java program. You can either use `System.out.println()` to write out some internal variables or you can use the debugger. It is possible to set breakpoints within `map()`, `reduce()` and all the other methods. +Please also refer to the [debugging section]({{< ref "docs/dev/dataset/overview" >}}#debugging) in the Java API documentation for a guide to testing and local debugging utilities in the Java API. + +## Maven Dependency + +If you are developing your program in a Maven project, you have to add the `flink-clients` module using this dependency: + +```xml + + org.apache.flink + flink-clients{{< scala_version >}} + {{< version >}} + +``` + +## Local Environment + +The `LocalEnvironment` is a handle to local execution for Flink programs. Use it to run a program within a local JVM - standalone or embedded in other programs. + +The local environment is instantiated via the method `ExecutionEnvironment.createLocalEnvironment()`. By default, it will use as many local threads for execution as your machine has CPU cores (hardware contexts). You can alternatively specify the desired parallelism. The local environment can be configured to log to the console using `enableLogging()`/`disableLogging()`. + +In most cases, calling `ExecutionEnvironment.getExecutionEnvironment()` is the even better way to go. That method returns a `LocalEnvironment` when the program is started locally (outside the command line interface), and it returns a pre-configured environment for cluster execution, when the program is invoked by the [command line interface]({{< ref "docs/deployment/cli" >}}). 
+ +```java +public static void main(String[] args) throws Exception { + ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(); + + DataSet data = env.readTextFile("file:///path/to/file"); + + data + .filter(new FilterFunction() { + public boolean filter(String value) { + return value.startsWith("http://"); + } + }) + .writeAsText("file:///path/to/result"); + + JobExecutionResult res = env.execute(); +} +``` + +The `JobExecutionResult` object, which is returned after the execution finished, contains the program runtime and the accumulator results. + +The `LocalEnvironment` allows also to pass custom configuration values to Flink. + +```java +Configuration conf = new Configuration(); +conf.setFloat(ConfigConstants.TASK_MANAGER_MEMORY_FRACTION_KEY, 0.5f); +final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf); +``` + +*Note:* The local execution environments do not start any web frontend to monitor the execution. + +## Collection Environment + +The execution on Java Collections using the `CollectionEnvironment` is a low-overhead approach for executing Flink programs. Typical use-cases for this mode are automated tests, debugging and code re-use. + +Users can use algorithms implemented for batch processing also for cases that are more interactive. A slightly changed variant of a Flink program could be used in a Java Application Server for processing incoming requests. + +**Skeleton for Collection-based execution** + +```java +public static void main(String[] args) throws Exception { + // initialize a new Collection-based execution environment + final ExecutionEnvironment env = new CollectionEnvironment(); + + DataSet users = env.fromCollection( /* get elements from a Java Collection */); + + /* Data Set transformations ... */ + + // retrieve the resulting Tuple2 elements into a ArrayList. + Collection<...> result = new ArrayList<...>(); + resultDataSet.output(new LocalCollectionOutputFormat<...>(result)); + + // kick off execution. + env.execute(); + + // Do some work with the resulting ArrayList (=Collection). + for(... t : result) { + System.err.println("Result = "+t); + } +} +``` + +The `flink-examples-batch` module contains a full example, called `CollectionExecutionExample`. + +Please note that the execution of the collection-based Flink programs is only possible on small data, which fits into the JVM heap. The execution on collections is not multi-threaded, only one thread is used. + +{{< top >}} diff --git a/docs/content.zh/docs/dev/dataset/overview.md b/docs/content.zh/docs/dev/dataset/overview.md new file mode 100644 index 0000000000000..8faee27d2d466 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/overview.md @@ -0,0 +1,1703 @@ +--- +title: 概览 +type: docs +weight: 1 +aliases: + - /zh/dev/batch/index.html + - /zh/apis/programming_guide.html +--- + + +# DataSet API 编程指南 + +DataSet programs in Flink are regular programs that implement transformations on data sets (e.g., filtering, mapping, joining, grouping). The data sets are initially created from certain sources (e.g., by reading files, or from local collections). Results are returned via sinks, which may for example write the data to (distributed) files, or to standard output (for example the command line terminal). Flink programs run in a variety of contexts, standalone, or embedded in other programs. The execution can happen in a local JVM, or on clusters of many machines. + +Please refer to the DataStream API overview for an introduction to the basic concepts of the Flink API. 
That overview is for the DataStream API but the basic concepts of the two APIs are the same. + +In order to create your own Flink DataSet program, we encourage you to start with the anatomy of a Flink Program and gradually add your own transformations. The remaining sections act as references for additional operations and advanced features. + +{{< hint info >}} +Starting with Flink 1.12 the DataSet has been soft deprecated. We recommend that you use the DataStream API with `BATCH` execution mode. The linked section also outlines cases where it makes sense to use the DataSet API but those cases will become rarer as development progresses and the DataSet API will eventually be removed. Please also see FLIP-131 for background information on this decision. +{{< /hint >}} + +## Example Program + +The following program is a complete, working example of WordCount. +You can copy & paste the code to run it locally. +You only have to include the correct Flink’s library into your project and specify the imports. +Then you are ready to go! + +{{< tabs "basic-example" >}} +{{< tab "Java" >}} +```java +public class WordCountExample { + public static void main(String[] args) throws Exception { + final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + + DataSet text = env.fromElements( + "Who's there?", + "I think I hear them. Stand, ho! Who's there?"); + + DataSet> wordCounts = text + .flatMap(new LineSplitter()) + .groupBy(0) + .sum(1); + + wordCounts.print(); + } + + public static class LineSplitter implements FlatMapFunction> { + @Override + public void flatMap(String line, Collector> out) { + for (String word : line.split(" ")) { + out.collect(new Tuple2(word, 1)); + } + } + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ + +object WordCount { + def main(args: Array[String]) { + + val env = ExecutionEnvironment.getExecutionEnvironment + val text = env.fromElements( + "Who's there?", + "I think I hear them. Stand, ho! Who's there?") + + val counts = text + .flatMap { _.toLowerCase.split("\\W+") filter { _.nonEmpty } } + .map { (_, 1) } + .groupBy(0) + .sum(1) + + counts.print() + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +## DataSet Transformations + +Data transformations transform one or more DataSets into a new DataSet. +Programs can combine multiple transformations into sophisticated assemblies. + +#### Map + +Takes one element and produces one element. + +{{< tabs "mapfun" >}} +{{< tab "Java" >}} +```java +data.map(new MapFunction() { + public Integer map(String value) { return Integer.parseInt(value); } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.map { x => x.toInt } +``` +{{< /tab >}} +{{< /tabs >}} + +#### FlatMap + +Takes one element and produces zero, one, or more elements. + +{{< tabs "flatmapfunc" >}} +{{< tab "Java" >}} +```java +data.flatMap(new FlatMapFunction() { + public void flatMap(String value, Collector out) { + for (String s : value.split(" ")) { + out.collect(s); + } + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.flatMap { str => str.split(" ") } +``` +{{< /tab >}} +{{< /tabs >}} + +#### MapPartition + +Transforms a parallel partition in a single function call. +The function gets the partition as an Iterable stream and can produce an arbitrary number of result values. +The number of elements in each partition depends on the degree-of-parallelism and previous operations. 
+ +{{< tabs "mappartition" >}} +{{< tab "Java" >}} +```java +data.mapPartition(new MapPartitionFunction() { + public void mapPartition(Iterable values, Collector out) { + long c = 0; + for (String s : values) { + c++; + } + out.collect(c); + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.mapPartition { in => in map { (_, 1) } } +``` +{{< /tab >}} +{{< /tabs >}} + +#### Filter + +Evaluates a boolean function for each element and retains those for which the function returns true. +**IMPORTANT:** The system assumes that the function does not modify the element on which the predicate is applied. Violating this assumption can lead to incorrect results. + +{{< tabs "filter" >}} +{{< tab "Java" >}} +```java +data.filter(new FilterFunction() { + public boolean filter(Integer value) { return value > 1000; } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.filter { _ > 1000 } +``` +{{< /tab >}} +{{< /tabs >}} + +#### Reduce + +Combines a group of elements into a single element by repeatedly combining two elements into one. +Reduce may be applied on a full data set or on a grouped data set. + +{{< tabs "reduce" >}} +{{< tab "Java" >}} +```java +data.reduce(new ReduceFunction { + public Integer reduce(Integer a, Integer b) { return a + b; } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.reduce { _ + _ } +``` +{{< /tab >}} +{{< /tabs >}} + +If the reduce was applied to a grouped data set then you can specify the way that the runtime executes the combine phase of the reduce by supplying a `CombineHint` to `setCombineHint`. +The hash-based strategy should be faster in most cases, especially if the number of different keys is small compared to the number of input elements (eg. 1/10). + +#### ReduceGroup + +Combines a group of elements into one or more elements. +ReduceGroup may be applied on a full data set, or on a grouped data set. + +{{< tabs "reducegroup" >}} +{{< tab "Java" >}} +```java +data.reduceGroup(new GroupReduceFunction { + public void reduce(Iterable values, Collector out) { + int prefixSum = 0; + for (Integer i : values) { + prefixSum += i; + out.collect(prefixSum); + } + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.reduceGroup { elements => elements.sum } +``` +{{< /tab >}} +{{< /tabs >}} + +#### Aggregate + +Aggregates a group of values into a single value. +Aggregation functions can be thought of as built-in reduce functions. +Aggregate may be applied on a full data set, or on a grouped data set. + +{{< tabs "aggregate" >}} +{{< tab "Java" >}} +```java +Dataset> input = // [...] +DataSet> output = input.aggregate(SUM, 0).and(MIN, 2); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataSet[(Int, String, Double)] = // [...] +val output: DataSet[(Int, String, Double)] = input.aggregate(SUM, 0).aggregate(MIN, 2) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Distinct + +Returns the distinct elements of a data set. +It removes the duplicate entries from the input DataSet, with respect to all fields of the elements, or a subset of fields. + +{{< tabs "distinct" >}} +{{< tab "Java" >}} +```java +data.distinct() +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.distinct() +``` +{{< /tab >}} +{{< /tabs >}} + +#### Join + +Joins two data sets by creating all pairs of elements that are equal on their keys. +Optionally uses a JoinFunction to turn the pair of elements into a single element, or a FlatJoinFunction to turn the pair of elements into arbitrarily many (including none) elements. 
+See the keys section to learn how to define join keys. + +{{< tabs "join" >}} +{{< tab "Java" >}} +```java +result = input1.join(input2) + .where(0) // key of the first input (tuple field 0) + .equalTo(1); // key of the second input (tuple field 1) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// In this case tuple fields are used as keys. "0" is the join field on the first tuple +// "1" is the join field on the second tuple. +val result = input1.join(input2).where(0).equalTo(1) +``` +{{< /tab >}} +{{< /tabs >}} + +You can specify the way that the runtime executes the join via Join Hints. +The hints describe whether the join happens through partitioning or broadcasting, and whether it uses a sort-based or a hash-based algorithm. +Please refer to the Transformations Guide for a list of possible hints and an example. +If no hint is specified, the system will try to make an estimate of the input sizes and pick the best strategy according to those estimates. + +{{< tabs "joinhint" >}} +{{< tab "Java" >}} +```java +// This executes a join by broadcasting the first data set +// using a hash table for the broadcast data +result = input1.join(input2, JoinHint.BROADCAST_HASH_FIRST) + .where(0).equalTo(1); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// This executes a join by broadcasting the first data set +// using a hash table for the broadcast data +val result = input1.join(input2, JoinHint.BROADCAST_HASH_FIRST) + .where(0).equalTo(1) +``` +{{< /tab >}} +{{< /tabs >}} + +Note that the join transformation works only for equi-joins. Other join types need to be expressed using `OuterJoin` or `CoGroup`. + +#### OuterJoin + +Performs a left, right, or full outer join on two data sets. Outer joins are similar to regular (inner) joins and create all pairs of elements that are equal on their keys. +In addition, records of the "outer" side (left, right, or both in case of full) are preserved if no matching key is found in the other side. +Matching pairs of elements (or one element and a null value for the other input) are given to a JoinFunction to turn the pair of elements into a single element, or to a FlatJoinFunction to turn the pair of elements into arbitrarily many (including none) elements. +See the keys section to learn how to define join keys. + +{{< tabs "outerjoin" >}} +{{< tab "Java" >}} +```java +input1.leftOuterJoin(input2) // rightOuterJoin or fullOuterJoin for right or full outer joins + .where(0) // key of the first input (tuple field 0) + .equalTo(1) // key of the second input (tuple field 1) + .with(new JoinFunction() { + public String join(String v1, String v2) { + // NOTE: + // - v2 might be null for leftOuterJoin + // - v1 might be null for rightOuterJoin + // - v1 OR v2 might be null for fullOuterJoin + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val joined = left.leftOuterJoin(right).where(0).equalTo(1) { + (left, right) => + val a = if (left == null) "none" else left._1 + (a, right) + } +``` +{{< /tab >}} +{{< /tabs >}} + +#### CoGroup + +The two-dimensional variant of the reduce operation. Groups each input on one or more fields and then joins the groups. +The transformation function is called per pair of groups. See the keys section to learn how to define coGroup keys. 
+ +{{< tabs "cogroup" >}} +{{< tab "Java" >}} +```java +data1.coGroup(data2) + .where(0) + .equalTo(1) + .with(new CoGroupFunction() { + public void coGroup(Iterable in1, Iterable in2, Collector out) { + out.collect(...); + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data1.coGroup(data2).where(0).equalTo(1) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Cross + +Builds the Cartesian product (cross product) of two inputs, creating all pairs of elements. Optionally uses a CrossFunction to turn the pair of elements into a single element + +{{< tabs "cross" >}} +{{< tab "Java" >}} +```java +DataSet data1 = // [...] +DataSet data2 = // [...] +DataSet> result = data1.cross(data2); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val data1: DataSet[Int] = // [...] +val data2: DataSet[String] = // [...] +val result: DataSet[(Int, String)] = data1.cross(data2) +``` +{{< /tab >}} +{{< /tabs >}} + +{{< hint warning >}} +Cross is potentially a **very** compute-intensive operation which can challenge even large compute clusters! It is advised to hint the system with the `DataSet` sizes by using `crossWithTiny()` and `crossWithHuge()`. +{{< /hint >}} + +#### Union + +Produces the union of two data sets. + +{{< tabs "union" >}} +{{< tab "Java" >}} +```java +data.union(data2) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +data.union(data2) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Rebalance + +Evenly rebalances the parallel partitions of a data set to eliminate data skew. +Only Map-like transformations may follow a rebalance transformation. + +{{< tabs "rebalance" >}} +{{< tab "Java" >}} +```java +DataSet data1 = // [...] +DataSet> result = data1.rebalance().map(...) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val data1: DataSet[Int] = // [...] +val result: DataSet[(Int, String)] = data1.rebalance().map(...) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Hash-Partition + +Hash-partitions a data set on a given key. Keys can be specified as position keys, expression keys, and key selector functions. + +{{< tabs "hashpartition" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +DataSet result = in.partitionByHash(0) + .mapPartition(new PartitionMapper()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, String)] = // [...] +val result = in.partitionByHash(0).mapPartition { ... } +``` +{{< /tab >}} +{{< /tabs >}} + +#### Range-Partition + +Range-partitions a data set on a given key. Keys can be specified as position keys, expression keys, and key selector functions. + +{{< tabs "rangepartition" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +DataSet result = in.partitionByRange(0) + .mapPartition(new PartitionMapper()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, String)] = // [...] +val result = in.partitionByRange(0).mapPartition { ... } +``` +{{< /tab >}} +{{< /tabs >}} + +#### Custom Partitioning + +Assigns records based on a key to a specific partition using a custom Partitioner function. +The key can be specified as position key, expression key, and key selector function. +Note: This method only works with a single field key. + +{{< tabs "custompartitioning" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +DataSet result = in.partitionCustom(partitioner, key) + .mapPartition(new PartitionMapper()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, String)] = // [...] +val result = in + .partitionCustom(partitioner, key).mapPartition { ... 
} +``` +{{< /tab >}} +{{< /tabs >}} + +#### Sort Partitioning + +Locally sorts all partitions of a data set on a specified field in a specified order. +Fields can be specified as tuple positions or field expressions. +Sorting on multiple fields is done by chaining sortPartition() calls. + +{{< tabs "sortpartitioning" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +DataSet result = in.sortPartition(1, Order.ASCENDING) + .mapPartition(new PartitionMapper()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, String)] = // [...] +val result = in.sortPartition(1, Order.ASCENDING).mapPartition { ... } +``` +{{< /tab >}} +{{< /tabs >}} + +#### First-N + +Returns the first n (arbitrary) elements of a data set. +First-n can be applied on a regular data set, a grouped data set, or a grouped-sorted data set. +Grouping keys can be specified as key-selector functions or field position keys. + +{{< tabs "firstn" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +// regular data set +DataSet> result1 = in.first(3); +// grouped data set +DataSet> result2 = in.groupBy(0) + .first(3); +// grouped-sorted data set +DataSet> result3 = in.groupBy(0) + .sortGroup(1, Order.ASCENDING) + .first(3); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, String)] = // [...] +// regular data set +val result1 = in.first(3) +// grouped data set +val result2 = in.groupBy(0).first(3) +// grouped-sorted data set +val result3 = in.groupBy(0).sortGroup(1, Order.ASCENDING).first(3) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Project + +Selects a subset of fields from tuples. + +{{< tabs "project" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +DataSet> out = in.project(2,0); +``` +{{< /tab >}} +{{< tab "Scala" >}} +This feature is not available in the Scala API +{{< /tab >}} +{{< /tabs >}} + +#### MinBy / MaxBy + +Selects a tuple from a group of tuples whose values of one or more fields are minimum (maximum). +The fields which are used for comparison must be valid key fields, i.e., comparable. +If multiple tuples have minimum (maximum) field values, an arbitrary tuple of these tuples is returned. MinBy (MaxBy) may be applied on a full data set or a grouped data set. + +{{< tabs "minbymaxby" >}} +{{< tab "Java" >}} +```java +DataSet> in = // [...] +// a DataSet with a single tuple with minimum values for the Integer and String fields. +DataSet> out = in.minBy(0, 2); +// a DataSet with one tuple for each group with the minimum value for the Double field. +DataSet> out2 = in.groupBy(2) + .minBy(1); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val in: DataSet[(Int, Double, String)] = // [...] +// a data set with a single tuple with minimum values for the Int and String fields. +val out: DataSet[(Int, Double, String)] = in.minBy(0, 2) +// a data set with one tuple for each group with the minimum value for the Double field. +val out2: DataSet[(Int, Double, String)] = in.groupBy(2) + .minBy(1) +``` +{{< /tab >}} +{{< /tabs >}} + +## Specifying Keys + +Some transformations (join, coGroup, groupBy) require that a key be defined on a collection of elements. +Other transformations (Reduce, GroupReduce, Aggregate) allow data being grouped on a key before they are applied. + +A DataSet is grouped as + +```java +DataSet<...> input = // [...] +DataSet<...> reduced = input + .groupBy(/*define key here*/) + .reduceGroup(/*do something*/); +``` + +The data model of Flink is not based on key-value pairs. 
+
+Therefore, you do not need to physically pack the data set types into keys and values.
+Keys are “virtual”: they are defined as functions over the actual data to guide the grouping operator.
+
+### Define keys for Tuples
+
+The simplest case is grouping Tuples on one or more fields of the Tuple:
+
+{{< tabs "keyfortuple" >}}
+{{< tab "Java" >}}
+```java
+DataSet<Tuple3<Integer, String, Long>> input = // [...]
+UnsortedGrouping<Tuple3<Integer, String, Long>> keyed = input.groupBy(0)
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val input: DataSet[(Int, String, Long)] = // [...]
+val keyed = input.groupBy(0)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Tuples are grouped on the first field (the one of Integer type).
+
+{{< tabs "tuplemultigroup" >}}
+{{< tab "Java" >}}
+```java
+DataSet<Tuple3<Integer, String, Long>> input = // [...]
+UnsortedGrouping<Tuple3<Integer, String, Long>> keyed = input.groupBy(0, 1)
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val input: DataSet[(Int, String, Long)] = // [...]
+val grouped = input.groupBy(0, 1)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Here, we group the tuples on a composite key consisting of the first and the second field.
+
+A note on nested Tuples: If you have a DataSet with a nested tuple, such as:
+
+```java
+DataSet<Tuple3<Tuple2<Integer, Float>, String, Long>> ds;
+```
+
+Specifying `groupBy(0)` will cause the system to use the full `Tuple2` as a key (with the Integer and Float being the key).
+If you want to “navigate” into the nested Tuple2, you have to use field expression keys, which are explained below.
+
+### Define keys using Field Expressions
+
+You can use String-based field expressions to reference nested fields and define keys for grouping, sorting, joining, or coGrouping.
+Field expressions make it very easy to select fields in (nested) composite types such as Tuple and POJO types.
+
+In the example below, we have a `WC` POJO with two fields, “word” and “count”.
+To group by the field word, we just pass its name to the groupBy() function.
+
+{{< tabs "pojokey" >}}
+{{< tab "Java" >}}
+```java
+// some ordinary POJO (Plain old Java Object)
+public class WC {
+    public String word;
+    public int count;
+}
+DataSet<WC> words = // [...]
+UnsortedGrouping<WC> keyed = words.groupBy("word")
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+// some ordinary POJO (Plain old Java Object)
+class WC(var word: String, var count: Int) {
+  def this() { this("", 0) }
+}
+val words: DataSet[WC] = // [...]
+val wordCounts = words.groupBy("word")
+
+// or, as a case class, which is less typing
+case class WC(word: String, count: Int)
+val words: DataSet[WC] = // [...]
+val wordCounts = words.groupBy("word")
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+#### Field Expression Syntax:
+
+ * Select POJO fields by their field name. For example, "user" refers to the “user” field of a POJO type.
+
+ * Select Tuple fields by their field name or 0-offset field index. For example, "f0" and "_1" refer to the first field of a Java and Scala Tuple respectively, and "5" refers to the sixth field of either.
+
+ * You can select nested fields in POJOs and Tuples. For example, "user.zip" refers to the “zip” field of a POJO which is stored in the “user” field of a POJO type. Arbitrary nesting and mixing of POJOs and Tuples is supported, such as "_2.user.zip" or "user._4.1.zip".
+
+ * You can select the full type using the "*" wildcard expression (the Scala API also accepts "_"). This also works for types which are not Tuple or POJO types.
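+
+To make the syntax rules above concrete, here is a small sketch of field expression keys in use. The `Order` POJO and its fields are hypothetical and only serve to illustrate the expressions; a complete example follows below.
+
+```java
+// hypothetical POJO used only to illustrate field expression keys
+public class Order {
+    public String userId;                   // referenced as "userId"
+    public Tuple2<String, Double> lineItem; // referenced as "lineItem.f0" / "lineItem.f1"
+}
+
+DataSet<Order> orders = // [...]
+
+// group by a top-level POJO field
+UnsortedGrouping<Order> byUser = orders.groupBy("userId");
+
+// group by a nested Tuple field inside the POJO
+UnsortedGrouping<Order> byProduct = orders.groupBy("lineItem.f0");
+
+// use the complete element as the key via the wildcard expression
+UnsortedGrouping<Order> byWholeRecord = orders.groupBy("*");
+```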
+ +#### Field Expression Example: + +{{< tabs "flatmapfunc" >}} +{{< tab "Java" >}} +```java +public static class WC { + public ComplexNestedClass complex; //nested POJO + private int count; + // getter / setter for private field (count) + public int getCount() { + return count; + } + public void setCount(int c) { + this.count = c; + } +} +public static class ComplexNestedClass { + public Integer someNumber; + public float someFloat; + public Tuple3 word; + public IntWritable hadoopCitizen; +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class WC(var complex: ComplexNestedClass, var count: Int) { + def this() { this(null, 0) } +} + +class ComplexNestedClass( + var someNumber: Int, + someFloat: Float, + word: (Long, Long, String), + hadoopCitizen: IntWritable) { + def this() { this(0, 0, (0, 0, ""), new IntWritable(0)) } +} +``` +{{< /tab >}} +{{< /tabs >}} + +These are valid field expressions for the example code above: + + * "count": The count field in the WC class. + + * "complex": Recursively selects all fields of the field complex of POJO type ComplexNestedClass. + + * "complex.word.f2": Selects the last field of the nested Tuple3. + + * "complex.hadoopCitizen": Selects the Hadoop IntWritable type. + +### Define keys using Key Selector Functions + +An additional way to define keys are “key selector” functions. +A key selector function takes a single element as input and returns the key for the element. The key can be of any type and be derived from deterministic computations. + +The following example shows a key selector function that simply returns the field of an object: + +{{< tabs "flatmapfunc" >}} +{{< tab "Java" >}} +```java +// some ordinary POJO +public class WC {public String word; public int count;} +DataSet words = // [...] +UnsortedGrouping keyed = words + .groupBy(new KeySelector() { + public String getKey(WC wc) { return wc.word; } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// some ordinary case class +case class WC(word: String, count: Int) +val words: DataSet[WC] = // [...] +val keyed = words.groupBy( _.word ) +``` +{{< /tab >}} +{{< /tabs >}} + +## Data Sources + +Data sources create the initial data sets, such as from files or from Java collections. The general mechanism of creating data sets is abstracted behind an InputFormat. Flink comes with several built-in formats to create data sets from common file formats. Many of them have shortcut methods on the ExecutionEnvironment. + +File-based: + + * readTextFile(path) / TextInputFormat - Reads files line wise and returns them as Strings. + + * readTextFileWithValue(path) / TextValueInputFormat - Reads files line wise and returns them as StringValues. StringValues are mutable strings. + + * readCsvFile(path) / CsvInputFormat - Parses files of comma (or another char) delimited fields. Returns a DataSet of tuples or POJOs. Supports the basic java types and their Value counterparts as field types. + + * readFileOfPrimitives(path, Class) / PrimitiveInputFormat - Parses files of new-line (or another char sequence) delimited primitive data types such as String or Integer. + + * readFileOfPrimitives(path, delimiter, Class) / PrimitiveInputFormat - Parses files of new-line (or another char sequence) delimited primitive data types such as String or Integer using the given delimiter. + +Collection-based: + + * fromCollection(Collection) - Creates a data set from a Java.util.Collection. All elements in the collection must be of the same type. 
+ + * fromCollection(Iterator, Class) - Creates a data set from an iterator. The class specifies the data type of the elements returned by the iterator. + + * fromElements(T ...) - Creates a data set from the given sequence of objects. All objects must be of the same type. + + * fromParallelCollection(SplittableIterator, Class) - Creates a data set from an iterator, in parallel. The class specifies the data type of the elements returned by the iterator. + + * generateSequence(from, to) - Generates the sequence of numbers in the given interval, in parallel. + +Generic: + + * readFile(inputFormat, path) / FileInputFormat - Accepts a file input format. + + * createInput(inputFormat) / InputFormat - Accepts a generic input format. + + +{{< tabs "datasources" >}} +{{< tab "Java" >}} +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +// read text file from local files system +DataSet localLines = env.readTextFile("file:///path/to/my/textfile"); + +// read text file from an HDFS running at nnHost:nnPort +DataSet hdfsLines = env.readTextFile("hdfs://nnHost:nnPort/path/to/my/textfile"); + +// read a CSV file with three fields +DataSet> csvInput = env.readCsvFile("hdfs:///the/CSV/file") + .types(Integer.class, String.class, Double.class); + +// read a CSV file with five fields, taking only two of them +DataSet> csvInput = env.readCsvFile("hdfs:///the/CSV/file") + .includeFields("10010") // take the first and the fourth field + .types(String.class, Double.class); + +// read a CSV file with three fields into a POJO (Person.class) with corresponding fields +DataSet> csvInput = env.readCsvFile("hdfs:///the/CSV/file") + .pojoType(Person.class, "name", "age", "zipcode"); + +// read a file from the specified path of type SequenceFileInputFormat +DataSet> tuples = + env.createInput(HadoopInputs.readSequenceFile(IntWritable.class, Text.class, "hdfs://nnHost:nnPort/path/to/file")); + +// creates a set from some given elements +DataSet value = env.fromElements("Foo", "bar", "foobar", "fubar"); + +// generate a number sequence +DataSet numbers = env.generateSequence(1, 10000000); + +// Read data from a relational database using the JDBC input format +DataSet dbData = + env.createInput( + JdbcInputFormat.buildJdbcInputFormat() + .setDrivername("org.apache.derby.jdbc.EmbeddedDriver") + .setDBUrl("jdbc:derby:memory:persons") + .setQuery("select name, age from persons") + .setRowTypeInfo(new RowTypeInfo(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO)) + .finish() + ); + +// Note: Flink's program compiler needs to infer the data types of the data items which are returned +// by an InputFormat. If this information cannot be automatically inferred, it is necessary to +// manually provide the type information as shown in the examples above. 
+``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + + +val env = ExecutionEnvironment.getExecutionEnvironment + +// read text file from local files system +val localLines = env.readTextFile("file:///path/to/my/textfile") + +// read text file from an HDFS running at nnHost:nnPort +val hdfsLines = env.readTextFile("hdfs://nnHost:nnPort/path/to/my/textfile") + +// read a CSV file with three fields +val csvInput = env.readCsvFile[(Int, String, Double)]("hdfs:///the/CSV/file") + +// read a CSV file with five fields, taking only two of them +val csvInput = env.readCsvFile[(String, Double)]( + "hdfs:///the/CSV/file", + includedFields = Array(0, 3)) // take the first and the fourth field + +// CSV input can also be used with Case Classes +case class MyCaseClass(str: String, dbl: Double) +val csvInput = env.readCsvFile[MyCaseClass]( + "hdfs:///the/CSV/file", + includedFields = Array(0, 3)) // take the first and the fourth field + +// read a CSV file with three fields into a POJO (Person) with corresponding fields +val csvInput = env.readCsvFile[Person]( + "hdfs:///the/CSV/file", + pojoFields = Array("name", "age", "zipcode")) + +// create a set from some given elements +val values = env.fromElements("Foo", "bar", "foobar", "fubar") + +// generate a number sequence +val numbers = env.generateSequence(1, 10000000) + +// read a file from the specified path of type SequenceFileInputFormat +val tuples = env.createInput(HadoopInputs.readSequenceFile(classOf[IntWritable], classOf[Text], + "hdfs://nnHost:nnPort/path/to/file")) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Configuring CSV Parsing + +Flink offers a number of configuration options for CSV parsing: + + * types(Class ... types) specifies the types of the fields to parse. It is mandatory to configure the types of the parsed fields. In case of the type class Boolean.class, “True” (case-insensitive), “False” (case-insensitive), “1” and “0” are treated as booleans. + + * lineDelimiter(String del) specifies the delimiter of individual records. The default line delimiter is the new-line character '\n'. + + * fieldDelimiter(String del) specifies the delimiter that separates fields of a record. The default field delimiter is the comma character ','. + + * includeFields(boolean ... flag), includeFields(String mask), or includeFields(long bitMask) defines which fields to read from the input file (and which to ignore). By default the first n fields (as defined by the number of types in the types() call) are parsed. + + * parseQuotedStrings(char quoteChar) enables quoted string parsing. Strings are parsed as quoted strings if the first character of the string field is the quote character (leading or tailing whitespaces are not trimmed). Field delimiters within quoted strings are ignored. Quoted string parsing fails if the last character of a quoted string field is not the quote character or if the quote character appears at some point which is not the start or the end of the quoted string field (unless the quote character is escaped using ‘'). If quoted string parsing is enabled and the first character of the field is not the quoting string, the string is parsed as unquoted string. By default, quoted string parsing is disabled. + + * ignoreComments(String commentPrefix) specifies a comment prefix. All lines that start with the specified comment prefix are not parsed and ignored. By default, no lines are ignored. + + * ignoreInvalidLines() enables lenient parsing, i.e., lines that cannot be correctly parsed are ignored. 
By default, lenient parsing is disabled and invalid lines raise an exception.
+
+ * ignoreFirstLine() configures the InputFormat to ignore the first line of the input file. By default, no line is ignored.
+
+#### Recursive Traversal of the Input Path Directory
+
+For file-based inputs, when the input path is a directory, nested files are not enumerated by default.
+Instead, only the files inside the base directory are read, while nested files are ignored.
+Recursive enumeration of nested files can be enabled through the `recursive.file.enumeration` configuration parameter, like in the following example.
+
+{{< tabs "recursiveinput" >}}
+{{< tab "Java" >}}
+```java
+// enable recursive enumeration of nested input files
+ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+
+// create a configuration object
+Configuration parameters = new Configuration();
+
+// set the recursive enumeration parameter
+parameters.setBoolean("recursive.file.enumeration", true);
+
+// pass the configuration to the data source
+DataSet<String> logs = env.readTextFile("file:///path/with.nested/files")
+    .withParameters(parameters);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+// enable recursive enumeration of nested input files
+val env = ExecutionEnvironment.getExecutionEnvironment
+
+// create a configuration object
+val parameters = new Configuration
+
+// set the recursive enumeration parameter
+parameters.setBoolean("recursive.file.enumeration", true)
+
+// pass the configuration to the data source
+env.readTextFile("file:///path/with.nested/files").withParameters(parameters)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Read Compressed Files
+
+Flink currently supports transparent decompression of input files if these are marked with an appropriate file extension. In particular, this means that no further configuration of the input formats is necessary, and any `FileInputFormat` supports compression, including custom input formats. Note that compressed files might not be read in parallel, thus impacting job scalability.
+
+The following table lists the currently supported compression methods.
+
+| Compression Method | File Extensions | Parallelizable |
+|--------------------|-----------------|----------------|
+| DEFLATE | .deflate | no |
+| GZip | .gz, .gzip | no |
+| Bzip2 | .bz2 | no |
+| XZ | .xz | no |
+| Zstandard | .zst | no |
+
+## Data Sinks
+
+Data sinks consume DataSets and are used to store or return them. Data sink operations are described using an OutputFormat. Flink comes with a variety of built-in output formats that are encapsulated behind operations on the DataSet:
+
+ * writeAsText() / TextOutputFormat - Writes elements line-wise as Strings. The Strings are obtained by calling the toString() method of each element.
+ * writeAsFormattedText() / TextOutputFormat - Writes elements line-wise as Strings. The Strings are obtained by calling a user-defined format() method for each element.
+ * writeAsCsv(...) / CsvOutputFormat - Writes tuples as comma-separated value files. Row and field delimiters are configurable. The value for each field comes from the toString() method of the objects.
+ * print() / printToErr() / print(String msg) / printToErr(String msg) - Prints the toString() value of each element on the standard out / standard error stream. Optionally, a prefix (msg) can be provided which is prepended to the output. This can help to distinguish between different calls to print.
If the parallelism is greater than 1, the output will also be prepended with the identifier of the task which produced the output. + * write() / FileOutputFormat - Method and base class for custom file outputs. Supports custom object-to-bytes conversion. + * output()/ OutputFormat - Most generic output method, for data sinks that are not file based (such as storing the result in a database). + +A DataSet can be input to multiple operations. Programs can write or print a data set and at the same time run additional transformations on them. + +{{< tabs "datasinkbuiltin" >}} +{{< tab "Java" >}} +```java +// text data +DataSet textData = // [...] + +// write DataSet to a file on the local file system +textData.writeAsText("file:///my/result/on/localFS"); + +// write DataSet to a file on an HDFS with a namenode running at nnHost:nnPort +textData.writeAsText("hdfs://nnHost:nnPort/my/result/on/localFS"); + +// write DataSet to a file and overwrite the file if it exists +textData.writeAsText("file:///my/result/on/localFS", WriteMode.OVERWRITE); + +// tuples as lines with pipe as the separator "a|b|c" +DataSet> values = // [...] +values.writeAsCsv("file:///path/to/the/result/file", "\n", "|"); + +// this writes tuples in the text formatting "(a, b, c)", rather than as CSV lines +values.writeAsText("file:///path/to/the/result/file"); + +// this writes values as strings using a user-defined TextFormatter object +values.writeAsFormattedText("file:///path/to/the/result/file", + new TextFormatter>() { + public String format (Tuple2 value) { + return value.f1 + " - " + value.f0; + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// text data +val textData: DataSet[String] = // [...] + +// write DataSet to a file on the local file system +textData.writeAsText("file:///my/result/on/localFS") + +// write DataSet to a file on an HDFS with a namenode running at nnHost:nnPort +textData.writeAsText("hdfs://nnHost:nnPort/my/result/on/localFS") + +// write DataSet to a file and overwrite the file if it exists +textData.writeAsText("file:///my/result/on/localFS", WriteMode.OVERWRITE) + +// tuples as lines with pipe as the separator "a|b|c" +val values: DataSet[(String, Int, Double)] = // [...] +values.writeAsCsv("file:///path/to/the/result/file", "\n", "|") + +// this writes tuples in the text formatting "(a, b, c)", rather than as CSV lines +values.writeAsText("file:///path/to/the/result/file") + +// this writes values as strings using a user-defined formatting +values map { tuple => tuple._1 + " - " + tuple._2 } + .writeAsText("file:///path/to/the/result/file") +``` +{{< /tab >}} +{{< /tabs >}} + +Or with a custom output format: + +```java +DataSet> myResult = [...] + +// write Tuple DataSet to a relational database +myResult.output( + // build and configure OutputFormat + JdbcOutputFormat.buildJdbcOutputFormat() + .setDrivername("org.apache.derby.jdbc.EmbeddedDriver") + .setDBUrl("jdbc:derby:memory:persons") + .setQuery("insert into persons (name, age, height) values (?,?,?)") + .finish() + ); +``` + +#### Locally Sorted Output + +The output of a data sink can be locally sorted on specified fields in specified orders using tuple field positions or field expressions. This works for every output format. + +The following examples show how to use this feature: + +```java +DataSet> tData = // [...] +DataSet> pData = // [...] +DataSet sData = // [...] 
+ +// sort output on String field in ascending order +tData.sortPartition(1, Order.ASCENDING).print(); + +// sort output on Double field in descending and Integer field in ascending order +tData.sortPartition(2, Order.DESCENDING).sortPartition(0, Order.ASCENDING).print(); + +// sort output on the "author" field of nested BookPojo in descending order +pData.sortPartition("f0.author", Order.DESCENDING).writeAsText(...); + +// sort output on the full tuple in ascending order +tData.sortPartition("*", Order.ASCENDING).writeAsCsv(...); + +// sort atomic type (String) output in descending order +sData.sortPartition("*", Order.DESCENDING).writeAsText(...); +``` + +Globally sorted output is not supported. + +## Iteration Operators + +Iterations implement loops in Flink programs. +The iteration operators encapsulate a part of the program and execute it repeatedly, feeding back the result of one iteration (the partial solution) into the next iteration. +There are two types of iterations in Flink: `BulkIteration` and `DeltaIteration`. + +This section provides quick examples on how to use both operators. Check out the Introduction to Iterations page for a more detailed introduction. + +{{< tabs "iteration" >}} +{{< tab "Java" >}} + +#### Bulk Iterations + +To create a BulkIteration call the iterate(int) method of the DataSet the iteration should start at. This will return an IterativeDataSet, which can be transformed with the regular operators. The single argument to the iterate call specifies the maximum number of iterations. + +To specify the end of an iteration call the closeWith(DataSet) method on the IterativeDataSet to specify which transformation should be fed back to the next iteration. You can optionally specify a termination criterion with closeWith(DataSet, DataSet), which evaluates the second DataSet and terminates the iteration, if this DataSet is empty. If no termination criterion is specified, the iteration terminates after the given maximum number iterations. + +The following example iteratively estimates the number Pi. The goal is to count the number of random points, which fall into the unit circle. In each iteration, a random point is picked. If this point lies inside the unit circle, we increment the count. Pi is then estimated as the resulting count divided by the number of iterations multiplied by 4. + +```java +final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +// Create initial IterativeDataSet +IterativeDataSet initial = env.fromElements(0).iterate(10000); + +DataSet iteration = initial.map(new MapFunction() { + @Override + public Integer map(Integer i) throws Exception { + double x = Math.random(); + double y = Math.random(); + + return i + ((x * x + y * y < 1) ? 1 : 0); + } +}); + +// Iteratively transform the IterativeDataSet +DataSet count = initial.closeWith(iteration); + +count.map(new MapFunction() { + @Override + public Double map(Integer count) throws Exception { + return count / (double) 10000 * 4; + } +}).print(); + +env.execute("Iterative Pi Example"); +``` + +#### Delta Iterations + +Delta iterations exploit the fact that certain algorithms do not change every data point of the solution in each iteration. + +In addition to the partial solution that is fed back (called workset) in every iteration, delta iterations maintain state across iterations (called solution set), which can be updated through deltas. The result of the iterative computation is the state after the last iteration. 
Please refer to the Introduction to Iterations for an overview of the basic principle of delta iterations. + +Defining a DeltaIteration is similar to defining a BulkIteration. For delta iterations, two data sets form the input to each iteration (workset and solution set), and two data sets are produced as the result (new workset, solution set delta) in each iteration. + +To create a DeltaIteration call the iterateDelta(DataSet, int, int) (or iterateDelta(DataSet, int, int[]) respectively). This method is called on the initial solution set. The arguments are the initial delta set, the maximum number of iterations and the key positions. The returned DeltaIteration object gives you access to the DataSets representing the workset and solution set via the methods iteration.getWorkset() and iteration.getSolutionSet(). + +Below is an example for the syntax of a delta iteration + +```java +// read the initial data sets +DataSet> initialSolutionSet = // [...] + +DataSet> initialDeltaSet = // [...] + +int maxIterations = 100; +int keyPosition = 0; + +DeltaIteration, Tuple2> iteration = initialSolutionSet + .iterateDelta(initialDeltaSet, maxIterations, keyPosition); + +DataSet> candidateUpdates = iteration.getWorkset() + .groupBy(1) + .reduceGroup(new ComputeCandidateChanges()); + +DataSet> deltas = candidateUpdates + .join(iteration.getSolutionSet()) + .where(0) + .equalTo(0) + .with(new CompareChangesToCurrent()); + +DataSet> nextWorkset = deltas + .filter(new FilterByThreshold()); + +iteration.closeWith(deltas, nextWorkset) + .writeAsCsv(outputPath); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +#### Bulk Iterations + +To create a BulkIteration call the iterate(int) method of the DataSet the iteration should start at and also specify a step function. The step function gets the input DataSet for the current iteration and must return a new DataSet. The parameter of the iterate call is the maximum number of iterations after which to stop. + +There is also the iterateWithTermination(int) function that accepts a step function that returns two DataSets: The result of the iteration step and a termination criterion. The iterations are stopped once the termination criterion DataSet is empty. + +The following example iteratively estimates the number Pi. The goal is to count the number of random points, which fall into the unit circle. In each iteration, a random point is picked. If this point lies inside the unit circle, we increment the count. Pi is then estimated as the resulting count divided by the number of iterations multiplied by 4. + +```scala +val env = ExecutionEnvironment.getExecutionEnvironment() + +// Create initial DataSet +val initial = env.fromElements(0) + +val count = initial.iterate(10000) { iterationInput: DataSet[Int] => + val result = iterationInput.map { i => + val x = Math.random() + val y = Math.random() + i + (if (x * x + y * y < 1) 1 else 0) + } + result +} + +val result = count map { c => c / 10000.0 * 4 } + +result.print() + +env.execute("Iterative Pi Example") +``` + +#### Delta Iterations + +Delta iterations exploit the fact that certain algorithms do not change every data point of the solution in each iteration. + +In addition to the partial solution that is fed back (called workset) in every iteration, delta iterations maintain state across iterations (called solution set), which can be updated through deltas. The result of the iterative computation is the state after the last iteration. 
Please refer to the Introduction to Iterations for an overview of the basic principle of delta iterations. + +Defining a DeltaIteration is similar to defining a BulkIteration. For delta iterations, two data sets form the input to each iteration (workset and solution set), and two data sets are produced as the result (new workset, solution set delta) in each iteration. + +To create a DeltaIteration call the iterateDelta(initialWorkset, maxIterations, key) on the initial solution set. The step function takes two parameters: (solutionSet, workset), and must return two values: (solutionSetDelta, newWorkset). + +Below is an example for the syntax of a delta iteration + +```scala +// read the initial data sets +val initialSolutionSet: DataSet[(Long, Double)] = // [...] + +val initialWorkset: DataSet[(Long, Double)] = // [...] + +val maxIterations = 100 +val keyPosition = 0 + +val result = initialSolutionSet.iterateDelta(initialWorkset, maxIterations, Array(keyPosition)) { + (solution, workset) => + val candidateUpdates = workset.groupBy(1).reduceGroup(new ComputeCandidateChanges()) + val deltas = candidateUpdates.join(solution).where(0).equalTo(0)(new CompareChangesToCurrent()) + + val nextWorkset = deltas.filter(new FilterByThreshold()) + + (deltas, nextWorkset) +} + +result.writeAsCsv(outputPath) + +env.execute() +``` + +{{< /tab >}} +{{< /tabs >}} + +## Operating on Data Objects in Functions + +Flink’s runtime exchanges data with user functions in form of Java objects. Functions receive input objects from the runtime as method parameters and return output objects as result. Because these objects are accessed by user functions and runtime code, it is very important to understand and follow the rules about how the user code may access, i.e., read and modify, these objects. + +User functions receive objects from Flink’s runtime either as regular method parameters (like a MapFunction) or through an Iterable parameter (like a GroupReduceFunction). We refer to objects that the runtime passes to a user function as input objects. User functions can emit objects to the Flink runtime either as a method return value (like a MapFunction) or through a Collector (like a FlatMapFunction). We refer to objects which have been emitted by the user function to the runtime as output objects. + +Flink’s DataSet API features two modes that differ in how Flink’s runtime creates or reuses input objects. This behavior affects the guarantees and constraints for how user functions may interact with input and output objects. The following sections define these rules and give coding guidelines to write safe user function code. + +### Object-Reuse Disabled (DEFAULT) + +By default, Flink operates in object-reuse disabled mode. This mode ensures that functions always receive new input objects within a function call. The object-reuse disabled mode gives better guarantees and is safer to use. However, it comes with a certain processing overhead and might cause higher Java garbage collection activity. The following table explains how user functions may access input and output objects in object-reuse disabled mode. + +| Operation | Guarantees and Restrictions | +|-----------|-----------------------------| +| Reading Input Objects | Within a method call it is guaranteed that the value of an input object does not change. This includes objects served by an Iterable. For example it is safe to collect input objects served by an Iterable in a List or Map. Note that objects may be modified after the method call is left. 
It is not safe to remember objects across function calls. | +| Modifying Input Objects | You may modify input objects. | +| Emitting Input Objects | You may emit input objects. The value of an input object may have changed after it was emitted. It is **not safe** to read an input object after it was emitted. | +| Reading Output Objects | An object that was given to a Collector or returned as method result might have changed its value. It is **not safe** to read an output object. | +| Modifying Output Objects | You may modify an object after it was emitted and emit it again. | + +**Coding guidelines for the object-reuse disabled (default) mode:** + + * Do not remember the read input objects across method calls. + * Do not read objects after you emitted them. + +### Object-Reuse Enabled + +In object-reuse enabled mode, Flink’s runtime minimizes the number of object instantiations. This can improve the performance and can reduce the Java garbage collection pressure. The object-reuse enabled mode is activated by calling `ExecutionConfig.enableObjectReuse()`. The following table explains how user functions may access input and output objects in object-reuse enabled mode. + +| Operation | Guarantees and Restrictions | +|-----------|-----------------------------| +| Reading input objects received as regular method parameters | Input objects received as regular method arguments are not modified within a function call. Objects may be modified after method call is left. It is **not safe** to remember objects across function calls. | +| Reading input objects received from an Iterable parameter | Input objects received from an Iterable are only valid until the next() method is called. An Iterable or Iterator may serve the same object instance multiple times. It is **not safe** to remember input objects received from an Iterable, e.g., by putting them in a List or Map. | +| Modifying Input Objects | You **must not** modify input objects, except for input objects of MapFunction, FlatMapFunction, MapPartitionFunction, GroupReduceFunction, GroupCombineFunction, CoGroupFunction, and InputFormat.next(reuse). | +| Emitting Input Objects | You **must not** emit input objects, except for input objects of MapFunction, FlatMapFunction, MapPartitionFunction, GroupReduceFunction, GroupCombineFunction, CoGroupFunction, and InputFormat.next(reuse). | +| Reading output Objects | An object that was given to a Collector or returned as method result might have changed its value. It is **not safe** to read an output object. | +| Modifying Output Objects | You may modify an output object and emit it again. | + +**Coding guidelines for object-reuse enabled:** + +* Do not remember input objects received from an Iterable. +* Do not remember and read input objects across method calls. +* Do not modify or emit input objects, except for input objects of MapFunction, FlatMapFunction, MapPartitionFunction, GroupReduceFunction, GroupCombineFunction, CoGroupFunction, and InputFormat.next(reuse). +* To reduce object instantiations, you can always emit a dedicated output object which is repeatedly modified but never read. + + +## Debugging + +Before running a data analysis program on a large data set in a distributed cluster, it is a good idea to make sure that the implemented algorithm works as desired. Hence, implementing data analysis programs is usually an incremental process of checking results, debugging, and improving. 
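+
+As a minimal sketch of that workflow (the pipeline below is illustrative and not taken from the examples above), you can inject a small test input and pull the result back to the client with `collect()` to verify a transformation before running it at scale:
+
+```java
+ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+
+// a small, hard-coded test input instead of a production source
+DataSet<Integer> testInput = env.fromElements(1, 2, 3, 4, 5);
+
+// the transformation under test
+DataSet<Integer> doubled = testInput.map(i -> i * 2);
+
+// collect() executes the program and returns the result as a local List
+List<Integer> result = doubled.collect();
+System.out.println(result); // expected: [2, 4, 6, 8, 10]
+```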
+
+Flink provides a few nice features to significantly ease the development process of data analysis programs by supporting local debugging from within an IDE, injection of test data, and collection of result data. This section gives some hints on how to ease the development of Flink programs.
+
+### Local Execution Environment
+
+A LocalEnvironment starts a Flink system within the same JVM process it was created in. If you start the LocalEnvironment from an IDE, you can set breakpoints in your code and easily debug your program.
+
+A LocalEnvironment is created and used as follows:
+
+{{< tabs "localenv" >}}
+{{< tab "Java" >}}
+```java
+final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
+
+DataSet<String> lines = env.readTextFile(pathToTextFile);
+// build your program
+
+env.execute();
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env = ExecutionEnvironment.createLocalEnvironment()
+
+val lines = env.readTextFile(pathToTextFile)
+// build your program
+
+env.execute()
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Collection Data Sources and Sinks
+
+Providing input for an analysis program and checking its output is cumbersome when done by creating input files and reading output files. Flink features special data sources and sinks which are backed by Java collections to ease testing. Once a program has been tested, the sources and sinks can be easily replaced by sources and sinks that read from / write to external data stores such as HDFS.
+
+Collection data sources can be used as follows:
+
+{{< tabs "collectionenv" >}}
+{{< tab "Java" >}}
+```java
+final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
+
+// Create a DataSet from a list of elements
+DataSet<Integer> myInts = env.fromElements(1, 2, 3, 4, 5);
+
+// Create a DataSet from any Java collection
+List<Tuple2<String, Integer>> data = ...
+DataSet<Tuple2<String, Integer>> myTuples = env.fromCollection(data);
+
+// Create a DataSet from an Iterator
+Iterator<Long> longIt = ...
+DataSet<Long> myLongs = env.fromCollection(longIt, Long.class);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env = ExecutionEnvironment.createLocalEnvironment()
+
+// Create a DataSet from a list of elements
+val myInts = env.fromElements(1, 2, 3, 4, 5)
+
+// Create a DataSet from any Collection
+val data: Seq[(String, Int)] = ...
+val myTuples = env.fromCollection(data)
+
+// Create a DataSet from an Iterator
+val longIt: Iterator[Long] = ...
+val myLongs = env.fromCollection(longIt)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Note: Currently, the collection data source requires that data types and iterators implement Serializable. Furthermore, collection data sources cannot be executed in parallel (parallelism = 1).
+
+## Broadcast Variables
+
+Broadcast variables allow you to make a data set available to all parallel instances of an operation, in addition to the regular input of the operation. This is useful for auxiliary data sets, or data-dependent parameterization. The data set will then be accessible at the operator as a Collection.
+
+* **Broadcast**: broadcast sets are registered by name via withBroadcastSet(DataSet, String), and
+* **Access**: accessible via getRuntimeContext().getBroadcastVariable(String) at the target operator.
+
+{{< tabs "broadcastvariable" >}}
+{{< tab "Java" >}}
+```java
+// 1. The DataSet to be broadcast
+DataSet<Integer> toBroadcast = env.fromElements(1, 2, 3);
+
+DataSet<String> data = env.fromElements("a", "b");
+
+data.map(new RichMapFunction<String, String>() {
+    @Override
+    public void open(Configuration parameters) throws Exception {
+      // 3. 
Access the broadcast DataSet as a Collection + Collection broadcastSet = getRuntimeContext().getBroadcastVariable("broadcastSetName"); + } + + + @Override + public String map(String value) throws Exception { + ... + } +}).withBroadcastSet(toBroadcast, "broadcastSetName"); // 2. Broadcast the DataSet +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// 1. The DataSet to be broadcast +val toBroadcast = env.fromElements(1, 2, 3) + +val data = env.fromElements("a", "b") + +data.map(new RichMapFunction[String, String]() { + var broadcastSet: Traversable[String] = null + + override def open(config: Configuration): Unit = { + // 3. Access the broadcast DataSet as a Collection + broadcastSet = getRuntimeContext().getBroadcastVariable[String]("broadcastSetName").asScala + } + + def map(in: String): String = { + ... + } +}).withBroadcastSet(toBroadcast, "broadcastSetName") // 2. Broadcast the DataSet +``` +{{< /tab >}} +{{< /tabs >}} + +Make sure that the names (broadcastSetName in the previous example) match when registering and accessing broadcast data sets. For a complete example program, have a look at K-Means Algorithm. + +Note: As the content of broadcast variables is kept in-memory on each node, it should not become too large. For simpler things like scalar values you can simply make parameters part of the closure of a function, or use the withParameters(...) method to pass in a configuration. + +## Distributed Cache + +Flink offers a distributed cache, similar to Apache Hadoop, to make files locally accessible to parallel instances of user functions. This functionality can be used to share files that contain static external data such as dictionaries or machine-learned regression models. + +The cache works as follows. A program registers a file or directory of a local or remote filesystem such as HDFS or S3 under a specific name in its ExecutionEnvironment as a cached file. When the program is executed, Flink automatically copies the file or directory to the local filesystem of all workers. A user function can look up the file or directory under the specified name and access it from the worker’s local filesystem. + +The distributed cache is used as follows: + +{{< tabs "distributedcache" >}} +{{< tab "Java" >}} +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); + +// register a file from HDFS +env.registerCachedFile("hdfs:///path/to/your/file", "hdfsFile") + +// register a local executable file (script, executable, ...) +env.registerCachedFile("file:///path/to/exec/file", "localExecFile", true) + +// define your program and execute +... +DataSet input = ... +DataSet result = input.map(new MyMapper()); +... +env.execute(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = ExecutionEnvironment.getExecutionEnvironment + +// register a file from HDFS +env.registerCachedFile("hdfs:///path/to/your/file", "hdfsFile") + +// register a local executable file (script, executable, ...) +env.registerCachedFile("file:///path/to/exec/file", "localExecFile", true) + +// define your program and execute +... +val input: DataSet[String] = ... +val result: DataSet[Integer] = input.map(new MyMapper()) +... +env.execute() +``` +{{< /tab >}} +{{< /tabs >}} + +Access the cached file in a user function (here a MapFunction). +The function must extend a RichFunction class because it needs access to the RuntimeContext. 
+ +{{< tabs "distributedcacheread" >}} +{{< tab "Java" >}} +```java +// extend a RichFunction to have access to the RuntimeContext +public final class MyMapper extends RichMapFunction { + + @Override + public void open(Configuration config) { + + // access cached file via RuntimeContext and DistributedCache + File myFile = getRuntimeContext().getDistributedCache().getFile("hdfsFile"); + // read the file (or navigate the directory) + ... + } + + @Override + public Integer map(String value) throws Exception { + // use content of cached file + ... + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// extend a RichFunction to have access to the RuntimeContext +class MyMapper extends RichMapFunction[String, Int] { + + override def open(config: Configuration): Unit = { + + // access cached file via RuntimeContext and DistributedCache + val myFile: File = getRuntimeContext.getDistributedCache.getFile("hdfsFile") + // read the file (or navigate the directory) + ... + } + + override def map(value: String): Int = { + // use content of cached file + ... + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +## Passing Parameters to Functions + +Parameters can be passed to functions using either the constructor or the `withParameters(Configuration)` method. +The parameters are serialized as part of the function object and shipped to all parallel task instances. + +#### Via Constructor + +{{< tabs "constructorparams" >}} +{{< tab "Java" >}} +```java +DataSet toFilter = env.fromElements(1, 2, 3); + +toFilter.filter(new MyFilter(2)); + +private static class MyFilter implements FilterFunction { + + private final int limit; + + public MyFilter(int limit) { + this.limit = limit; + } + + @Override + public boolean filter(Integer value) throws Exception { + return value > limit; + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val toFilter = env.fromElements(1, 2, 3) + +toFilter.filter(new MyFilter(2)) + +class MyFilter(limit: Int) extends FilterFunction[Int] { + override def filter(value: Int): Boolean = { + value > limit + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +#### Via `withParameters(Configuration)` + +{{< tabs "withparams" >}} +{{< tab "Java" >}} +```java +DataSet toFilter = env.fromElements(1, 2, 3); + +Configuration config = new Configuration(); +config.setInteger("limit", 2); + +toFilter.filter(new RichFilterFunction() { + private int limit; + + @Override + public void open(Configuration parameters) throws Exception { + limit = parameters.getInteger("limit", 0); + } + + @Override + public boolean filter(Integer value) throws Exception { + return value > limit; + } +}).withParameters(config); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val toFilter = env.fromElements(1, 2, 3) + +val c = new Configuration() +c.setInteger("limit", 2) + +toFilter.filter(new RichFilterFunction[Int]() { + var limit = 0 + + override def open(config: Configuration): Unit = { + limit = config.getInteger("limit", 0) + } + + def filter(in: Int): Boolean = { + in > limit + } +}).withParameters(c) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Globally via the `ExecutionConfig` + + +Flink also allows to pass custom configuration values to the ExecutionConfig interface of the environment. +Since the execution config is accessible in all (rich) user functions, the custom configuration will be available globally in all functions. 
+ +**Setting a custom global configuration** + +{{< tabs "setexecutionconfig" >}} +{{< tab "Java" >}} +```java +Configuration conf = new Configuration(); +conf.setString("mykey","myvalue"); +final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); +env.getConfig().setGlobalJobParameters(conf); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = ExecutionEnvironment.getExecutionEnvironment +val conf = new Configuration() +conf.setString("mykey", "myvalue") +env.getConfig.setGlobalJobParameters(conf) +``` +{{< /tab >}} +{{< /tabs >}} + +Please note that you can also pass a custom class extending the ExecutionConfig.GlobalJobParameters class as the global job parameters to the execution config. The interface allows to implement the `Map toMap()` method which will in turn show the values from the configuration in the web frontend. + +**Accessing values from the global configuration** + +```java +public static final class Tokenizer extends RichFlatMapFunction> { + + private String mykey; + + @Override + public void open(Configuration parameters) throws Exception { + ExecutionConfig.GlobalJobParameters globalParams = getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); + Configuration globConf = (Configuration) globalParams; + mykey = globConf.getString("mykey", null); + } +``` diff --git a/docs/content.zh/docs/dev/dataset/transformations.md b/docs/content.zh/docs/dev/dataset/transformations.md new file mode 100644 index 0000000000000..fd36ca0412d3c --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/transformations.md @@ -0,0 +1,1923 @@ +--- +title: Transformations +nav-title: Transformations +weight: 2 +type: docs +aliases: + - /zh/dev/batch/dataset_transformations.html +--- + + +# DataSet Transformations + +This document gives a deep-dive into the available transformations on DataSets. For a general introduction to the +Flink Java API, please refer to the [Programming Guide](index.html). + +For zipping elements in a data set with a dense index, please refer to the [Zip Elements Guide](zip_elements_guide.html). + + + +### Map + +The Map transformation applies a user-defined map function on each element of a DataSet. +It implements a one-to-one mapping, that is, exactly one element must be returned by +the function. + +The following code transforms a DataSet of Integer pairs into a DataSet of Integers: + +{{< tabs "3a758074-c167-4b66-a787-90525d451ddb" >}} +{{< tab "Java" >}} + +```java +// MapFunction that adds two integer values +public class IntAdder implements MapFunction, Integer> { + @Override + public Integer map(Tuple2 in) { + return in.f0 + in.f1; + } +} + +// [...] +DataSet> intPairs = // [...] +DataSet intSums = intPairs.map(new IntAdder()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val intPairs: DataSet[(Int, Int)] = // [...] +val intSums = intPairs.map { pair => pair._1 + pair._2 } +``` + +{{< /tab >}} +{{< /tabs >}} +### FlatMap + +The FlatMap transformation applies a user-defined flat-map function on each element of a DataSet. +This variant of a map function can return arbitrary many result elements (including none) for each input element. + +The following code transforms a DataSet of text lines into a DataSet of words: + +{{< tabs "3f1a6873-77f0-4113-8339-7617d3b12e41" >}} +{{< tab "Java" >}} + +```java +// FlatMapFunction that tokenizes a String by whitespace characters and emits all String tokens. 
+public class Tokenizer implements FlatMapFunction { + @Override + public void flatMap(String value, Collector out) { + for (String token : value.split("\\W")) { + out.collect(token); + } + } +} + +// [...] +DataSet textLines = // [...] +DataSet words = textLines.flatMap(new Tokenizer()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val textLines: DataSet[String] = // [...] +val words = textLines.flatMap { _.split(" ") } +``` + +{{< /tab >}} +{{< /tabs >}} +### MapPartition + +MapPartition transforms a parallel partition in a single function call. The map-partition function +gets the partition as Iterable and can produce an arbitrary number of result values. The number of elements in each partition depends on the degree-of-parallelism +and previous operations. + +The following code transforms a DataSet of text lines into a DataSet of counts per partition: + +{{< tabs "a2230f1a-6f85-41a0-86b9-48b9166e18df" >}} +{{< tab "Java" >}} + +```java +public class PartitionCounter implements MapPartitionFunction { + + public void mapPartition(Iterable values, Collector out) { + long c = 0; + for (String s : values) { + c++; + } + out.collect(c); + } +} + +// [...] +DataSet textLines = // [...] +DataSet counts = textLines.mapPartition(new PartitionCounter()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val textLines: DataSet[String] = // [...] +// Some is required because the return value must be a Collection. +// There is an implicit conversion from Option to a Collection. +val counts = texLines.mapPartition { in => Some(in.size) } +``` + +{{< /tab >}} +{{< /tabs >}} +### Filter + +The Filter transformation applies a user-defined filter function on each element of a DataSet and retains only those elements for which the function returns `true`. + +The following code removes all Integers smaller than zero from a DataSet: + +{{< tabs "ac9444c1-31ab-4d7d-9e95-e02ba6599837" >}} +{{< tab "Java" >}} + +```java +// FilterFunction that filters out all Integers smaller than zero. +public class NaturalNumberFilter implements FilterFunction { + @Override + public boolean filter(Integer number) { + return number >= 0; + } +} + +// [...] +DataSet intNumbers = // [...] +DataSet naturalNumbers = intNumbers.filter(new NaturalNumberFilter()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val intNumbers: DataSet[Int] = // [...] +val naturalNumbers = intNumbers.filter { _ > 0 } +``` + +{{< /tab >}} +{{< /tabs >}} +**IMPORTANT:** The system assumes that the function does not modify the elements on which the predicate is applied. Violating this assumption +can lead to incorrect results. + +### Projection of Tuple DataSet + +The Project transformation removes or moves Tuple fields of a Tuple DataSet. +The `project(int...)` method selects Tuple fields that should be retained by their index and defines their order in the output Tuple. + +Projections do not require the definition of a user function. + +The following code shows different ways to apply a Project transformation on a DataSet: + +{{< tabs "91098608-c1e5-4579-bb1d-051d1d06566b" >}} +{{< tab "Java" >}} + +```java +DataSet> in = // [...] +// converts Tuple3 into Tuple2 +DataSet> out = in.project(2,0); +``` + +#### Projection with Type Hint + +Note that the Java compiler cannot infer the return type of `project` operator. This can cause a problem if you call another operator on a result of `project` operator such as: + +```java +DataSet> ds = .... 
+DataSet> ds2 = ds.project(0).distinct(0); +``` + +This problem can be overcome by hinting the return type of `project` operator like this: + +```java +DataSet> ds2 = ds.>project(0).distinct(0); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +Not supported. +``` + +{{< /tab >}} +{{< /tabs >}} +### Transformations on Grouped DataSet + +The reduce operations can operate on grouped data sets. Specifying the key to +be used for grouping can be done in many ways: + +- key expressions +- a key-selector function +- one or more field position keys (Tuple DataSet only) +- Case Class fields (Case Classes only) + +Please look at the reduce examples to see how the grouping keys are specified. + +### Reduce on Grouped DataSet + +A Reduce transformation that is applied on a grouped DataSet reduces each group to a single +element using a user-defined reduce function. +For each group of input elements, a reduce function successively combines pairs of elements into one +element until only a single element for each group remains. + +Note that for a `ReduceFunction` the keyed fields of the returned object should match the input +values. This is because reduce is implicitly combinable and objects emitted from the combine +operator are again grouped by key when passed to the reduce operator. + +#### Reduce on DataSet Grouped by Key Expression + +Key expressions specify one or more fields of each element of a DataSet. Each key expression is +either the name of a public field or a getter method. A dot can be used to drill down into objects. +The key expression "*" selects all fields. +The following code shows how to group a POJO DataSet using key expressions and to reduce it +with a reduce function. + +{{< tabs "1e955a6e-1aa5-4c86-a783-ad52c13d2464" >}} +{{< tab "Java" >}} + +```java +// some ordinary POJO +public class WC { + public String word; + public int count; + // [...] +} + +// ReduceFunction that sums Integer attributes of a POJO +public class WordCounter implements ReduceFunction { + @Override + public WC reduce(WC in1, WC in2) { + return new WC(in1.word, in1.count + in2.count); + } +} + +// [...] +DataSet words = // [...] +DataSet wordCounts = words + // DataSet grouping on field "word" + .groupBy("word") + // apply ReduceFunction on grouped DataSet + .reduce(new WordCounter()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// some ordinary POJO +class WC(val word: String, val count: Int) { + def this() { + this(null, -1) + } + // [...] +} + +val words: DataSet[WC] = // [...] +val wordCounts = words.groupBy("word").reduce { + (w1, w2) => new WC(w1.word, w1.count + w2.count) +} +``` + +{{< /tab >}} +{{< /tabs >}} +#### Reduce on DataSet Grouped by KeySelector Function + +A key-selector function extracts a key value from each element of a DataSet. The extracted key +value is used to group the DataSet. +The following code shows how to group a POJO DataSet using a key-selector function and to reduce it +with a reduce function. + +{{< tabs "a502de54-4f61-488f-a2d1-6511cfbb8a20" >}} +{{< tab "Java" >}} + +```java +// some ordinary POJO +public class WC { + public String word; + public int count; + // [...] +} + +// ReduceFunction that sums Integer attributes of a POJO +public class WordCounter implements ReduceFunction { + @Override + public WC reduce(WC in1, WC in2) { + return new WC(in1.word, in1.count + in2.count); + } +} + +// [...] +DataSet words = // [...] 
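+// group by the word extracted by the SelectWord KeySelector (defined below) and sum the counts per word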
+DataSet wordCounts = words + // DataSet grouping on field "word" + .groupBy(new SelectWord()) + // apply ReduceFunction on grouped DataSet + .reduce(new WordCounter()); + +public class SelectWord implements KeySelector { + @Override + public String getKey(Word w) { + return w.word; + } +} +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// some ordinary POJO +class WC(val word: String, val count: Int) { + def this() { + this(null, -1) + } + // [...] +} + +val words: DataSet[WC] = // [...] +val wordCounts = words.groupBy { _.word } reduce { + (w1, w2) => new WC(w1.word, w1.count + w2.count) +} +``` + +{{< /tab >}} +{{< /tabs >}} +#### Reduce on DataSet Grouped by Field Position Keys (Tuple DataSets only) + +Field position keys specify one or more fields of a Tuple DataSet that are used as grouping keys. +The following code shows how to use field position keys and apply a reduce function + +{{< tabs "6195742f-896b-471b-a35d-75545a9905c3" >}} +{{< tab "Java" >}} + +```java +DataSet> tuples = // [...] +DataSet> reducedTuples = tuples + // group DataSet on first and second field of Tuple + .groupBy(0, 1) + // apply ReduceFunction on grouped DataSet + .reduce(new MyTupleReducer()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val tuples = DataSet[(String, Int, Double)] = // [...] +// group on the first and second Tuple field +val reducedTuples = tuples.groupBy(0, 1).reduce { ... } +``` + +{{< /tab >}} +{{< /tabs >}} +#### Reduce on DataSet grouped by Case Class Fields + +When using Case Classes you can also specify the grouping key using the names of the fields: + +{{< tabs "732febf3-c182-475f-a0e6-44434ea6dd62" >}} +{{< tab "Java" >}} + +```java +Not supported. +``` +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +case class MyClass(val a: String, b: Int, c: Double) +val tuples = DataSet[MyClass] = // [...] +// group on the first and second field +val reducedTuples = tuples.groupBy("a", "b").reduce { ... } +``` + +{{< /tab >}} +{{< /tabs >}} +### GroupReduce on Grouped DataSet + +A GroupReduce transformation that is applied on a grouped DataSet calls a user-defined +group-reduce function for each group. The difference +between this and *Reduce* is that the user defined function gets the whole group at once. +The function is invoked with an Iterable over all elements of a group and can return an arbitrary +number of result elements. + +#### GroupReduce on DataSet Grouped by Field Position Keys (Tuple DataSets only) + +The following code shows how duplicate strings can be removed from a DataSet grouped by Integer. + +{{< tabs "d7c3d396-189e-418b-a326-888172e480dd" >}} +{{< tab "Java" >}} + +```java +public class DistinctReduce + implements GroupReduceFunction, Tuple2> { + + @Override + public void reduce(Iterable> in, Collector> out) { + + Set uniqStrings = new HashSet(); + Integer key = null; + + // add all strings of the group to the set + for (Tuple2 t : in) { + key = t.f0; + uniqStrings.add(t.f1); + } + + // emit all unique strings. + for (String s : uniqStrings) { + out.collect(new Tuple2(key, s)); + } + } +} + +// [...] +DataSet> input = // [...] +DataSet> output = input + .groupBy(0) // group DataSet by the first tuple field + .reduceGroup(new DistinctReduce()); // apply GroupReduceFunction +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, String)] = // [...] 
+val output = input.groupBy(0).reduceGroup { + (in, out: Collector[(Int, String)]) => + in.toSet foreach (out.collect) + } +``` + +{{< /tab >}} +{{< /tabs >}} +#### GroupReduce on DataSet Grouped by Key Expression, KeySelector Function, or Case Class Fields + +Work analogous to [key expressions](#reduce-on-dataset-grouped-by-key-expression), +[key-selector functions](#reduce-on-dataset-grouped-by-keyselector-function), +and [case class fields](#reduce-on-dataset-grouped-by-case-class-fields) in *Reduce* transformations. + + +#### GroupReduce on sorted groups + +A group-reduce function accesses the elements of a group using an Iterable. Optionally, the Iterable can hand out the elements of a group in a specified order. In many cases this can help to reduce the complexity of a user-defined +group-reduce function and improve its efficiency. + +The following code shows another example how to remove duplicate Strings in a DataSet grouped by an Integer and sorted by String. + +{{< tabs "94d3e73d-03e1-4f1b-a5b7-4ebfad7f9409" >}} +{{< tab "Java" >}} + +```java +// GroupReduceFunction that removes consecutive identical elements +public class DistinctReduce + implements GroupReduceFunction, Tuple2> { + + @Override + public void reduce(Iterable> in, Collector> out) { + Integer key = null; + String comp = null; + + for (Tuple2 t : in) { + key = t.f0; + String next = t.f1; + + // check if strings are different + if (comp == null || !next.equals(comp)) { + out.collect(new Tuple2(key, next)); + comp = next; + } + } + } +} + +// [...] +DataSet> input = // [...] +DataSet output = input + .groupBy(0) // group DataSet by first field + .sortGroup(1, Order.ASCENDING) // sort groups on second tuple field + .reduceGroup(new DistinctReduce()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, String)] = // [...] +val output = input.groupBy(0).sortGroup(1, Order.ASCENDING).reduceGroup { + (in, out: Collector[(Int, String)]) => + var prev: (Int, String) = null + for (t <- in) { + if (prev == null || prev != t) + out.collect(t) + prev = t + } + } + +``` + +{{< /tab >}} +{{< /tabs >}} +**Note:** A GroupSort often comes for free if the grouping is established using a sort-based execution strategy of an operator before the reduce operation. + +#### Combinable GroupReduceFunctions + +In contrast to a reduce function, a group-reduce function is not +implicitly combinable. In order to make a group-reduce function +combinable it must implement the `GroupCombineFunction` interface. + +**Important**: The generic input and output types of +the `GroupCombineFunction` interface must be equal to the generic input type +of the `GroupReduceFunction` as shown in the following example: + +{{< tabs "f6680a99-adcb-433f-87d7-390468a7a240" >}} +{{< tab "Java" >}} + +```java +// Combinable GroupReduceFunction that computes a sum. 
+public class MyCombinableGroupReducer implements
+  GroupReduceFunction<Tuple2<String, Integer>, String>,
+  GroupCombineFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>
+{
+  @Override
+  public void reduce(Iterable<Tuple2<String, Integer>> in,
+                     Collector<String> out) {
+
+    String key = null;
+    int sum = 0;
+
+    for (Tuple2<String, Integer> curr : in) {
+      key = curr.f0;
+      sum += curr.f1;
+    }
+    // concat key and sum and emit
+    out.collect(key + "-" + sum);
+  }
+
+  @Override
+  public void combine(Iterable<Tuple2<String, Integer>> in,
+                      Collector<Tuple2<String, Integer>> out) {
+    String key = null;
+    int sum = 0;
+
+    for (Tuple2<String, Integer> curr : in) {
+      key = curr.f0;
+      sum += curr.f1;
+    }
+    // emit tuple with key and sum
+    out.collect(new Tuple2<>(key, sum));
+  }
+}
+```
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+import scala.collection.JavaConverters._
+
+// Combinable GroupReduceFunction that computes a sum.
+class MyCombinableGroupReducer
+  extends GroupReduceFunction[(String, Int), String]
+  with GroupCombineFunction[(String, Int), (String, Int)]
+{
+  override def reduce(
+      in: java.lang.Iterable[(String, Int)],
+      out: Collector[String]): Unit =
+  {
+    val r: (String, Int) =
+      in.iterator.asScala.reduce( (a, b) => (a._1, a._2 + b._2) )
+    // concat key and sum and emit
+    out.collect(r._1 + "-" + r._2)
+  }
+
+  override def combine(
+      in: java.lang.Iterable[(String, Int)],
+      out: Collector[(String, Int)]): Unit =
+  {
+    val r: (String, Int) =
+      in.iterator.asScala.reduce( (a, b) => (a._1, a._2 + b._2) )
+    // emit tuple with key and sum
+    out.collect(r)
+  }
+}
+```
+
+{{< /tab >}}
+{{< /tabs >}}
+### GroupCombine on a Grouped DataSet
+
+The GroupCombine transformation is the generalized form of the combine step in
+the combinable GroupReduceFunction. It is generalized in the sense that it
+allows combining an input type `I` into an arbitrary output type `O`. In contrast,
+the combine step in the GroupReduce only allows combining from input type `I` to
+output type `I`. This is because the reduce step in the GroupReduceFunction
+expects input type `I`.
+
+In some applications, it is desirable to combine a DataSet into an intermediate
+format before performing additional transformations (e.g. to reduce data
+size). This can be achieved with a CombineGroup transformation at very little
+cost.
+
+**Note:** The GroupCombine on a Grouped DataSet is performed in memory with a
+  greedy strategy which may not process all data at once but in multiple
+  steps. It is also performed on the individual partitions without a data
+  exchange like in a GroupReduce transformation. This may lead to partial
+  results.
+
+The following example demonstrates the use of a CombineGroup transformation for
+an alternative WordCount implementation.
+
+{{< tabs "e03a56af-1a35-44c1-b677-40bf2e07674f" >}}
+{{< tab "Java" >}}
+
+```java
+DataSet<String> input = [..] // The words received as input
+
+DataSet<Tuple2<String, Integer>> combinedWords = input
+  .groupBy(0) // group identical words
+  .combineGroup(new GroupCombineFunction<String, Tuple2<String, Integer>>() {
+
+    public void combine(Iterable<String> words, Collector<Tuple2<String, Integer>> out) { // combine
+      String key = null;
+      int count = 0;
+
+      for (String word : words) {
+        key = word;
+        count++;
+      }
+      // emit tuple with word and count
+      out.collect(new Tuple2<>(key, count));
+    }
+});
+
+DataSet<Tuple2<String, Integer>> output = combinedWords
+  .groupBy(0) // group by words again
+  .reduceGroup(new GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() { // group reduce with full data exchange
+
+    public void reduce(Iterable<Tuple2<String, Integer>> words, Collector<Tuple2<String, Integer>> out) {
+      String key = null;
+      int count = 0;
+
+      // sum up the partial counts produced by the combine step
+      for (Tuple2<String, Integer> word : words) {
+        key = word.f0;
+        count += word.f1;
+      }
+      // emit tuple with word and count
+      out.collect(new Tuple2<>(key, count));
+    }
+});
+```
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+val input: DataSet[String] = [..] // The words received as input
+
+val combinedWords: DataSet[(String, Int)] = input
+  .groupBy(0)
+  .combineGroup {
+    (words, out: Collector[(String, Int)]) =>
+      var key: String = null
+      var count = 0
+
+      for (word <- words) {
+        key = word
+        count += 1
+      }
+      out.collect((key, count))
+}
+
+val output: DataSet[(String, Int)] = combinedWords
+  .groupBy(0)
+  .reduceGroup {
+    (words, out: Collector[(String, Int)]) =>
+      var key: String = null
+      var sum = 0
+
+      // sum up the partial counts produced by the combine step
+      for ((word, count) <- words) {
+        key = word
+        sum += count
+      }
+      out.collect((key, sum))
+}
+
+```
+
+{{< /tab >}}
+{{< /tabs >}}
+The above alternative WordCount implementation demonstrates how the GroupCombine
+combines words before performing the GroupReduce transformation. The example is
+just a proof of concept. Note how the combine step changes the type of the
+DataSet, which would normally require an additional Map transformation before
+executing the GroupReduce.
+
+### Aggregate on Grouped Tuple DataSet
+
+There are some common aggregation operations that are frequently used. The Aggregate transformation provides the following built-in aggregation functions:
+
+- Sum,
+- Min, and
+- Max.
+
+The Aggregate transformation can only be applied on a Tuple DataSet and supports only field position keys for grouping.
+
+The following code shows how to apply an Aggregation transformation on a DataSet grouped by field position keys:
+
+{{< tabs "9d8f27c8-217b-4bda-b56d-0606d5ee77a4" >}}
+{{< tab "Java" >}}
+
+```java
+DataSet<Tuple3<Integer, String, Double>> input = // [...]
+DataSet<Tuple3<Integer, String, Double>> output = input
+  .groupBy(1)        // group DataSet on second field
+  .aggregate(SUM, 0) // compute sum of the first field
+  .and(MIN, 2);      // compute minimum of the third field
+```
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+val input: DataSet[(Int, String, Double)] = // [...]
+val output = input.groupBy(1).aggregate(SUM, 0).and(MIN, 2)
+```
+
+{{< /tab >}}
+{{< /tabs >}}
+To apply multiple aggregations on a DataSet, it is necessary to use the `.and()` function after the first aggregate; that is, `.aggregate(SUM, 0).and(MIN, 2)` produces the sum of field 0 and the minimum of field 2 of the original DataSet.
+In contrast, `.aggregate(SUM, 0).aggregate(MIN, 2)` applies an aggregation on an aggregation. In the given example, it would produce the minimum of field 2 after calculating the sum of field 0 grouped by field 1.
+
+**Note:** The set of aggregation functions will be extended in the future.
+
+### MinBy / MaxBy on Grouped Tuple DataSet
+
+The MinBy (MaxBy) transformation selects a single tuple for each group of tuples. The selected tuple is the one whose values of one or more specified fields are minimum (maximum). The fields used for comparison must be valid key fields, i.e., comparable. If multiple tuples have minimum (maximum) field values, an arbitrary one of these tuples is returned.
+
+The following code shows how to select the tuple with the minimum values for the `Integer` and `Double` fields for each group of tuples with the same `String` value from a `DataSet<Tuple3<Integer, String, Double>>`:
+
+{{< tabs "d40f26af-4669-4c36-8193-aca1cf547cdb" >}}
+{{< tab "Java" >}}
+
+```java
+DataSet<Tuple3<Integer, String, Double>> input = // [...]
+DataSet<Tuple3<Integer, String, Double>> output = input
+  .groupBy(1)   // group DataSet on second field
+  .minBy(0, 2); // select tuple with minimum values for first and third field.
+```
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+val input: DataSet[(Int, String, Double)] = // [...]
+val output: DataSet[(Int, String, Double)] = input + .groupBy(1) // group DataSet on second field + .minBy(0, 2) // select tuple with minimum values for first and third field. +``` + +{{< /tab >}} +{{< /tabs >}} +### Reduce on full DataSet + +The Reduce transformation applies a user-defined reduce function to all elements of a DataSet. +The reduce function subsequently combines pairs of elements into one element until only a single element remains. + +The following code shows how to sum all elements of an Integer DataSet: + +{{< tabs "c11498cd-b758-4fd6-ad0b-223458b238e2" >}} +{{< tab "Java" >}} + +```java +// ReduceFunction that sums Integers +public class IntSummer implements ReduceFunction { + @Override + public Integer reduce(Integer num1, Integer num2) { + return num1 + num2; + } +} + +// [...] +DataSet intNumbers = // [...] +DataSet sum = intNumbers.reduce(new IntSummer()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val intNumbers = env.fromElements(1,2,3) +val sum = intNumbers.reduce (_ + _) +``` + +{{< /tab >}} +{{< /tabs >}} +Reducing a full DataSet using the Reduce transformation implies that the final Reduce operation cannot be done in parallel. However, a reduce function is automatically combinable such that a Reduce transformation does not limit scalability for most use cases. + +### GroupReduce on full DataSet + +The GroupReduce transformation applies a user-defined group-reduce function on all elements of a DataSet. +A group-reduce can iterate over all elements of DataSet and return an arbitrary number of result elements. + +The following example shows how to apply a GroupReduce transformation on a full DataSet: + +{{< tabs "9f1bf18b-5a2e-473b-bfc1-331421b52ef2" >}} +{{< tab "Java" >}} + +```java +DataSet input = // [...] +// apply a (preferably combinable) GroupReduceFunction to a DataSet +DataSet output = input.reduceGroup(new MyGroupReducer()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[Int] = // [...] +val output = input.reduceGroup(new MyGroupReducer()) +``` + +{{< /tab >}} +{{< /tabs >}} +**Note:** A GroupReduce transformation on a full DataSet cannot be done in parallel if the +group-reduce function is not combinable. Therefore, this can be a very compute intensive operation. +See the paragraph on "Combinable GroupReduceFunctions" above to learn how to implement a +combinable group-reduce function. + +### GroupCombine on a full DataSet + +The GroupCombine on a full DataSet works similar to the GroupCombine on a +grouped DataSet. The data is partitioned on all nodes and then combined in a +greedy fashion (i.e. only data fitting into memory is combined at once). + +### Aggregate on full Tuple DataSet + +There are some common aggregation operations that are frequently used. The Aggregate transformation +provides the following build-in aggregation functions: + +- Sum, +- Min, and +- Max. + +The Aggregate transformation can only be applied on a Tuple DataSet. + +The following code shows how to apply an Aggregation transformation on a full DataSet: + +{{< tabs "03da5cf6-dec9-4a70-9c6a-b89b0a139c41" >}} +{{< tab "Java" >}} + +```java +DataSet> input = // [...] +DataSet> output = input + .aggregate(SUM, 0) // compute sum of the first field + .and(MIN, 1); // compute minimum of the second field +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, String, Double)] = // [...] 
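+// sum the first field and compute the minimum of the third field over the full DataSet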
+val output = input.aggregate(SUM, 0).and(MIN, 2) + +``` + +{{< /tab >}} +{{< /tabs >}} +**Note:** Extending the set of supported aggregation functions is on our roadmap. + +### MinBy / MaxBy on full Tuple DataSet + +The MinBy (MaxBy) transformation selects a single tuple from a DataSet of tuples. The selected tuple is the tuple whose values of one or more specified fields are minimum (maximum). The fields which are used for comparison must be valid key fields, i.e., comparable. If multiple tuples have minimum (maximum) fields values, an arbitrary tuple of these tuples is returned. + +The following code shows how to select the tuple with the maximum values for the `Integer` and `Double` fields from a `DataSet>`: + +{{< tabs "1b5a62f0-0463-415e-92d9-05aa10df2393" >}} +{{< tab "Java" >}} + +```java +DataSet> input = // [...] +DataSet> output = input + .maxBy(0, 2); // select tuple with maximum values for first and third field. +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, String, Double)] = // [...] +val output: DataSet[(Int, String, Double)] = input + .maxBy(0, 2) // select tuple with maximum values for first and third field. +``` + +{{< /tab >}} +{{< /tabs >}} +### Distinct + +The Distinct transformation computes the DataSet of the distinct elements of the source DataSet. +The following code removes all duplicate elements from the DataSet: + +{{< tabs "47904f04-4af7-4c9d-ae85-e245f13ce3bc" >}} +{{< tab "Java" >}} + +```java +DataSet> input = // [...] +DataSet> output = input.distinct(); + +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, String, Double)] = // [...] +val output = input.distinct() + +``` + +{{< /tab >}} +{{< /tabs >}} +It is also possible to change how the distinction of the elements in the DataSet is decided, using: + +- one or more field position keys (Tuple DataSets only), +- a key-selector function, or +- a key expression. + +#### Distinct with field position keys + +{{< tabs "51da20ca-143d-48a6-bef7-fd74df136083" >}} +{{< tab "Java" >}} + +```java +DataSet> input = // [...] +DataSet> output = input.distinct(0,2); + +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[(Int, Double, String)] = // [...] +val output = input.distinct(0,2) + +``` + +{{< /tab >}} +{{< /tabs >}} +#### Distinct with KeySelector function + +{{< tabs "3f5974bb-f5db-43cb-81f5-fcb653e81902" >}} +{{< tab "Java" >}} + +```java +private static class AbsSelector implements KeySelector { +private static final long serialVersionUID = 1L; + @Override + public Integer getKey(Integer t) { + return Math.abs(t); + } +} +DataSet input = // [...] +DataSet output = input.distinct(new AbsSelector()); + +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input: DataSet[Int] = // [...] +val output = input.distinct {x => Math.abs(x)} + +``` + +{{< /tab >}} +{{< /tabs >}} +#### Distinct with key expression + +{{< tabs "bd8e5712-84ab-4851-9ad1-6aa79cacac37" >}} +{{< tab "Java" >}} + +```java +// some ordinary POJO +public class CustomType { + public String aName; + public int aNumber; + // [...] +} + +DataSet input = // [...] +DataSet output = input.distinct("aName", "aNumber"); + +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// some ordinary POJO +case class CustomType(aName : String, aNumber : Int) { } + +val input: DataSet[CustomType] = // [...] 
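+// two elements are considered duplicates if both aName and aNumber are equal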
+val output = input.distinct("aName", "aNumber") + +``` + +{{< /tab >}} +{{< /tabs >}} +It is also possible to indicate to use all the fields by the wildcard character: + +{{< tabs "2219a467-df97-4f7f-bd6b-783ff95f03d7" >}} +{{< tab "Java" >}} + +```java +DataSet input = // [...] +DataSet output = input.distinct("*"); + +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +// some ordinary POJO +val input: DataSet[CustomType] = // [...] +val output = input.distinct("_") + +``` + +{{< /tab >}} +{{< /tabs >}} +### Join + +The Join transformation joins two DataSets into one DataSet. The elements of both DataSets are joined on one or more keys which can be specified using + +- a key expression +- a key-selector function +- one or more field position keys (Tuple DataSet only). +- Case Class Fields + +There are a few different ways to perform a Join transformation which are shown in the following. + +#### Default Join (Join into Tuple2) + +The default Join transformation produces a new Tuple DataSet with two fields. Each tuple holds a joined element of the first input DataSet in the first tuple field and a matching element of the second input DataSet in the second field. + +The following code shows a default Join transformation using field position keys: + +{{< tabs "e44b7d2f-474f-4b47-bd5b-873b9230fd90" >}} +{{< tab "Java" >}} + +```java +public static class User { public String name; public int zip; } +public static class Store { public Manager mgr; public int zip; } +DataSet input1 = // [...] +DataSet input2 = // [...] +// result dataset is typed as Tuple2 +DataSet> + result = input1.join(input2) + .where("zip") // key of the first input (users) + .equalTo("zip"); // key of the second input (stores) +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input1: DataSet[(Int, String)] = // [...] +val input2: DataSet[(Double, Int)] = // [...] +val result = input1.join(input2).where(0).equalTo(1) +``` + +{{< /tab >}} +{{< /tabs >}} +#### Join with Join Function + +A Join transformation can also call a user-defined join function to process joining tuples. +A join function receives one element of the first input DataSet and one element of the second input DataSet and returns exactly one element. + +The following code performs a join of DataSet with custom java objects and a Tuple DataSet using key-selector functions and shows how to use a user-defined join function: + +{{< tabs "260fd4ce-1c12-44cd-a22a-dbb7de50c1b9" >}} +{{< tab "Java" >}} + +```java +// some POJO +public class Rating { + public String name; + public String category; + public int points; +} + +// Join function that joins a custom POJO with a Tuple +public class PointWeighter + implements JoinFunction, Tuple2> { + + @Override + public Tuple2 join(Rating rating, Tuple2 weight) { + // multiply the points and rating and construct a new output tuple + return new Tuple2(rating.name, rating.points * weight.f1); + } +} + +DataSet ratings = // [...] +DataSet> weights = // [...] +DataSet> + weightedRatings = + ratings.join(weights) + + // key of the first input + .where("category") + + // key of the second input + .equalTo("f0") + + // applying the JoinFunction on joining pairs + .with(new PointWeighter()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +case class Rating(name: String, category: String, points: Int) + +val ratings: DataSet[Ratings] = // [...] +val weights: DataSet[(String, Double)] = // [...] 
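+// join each Rating with the weight of its category and scale its points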
+ +val weightedRatings = ratings.join(weights).where("category").equalTo(0) { + (rating, weight) => (rating.name, rating.points * weight._2) +} +``` + +{{< /tab >}} +{{< /tabs >}} +#### Join with Flat-Join Function + +Analogous to Map and FlatMap, a FlatJoin behaves in the same +way as a Join, but instead of returning one element, it can +return (collect), zero, one, or more elements. + +{{< tabs "1dcde17c-e76e-490d-affc-efd891138e6b" >}} +{{< tab "Java" >}} + +```java +public class PointWeighter + implements FlatJoinFunction, Tuple2> { + @Override + public void join(Rating rating, Tuple2 weight, + Collector> out) { + if (weight.f1 > 0.1) { + out.collect(new Tuple2(rating.name, rating.points * weight.f1)); + } + } +} + +DataSet> + weightedRatings = + ratings.join(weights) // [...] +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +case class Rating(name: String, category: String, points: Int) + +val ratings: DataSet[Ratings] = // [...] +val weights: DataSet[(String, Double)] = // [...] + +val weightedRatings = ratings.join(weights).where("category").equalTo(0) { + (rating, weight, out: Collector[(String, Double)]) => + if (weight._2 > 0.1) out.collect(rating.name, rating.points * weight._2) +} + +``` + +{{< /tab >}} +{{< /tabs >}} +#### Join with Projection (Java Only) + +A Join transformation can construct result tuples using a projection as shown here: + +{{< tabs "1b86db6d-cf3d-484a-b478-110af7482e64" >}} +{{< tab "Java" >}} + +```java +DataSet> input1 = // [...] +DataSet> input2 = // [...] +DataSet> + result = + input1.join(input2) + // key definition on first DataSet using a field position key + .where(0) + // key definition of second DataSet using a field position key + .equalTo(0) + // select and reorder fields of matching tuples + .projectFirst(0,2).projectSecond(1).projectFirst(1); +``` + +`projectFirst(int...)` and `projectSecond(int...)` select the fields of the first and second joined input that should be assembled into an output Tuple. The order of indexes defines the order of fields in the output tuple. +The join projection works also for non-Tuple DataSets. In this case, `projectFirst()` or `projectSecond()` must be called without arguments to add a joined element to the output Tuple. + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +Not supported. +``` + +{{< /tab >}} +{{< /tabs >}} + +#### Join with DataSet Size Hint + +In order to guide the optimizer to pick the right execution strategy, you can hint the size of a DataSet to join as shown here: + +{{< tabs "65a746da-b8a7-4be4-a39a-54fac41d8cb6" >}} +{{< tab "Java" >}} + +```java +DataSet> input1 = // [...] +DataSet> input2 = // [...] + +DataSet, Tuple2>> + result1 = + // hint that the second DataSet is very small + input1.joinWithTiny(input2) + .where(0) + .equalTo(0); + +DataSet, Tuple2>> + result2 = + // hint that the second DataSet is very large + input1.joinWithHuge(input2) + .where(0) + .equalTo(0); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input1: DataSet[(Int, String)] = // [...] +val input2: DataSet[(Int, String)] = // [...] + +// hint that the second DataSet is very small +val result1 = input1.joinWithTiny(input2).where(0).equalTo(0) + +// hint that the second DataSet is very large +val result1 = input1.joinWithHuge(input2).where(0).equalTo(0) + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Join Algorithm Hints + +The Flink runtime can execute joins in various ways. Each possible way outperforms the others under +different circumstances. 
The system tries to pick a reasonable way automatically, but allows you +to manually pick a strategy, in case you want to enforce a specific way of executing the join. + +{{< tabs "4de39b7c-3f5c-4002-8b0d-65027fd3dbcb" >}} +{{< tab "Java" >}} + +```java +DataSet input1 = // [...] +DataSet input2 = // [...] + +DataSet result = + input1.join(input2, JoinHint.BROADCAST_HASH_FIRST) + .where("id").equalTo("key"); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input1: DataSet[SomeType] = // [...] +val input2: DataSet[AnotherType] = // [...] + +// hint that the second DataSet is very small +val result1 = input1.join(input2, JoinHint.BROADCAST_HASH_FIRST).where("id").equalTo("key") + +``` + +{{< /tab >}} +{{< /tabs >}} + +The following hints are available: + +* `OPTIMIZER_CHOOSES`: Equivalent to not giving a hint at all, leaves the choice to the system. + +* `BROADCAST_HASH_FIRST`: Broadcasts the first input and builds a hash table from it, which is + probed by the second input. A good strategy if the first input is very small. + +* `BROADCAST_HASH_SECOND`: Broadcasts the second input and builds a hash table from it, which is + probed by the first input. A good strategy if the second input is very small. + +* `REPARTITION_HASH_FIRST`: The system partitions (shuffles) each input (unless the input is already + partitioned) and builds a hash table from the first input. This strategy is good if the first + input is smaller than the second, but both inputs are still large. + *Note:* This is the default fallback strategy that the system uses if no size estimates can be made + and no pre-existing partitions and sort-orders can be re-used. + +* `REPARTITION_HASH_SECOND`: The system partitions (shuffles) each input (unless the input is already + partitioned) and builds a hash table from the second input. This strategy is good if the second + input is smaller than the first, but both inputs are still large. + +* `REPARTITION_SORT_MERGE`: The system partitions (shuffles) each input (unless the input is already + partitioned) and sorts each input (unless it is already sorted). The inputs are joined by + a streamed merge of the sorted inputs. This strategy is good if one or both of the inputs are + already sorted. + + +### OuterJoin + +The OuterJoin transformation performs a left, right, or full outer join on two data sets. Outer joins are similar to regular (inner) joins and create all pairs of elements that are equal on their keys. In addition, records of the "outer" side (left, right, or both in case of full) are preserved if no matching key is found in the other side. Matching pair of elements (or one element and a `null` value for the other input) are given to a `JoinFunction` to turn the pair of elements into a single element, or to a `FlatJoinFunction` to turn the pair of elements into arbitrarily many (including none) elements. + +The elements of both DataSets are joined on one or more keys which can be specified using + +- a key expression +- a key-selector function +- one or more field position keys (Tuple DataSet only). +- Case Class Fields + +**OuterJoins are only supported for the Java and Scala DataSet API.** + +#### OuterJoin with Join Function + +A OuterJoin transformation calls a user-defined join function to process joining tuples. +A join function receives one element of the first input DataSet and one element of the second input DataSet and returns exactly one element. 
Depending on the type of the outer join (left, right, full) one of both input elements of the join function can be `null`. + +The following code performs a left outer join of DataSet with custom java objects and a Tuple DataSet using key-selector functions and shows how to use a user-defined join function: + +{{< tabs "769adbfd-3f74-40d5-b762-cdf76ca42e13" >}} +{{< tab "Java" >}} + +```java +// some POJO +public class Rating { + public String name; + public String category; + public int points; +} + +// Join function that joins a custom POJO with a Tuple +public class PointAssigner + implements JoinFunction, Rating, Tuple2> { + + @Override + public Tuple2 join(Tuple2 movie, Rating rating) { + // Assigns the rating points to the movie. + // NOTE: rating might be null + return new Tuple2(movie.f0, rating == null ? -1 : rating.points; + } +} + +DataSet> movies = // [...] +DataSet ratings = // [...] +DataSet> + moviesWithPoints = + movies.leftOuterJoin(ratings) + + // key of the first input + .where("f0") + + // key of the second input + .equalTo("name") + + // applying the JoinFunction on joining pairs + .with(new PointAssigner()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +case class Rating(name: String, category: String, points: Int) + +val movies: DataSet[(String, String)] = // [...] +val ratings: DataSet[Ratings] = // [...] + +val moviesWithPoints = movies.leftOuterJoin(ratings).where(0).equalTo("name") { + (movie, rating) => (movie._1, if (rating == null) -1 else rating.points) +} +``` + +{{< /tab >}} +{{< /tabs >}} + +#### OuterJoin with Flat-Join Function + +Analogous to Map and FlatMap, an OuterJoin with flat-join function behaves in the same +way as an OuterJoin with join function, but instead of returning one element, it can +return (collect), zero, one, or more elements. + +{{< tabs "a00c96d4-2fa2-4531-927c-18001fd99002" >}} +{{< tab "Java" >}} + +```java +public class PointAssigner + implements FlatJoinFunction, Rating, Tuple2> { + @Override + public void join(Tuple2 movie, Rating rating, + Collector> out) { + if (rating == null ) { + out.collect(new Tuple2(movie.f0, -1)); + } else if (rating.points < 10) { + out.collect(new Tuple2(movie.f0, rating.points)); + } else { + // do not emit + } +} + +DataSet> + moviesWithPoints = + movies.leftOuterJoin(ratings) // [...] +``` + +{{< /tab >}} +{{< tab "Scala" >}} +Not supported. +{{< /tab >}} +{{< /tabs >}} + +#### Join Algorithm Hints + +The Flink runtime can execute outer joins in various ways. Each possible way outperforms the others under +different circumstances. The system tries to pick a reasonable way automatically, but allows you +to manually pick a strategy, in case you want to enforce a specific way of executing the outer join. + +{{< tabs "fbfb3a6e-a27f-4578-b838-df2180c8e19d" >}} +{{< tab "Java" >}} + +```java +DataSet input1 = // [...] +DataSet input2 = // [...] + +DataSet result1 = + input1.leftOuterJoin(input2, JoinHint.REPARTITION_SORT_MERGE) + .where("id").equalTo("key"); + +DataSet result2 = + input1.rightOuterJoin(input2, JoinHint.BROADCAST_HASH_FIRST) + .where("id").equalTo("key"); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input1: DataSet[SomeType] = // [...] +val input2: DataSet[AnotherType] = // [...] 
+ +// hint that the second DataSet is very small +val result1 = input1.leftOuterJoin(input2, JoinHint.REPARTITION_SORT_MERGE).where("id").equalTo("key") + +val result2 = input1.rightOuterJoin(input2, JoinHint.BROADCAST_HASH_FIRST).where("id").equalTo("key") + +``` + +{{< /tab >}} +{{< /tabs >}} +The following hints are available. + +* `OPTIMIZER_CHOOSES`: Equivalent to not giving a hint at all, leaves the choice to the system. + +* `BROADCAST_HASH_FIRST`: Broadcasts the first input and builds a hash table from it, which is + probed by the second input. A good strategy if the first input is very small. + +* `BROADCAST_HASH_SECOND`: Broadcasts the second input and builds a hash table from it, which is + probed by the first input. A good strategy if the second input is very small. + +* `REPARTITION_HASH_FIRST`: The system partitions (shuffles) each input (unless the input is already + partitioned) and builds a hash table from the first input. This strategy is good if the first + input is smaller than the second, but both inputs are still large. + +* `REPARTITION_HASH_SECOND`: The system partitions (shuffles) each input (unless the input is already + partitioned) and builds a hash table from the second input. This strategy is good if the second + input is smaller than the first, but both inputs are still large. + +* `REPARTITION_SORT_MERGE`: The system partitions (shuffles) each input (unless the input is already + partitioned) and sorts each input (unless it is already sorted). The inputs are joined by + a streamed merge of the sorted inputs. This strategy is good if one or both of the inputs are + already sorted. + +**NOTE:** Not all execution strategies are supported by every outer join type, yet. + +* `LeftOuterJoin` supports: + * `OPTIMIZER_CHOOSES` + * `BROADCAST_HASH_SECOND` + * `REPARTITION_HASH_SECOND` + * `REPARTITION_SORT_MERGE` + +* `RightOuterJoin` supports: + * `OPTIMIZER_CHOOSES` + * `BROADCAST_HASH_FIRST` + * `REPARTITION_HASH_FIRST` + * `REPARTITION_SORT_MERGE` + +* `FullOuterJoin` supports: + * `OPTIMIZER_CHOOSES` + * `REPARTITION_SORT_MERGE` + + +### Cross + +The Cross transformation combines two DataSets into one DataSet. It builds all pairwise combinations of the elements of both input DataSets, i.e., it builds a Cartesian product. +The Cross transformation either calls a user-defined cross function on each pair of elements or outputs a Tuple2. Both modes are shown in the following. + +**Note:** Cross is potentially a *very* compute-intensive operation which can challenge even large compute clusters! + +#### Cross with User-Defined Function + +A Cross transformation can call a user-defined cross function. A cross function receives one element of the first input and one element of the second input and returns exactly one result element. + +The following code shows how to apply a Cross transformation on two DataSets using a cross function: + +{{< tabs "ad2606a0-151e-4cc5-be3c-d3998439d8c0" >}} +{{< tab "Java" >}} + +```java +public class Coord { + public int id; + public int x; + public int y; +} + +// CrossFunction computes the Euclidean distance between two Coord objects. +public class EuclideanDistComputer + implements CrossFunction> { + + @Override + public Tuple3 cross(Coord c1, Coord c2) { + // compute Euclidean distance of coordinates + double dist = sqrt(pow(c1.x - c2.x, 2) + pow(c1.y - c2.y, 2)); + return new Tuple3(c1.id, c2.id, dist); + } +} + +DataSet coords1 = // [...] +DataSet coords2 = // [...] 
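+// build all pairwise combinations (Cartesian product) of coords1 and coords2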
+DataSet> + distances = + coords1.cross(coords2) + // apply CrossFunction + .with(new EuclideanDistComputer()); +``` + +#### Cross with Projection + +A Cross transformation can also construct result tuples using a projection as shown here: + +```java +DataSet> input1 = // [...] +DataSet> input2 = // [...] +DataSet + result = + input1.cross(input2) + // select and reorder fields of matching tuples + .projectSecond(0).projectFirst(1,0).projectSecond(1); +``` + +The field selection in a Cross projection works the same way as in the projection of Join results. + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +case class Coord(id: Int, x: Int, y: Int) + +val coords1: DataSet[Coord] = // [...] +val coords2: DataSet[Coord] = // [...] + +val distances = coords1.cross(coords2) { + (c1, c2) => + val dist = sqrt(pow(c1.x - c2.x, 2) + pow(c1.y - c2.y, 2)) + (c1.id, c2.id, dist) +} +``` + + +{{< /tab >}} +{{< /tabs >}} +#### Cross with DataSet Size Hint + +In order to guide the optimizer to pick the right execution strategy, you can hint the size of a DataSet to cross as shown here: + +{{< tabs "073c8892-cbfe-4fe6-b3bd-bf406565e48b" >}} +{{< tab "Java" >}} + +```java +DataSet> input1 = // [...] +DataSet> input2 = // [...] + +DataSet> + udfResult = + // hint that the second DataSet is very small + input1.crossWithTiny(input2) + // apply any Cross function (or projection) + .with(new MyCrosser()); + +DataSet> + projectResult = + // hint that the second DataSet is very large + input1.crossWithHuge(input2) + // apply a projection (or any Cross function) + .projectFirst(0,1).projectSecond(1); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val input1: DataSet[(Int, String)] = // [...] +val input2: DataSet[(Int, String)] = // [...] + +// hint that the second DataSet is very small +val result1 = input1.crossWithTiny(input2) + +// hint that the second DataSet is very large +val result1 = input1.crossWithHuge(input2) + +``` + +{{< /tab >}} +{{< /tabs >}} +### CoGroup + +The CoGroup transformation jointly processes groups of two DataSets. Both DataSets are grouped on a defined key and groups of both DataSets that share the same key are handed together to a user-defined co-group function. If for a specific key only one DataSet has a group, the co-group function is called with this group and an empty group. +A co-group function can separately iterate over the elements of both groups and return an arbitrary number of result elements. + +Similar to Reduce, GroupReduce, and Join, keys can be defined using the different key-selection methods. + +#### CoGroup on DataSets + +{{< tabs "48677022-ad18-4a24-b5b8-b51171eba128" >}} +{{< tab "Java" >}} + +The example shows how to group by Field Position Keys (Tuple DataSets only). You can do the same with Pojo-types and key expressions. + +```java +// Some CoGroupFunction definition +class MyCoGrouper + implements CoGroupFunction, Tuple2, Double> { + + @Override + public void coGroup(Iterable> iVals, + Iterable> dVals, + Collector out) { + + Set ints = new HashSet(); + + // add all Integer values in group to set + for (Tuple2> val : iVals) { + ints.add(val.f1); + } + + // multiply each Double value with each unique Integer values of group + for (Tuple2 val : dVals) { + for (Integer i : ints) { + out.collect(val.f1 * i); + } + } + } +} + +// [...] +DataSet> iVals = // [...] +DataSet> dVals = // [...] 
+DataSet output = iVals.coGroup(dVals) + // group first DataSet on first tuple field + .where(0) + // group second DataSet on first tuple field + .equalTo(0) + // apply CoGroup function on each pair of groups + .with(new MyCoGrouper()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val iVals: DataSet[(String, Int)] = // [...] +val dVals: DataSet[(String, Double)] = // [...] + +val output = iVals.coGroup(dVals).where(0).equalTo(0) { + (iVals, dVals, out: Collector[Double]) => + val ints = iVals map { _._2 } toSet + + for (dVal <- dVals) { + for (i <- ints) { + out.collect(dVal._2 * i) + } + } +} +``` + +{{< /tab >}} +{{< /tabs >}} + +### Union + +Produces the union of two DataSets, which have to be of the same type. A union of more than two DataSets can be implemented with multiple union calls, as shown here: + +{{< tabs "8e81e50d-0055-45fc-9c32-971149b2665e" >}} +{{< tab "Java" >}} + +```java +DataSet> vals1 = // [...] +DataSet> vals2 = // [...] +DataSet> vals3 = // [...] +DataSet> unioned = vals1.union(vals2).union(vals3); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val vals1: DataSet[(String, Int)] = // [...] +val vals2: DataSet[(String, Int)] = // [...] +val vals3: DataSet[(String, Int)] = // [...] + +val unioned = vals1.union(vals2).union(vals3) +``` + +{{< /tab >}} +{{< /tabs >}} +### Rebalance +Evenly rebalances the parallel partitions of a DataSet to eliminate data skew. + +{{< tabs "e3255bfc-4d57-435c-ae39-a15aae75691b" >}} +{{< tab "Java" >}} + +```java +DataSet in = // [...] +// rebalance DataSet and apply a Map transformation. +DataSet> out = in.rebalance() + .map(new Mapper()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val in: DataSet[String] = // [...] +// rebalance DataSet and apply a Map transformation. +val out = in.rebalance().map { ... } +``` + +{{< /tab >}} +{{< /tabs >}} + +### Hash-Partition + +Hash-partitions a DataSet on a given key. +Keys can be specified as position keys, expression keys, and key selector functions (see [Reduce examples](#reduce-on-grouped-dataset) for how to specify keys). + +{{< tabs "b71f7167-56c1-4e1b-acc2-1f47c1fca0a0" >}} +{{< tab "Java" >}} + +```java +DataSet> in = // [...] +// hash-partition DataSet by String value and apply a MapPartition transformation. +DataSet> out = in.partitionByHash(0) + .mapPartition(new PartitionMapper()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val in: DataSet[(String, Int)] = // [...] +// hash-partition DataSet by String value and apply a MapPartition transformation. +val out = in.partitionByHash(0).mapPartition { ... } +``` + +{{< /tab >}} +{{< /tabs >}} +### Range-Partition + +Range-partitions a DataSet on a given key. +Keys can be specified as position keys, expression keys, and key selector functions (see [Reduce examples](#reduce-on-grouped-dataset) for how to specify keys). + +{{< tabs "2d618ef2-4286-4536-b485-c3a8e3f49b86" >}} +{{< tab "Java" >}} + +```java +DataSet> in = // [...] +// range-partition DataSet by String value and apply a MapPartition transformation. +DataSet> out = in.partitionByRange(0) + .mapPartition(new PartitionMapper()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val in: DataSet[(String, Int)] = // [...] +// range-partition DataSet by String value and apply a MapPartition transformation. +val out = in.partitionByRange(0).mapPartition { ... } +``` + +{{< /tab >}} +{{< /tabs >}} + +### Sort Partition + +Locally sorts all partitions of a DataSet on a specified field in a specified order. 
+Fields can be specified as field expressions or field positions (see [Reduce examples](#reduce-on-grouped-dataset) for how to specify keys). +Partitions can be sorted on multiple fields by chaining `sortPartition()` calls. + +{{< tabs "6d24783a-75e1-44c9-a7e7-633ee75c9b47" >}} +{{< tab "Java" >}} + +```java +DataSet> in = // [...] +// Locally sort partitions in ascending order on the second String field and +// in descending order on the first String field. +// Apply a MapPartition transformation on the sorted partitions. +DataSet> out = in.sortPartition(1, Order.ASCENDING) + .sortPartition(0, Order.DESCENDING) + .mapPartition(new PartitionMapper()); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val in: DataSet[(String, Int)] = // [...] +// Locally sort partitions in ascending order on the second String field and +// in descending order on the first String field. +// Apply a MapPartition transformation on the sorted partitions. +val out = in.sortPartition(1, Order.ASCENDING) + .sortPartition(0, Order.DESCENDING) + .mapPartition { ... } +``` + +{{< /tab >}} +{{< /tabs >}} +### First-n + +Returns the first n (arbitrary) elements of a DataSet. First-n can be applied on a regular DataSet, a grouped DataSet, or a grouped-sorted DataSet. Grouping keys can be specified as key-selector functions or field position keys (see [Reduce examples](#reduce-on-grouped-dataset) for how to specify keys). + +{{< tabs "250a1c92-b5b0-4b34-a307-6d50e93a0508" >}} +{{< tab "Java" >}} + +```java +DataSet> in = // [...] +// Return the first five (arbitrary) elements of the DataSet +DataSet> out1 = in.first(5); + +// Return the first two (arbitrary) elements of each String group +DataSet> out2 = in.groupBy(0) + .first(2); + +// Return the first three elements of each String group ordered by the Integer field +DataSet> out3 = in.groupBy(0) + .sortGroup(1, Order.ASCENDING) + .first(3); +``` + +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +val in: DataSet[(String, Int)] = // [...] +// Return the first five (arbitrary) elements of the DataSet +val out1 = in.first(5) + +// Return the first two (arbitrary) elements of each String group +val out2 = in.groupBy(0).first(2) + +// Return the first three elements of each String group ordered by the Integer field +val out3 = in.groupBy(0).sortGroup(1, Order.ASCENDING).first(3) +``` + +{{< /tab >}} +{{< /tabs >}} + diff --git a/docs/content.zh/docs/dev/dataset/zip_elements_guide.md b/docs/content.zh/docs/dev/dataset/zip_elements_guide.md new file mode 100644 index 0000000000000..b4ac43ec24ca6 --- /dev/null +++ b/docs/content.zh/docs/dev/dataset/zip_elements_guide.md @@ -0,0 +1,124 @@ +--- +title: Zipping Elements +nav-title: Zipping Elements +weight: 3 +type: docs +aliases: + - /zh/dev/batch/zip_elements_guide.html +--- + + +# Zipping Elements in a DataSet + +In certain algorithms, one may need to assign unique identifiers to data set elements. +This document shows how {{< gh_link file="/flink-java/src/main/java/org/apache/flink/api/java/utils/DataSetUtils.java" name="DataSetUtils" >}} can be used for that purpose. + +### Zip with a Dense Index + +`zipWithIndex` assigns consecutive labels to the elements, receiving a data set as input and returning a new data set of `(unique id, initial value)` 2-tuples. +This process requires two passes, first counting then labeling elements, and cannot be pipelined due to the synchronization of counts. +The alternative `zipWithUniqueId` works in a pipelined fashion and is preferred when a unique labeling is sufficient. 
+For example, the following code: + +{{< tabs "083bbdc6-b9f9-4989-86a8-f32f0ac53111" >}} +{{< tab "Java" >}} +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); +env.setParallelism(2); +DataSet in = env.fromElements("A", "B", "C", "D", "E", "F", "G", "H"); + +DataSet> result = DataSetUtils.zipWithIndex(in); + +result.writeAsCsv(resultPath, "\n", ","); +env.execute(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ + +val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment +env.setParallelism(2) +val input: DataSet[String] = env.fromElements("A", "B", "C", "D", "E", "F", "G", "H") + +val result: DataSet[(Long, String)] = input.zipWithIndex + +result.writeAsCsv(resultPath, "\n", ",") +env.execute() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from flink.plan.Environment import get_environment + +env = get_environment() +env.set_parallelism(2) +input = env.from_elements("A", "B", "C", "D", "E", "F", "G", "H") + +result = input.zip_with_index() + +result.write_text(result_path) +env.execute() +``` +{{< /tab >}} +{{< /tabs >}} + +may yield the tuples: (0,G), (1,H), (2,A), (3,B), (4,C), (5,D), (6,E), (7,F) + +{{< top >}} + +### Zip with a Unique Identifier + +In many cases one may not need to assign consecutive labels. +`zipWithUniqueId` works in a pipelined fashion, speeding up the label assignment process. This method receives a data set as input and returns a new data set of `(unique id, initial value)` 2-tuples. +For example, the following code: + +{{< tabs "49a5535f-7835-4204-afd4-40bb1cbfa404" >}} +{{< tab "Java" >}} +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); +env.setParallelism(2); +DataSet in = env.fromElements("A", "B", "C", "D", "E", "F", "G", "H"); + +DataSet> result = DataSetUtils.zipWithUniqueId(in); + +result.writeAsCsv(resultPath, "\n", ","); +env.execute(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ + +val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment +env.setParallelism(2) +val input: DataSet[String] = env.fromElements("A", "B", "C", "D", "E", "F", "G", "H") + +val result: DataSet[(Long, String)] = input.zipWithUniqueId + +result.writeAsCsv(resultPath, "\n", ",") +env.execute() +``` +{{< /tab >}} +{{< /tabs >}} + +may yield the tuples: (0,G), (1,A), (2,H), (3,B), (5,C), (7,D), (9,E), (11,F) + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/_index.md b/docs/content.zh/docs/dev/datastream/_index.md new file mode 100644 index 0000000000000..1a3281350eb61 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/_index.md @@ -0,0 +1,23 @@ +--- +title: DataStream API +bookCollapseSection: true +weight: 1 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/datastream/application_parameters.md b/docs/content.zh/docs/dev/datastream/application_parameters.md new file mode 100644 index 0000000000000..29a2024ee1356 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/application_parameters.md @@ -0,0 +1,141 @@ +--- +title: "Handling Application Parameters" +weight: 51 +type: docs +aliases: + - /zh/dev/application_parameters.html +--- + + +# Handling Application Parameters + + + +Handling Application Parameters +------------------------------- +Almost all Flink applications, both batch and streaming, rely on external configuration parameters. 
+They are used to specify input and output sources (like paths or addresses), system parameters (parallelism, runtime configuration), and application specific parameters (typically used within user functions). + +Flink provides a simple utility called `ParameterTool` to provide some basic tooling for solving these problems. +Please note that you don't have to use the `ParameterTool` described here. Other frameworks such as [Commons CLI](https://commons.apache.org/proper/commons-cli/) and +[argparse4j](http://argparse4j.sourceforge.net/) also work well with Flink. + + +### Getting your configuration values into the `ParameterTool` + +The `ParameterTool` provides a set of predefined static methods for reading the configuration. The tool is internally expecting a `Map`, so it's very easy to integrate it with your own configuration style. + + +#### From `.properties` files + +The following method will read a [Properties](https://docs.oracle.com/javase/tutorial/essential/environment/properties.html) file and provide the key/value pairs: +```java +String propertiesFilePath = "/home/sam/flink/myjob.properties"; +ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFilePath); + +File propertiesFile = new File(propertiesFilePath); +ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFile); + +InputStream propertiesFileInputStream = new FileInputStream(file); +ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFileInputStream); +``` + + +#### From the command line arguments + +This allows getting arguments like `--input hdfs:///mydata --elements 42` from the command line. +```java +public static void main(String[] args) { + ParameterTool parameter = ParameterTool.fromArgs(args); + // .. regular code .. +``` + + +#### From system properties + +When starting a JVM, you can pass system properties to it: `-Dinput=hdfs:///mydata`. You can also initialize the `ParameterTool` from these system properties: + +```java +ParameterTool parameter = ParameterTool.fromSystemProperties(); +``` + + +### Using the parameters in your Flink program + +Now that we've got the parameters from somewhere (see above) we can use them in various ways. + +**Directly from the `ParameterTool`** + +The `ParameterTool` itself has methods for accessing the values. +```java +ParameterTool parameters = // ... +parameter.getRequired("input"); +parameter.get("output", "myDefaultValue"); +parameter.getLong("expectedCount", -1L); +parameter.getNumberOfParameters() +// .. there are more methods available. +``` + +You can use the return values of these methods directly in the `main()` method of the client submitting the application. +For example, you could set the parallelism of a operator like this: + +```java +ParameterTool parameters = ParameterTool.fromArgs(args); +int parallelism = parameters.get("mapParallelism", 2); +DataSet> counts = text.flatMap(new Tokenizer()).setParallelism(parallelism); +``` + +Since the `ParameterTool` is serializable, you can pass it to the functions itself: + +```java +ParameterTool parameters = ParameterTool.fromArgs(args); +DataSet> counts = text.flatMap(new Tokenizer(parameters)); +``` + +and then use it inside the function for getting values from the command line. + +#### Register the parameters globally + +Parameters registered as global job parameters in the `ExecutionConfig` can be accessed as configuration values from the JobManager web interface and in all functions defined by the user. 
+ +Register the parameters globally: + +```java +ParameterTool parameters = ParameterTool.fromArgs(args); + +// set up the execution environment +final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); +env.getConfig().setGlobalJobParameters(parameters); +``` + +Access them in any rich user function: + +```java +public static final class Tokenizer extends RichFlatMapFunction> { + + @Override + public void flatMap(String value, Collector> out) { + ParameterTool parameters = (ParameterTool) + getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); + parameters.getRequired("input"); + // .. do more .. +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/event-time/_index.md b/docs/content.zh/docs/dev/datastream/event-time/_index.md new file mode 100644 index 0000000000000..c11a544830afb --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/event-time/_index.md @@ -0,0 +1,23 @@ +--- +title: 事件时间 +bookCollapseSection: true +weight: 4 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/datastream/event-time/built_in.md b/docs/content.zh/docs/dev/datastream/event-time/built_in.md new file mode 100644 index 0000000000000..a33e12d03e5e9 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/event-time/built_in.md @@ -0,0 +1,75 @@ +--- +title: "内置 Watermark 生成器" +weight: 3 +type: docs +aliases: + - /zh/dev/event_timestamp_extractors.html + - /zh/apis/streaming/event_timestamp_extractors.html +--- + + +# 内置 Watermark 生成器 + +如[生成 Watermark]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}) 小节中所述,Flink 提供的抽象方法可以允许用户自己去定义时间戳分配方式和 watermark 生成的方式。你可以通过实现 `WatermarkGenerator` 接口来实现上述功能。 + +为了进一步简化此类任务的编程工作,Flink 框架预设了一些时间戳分配器。本节后续内容有举例。除了开箱即用的已有实现外,其还可以作为自定义实现的示例以供参考。 + + + +## 单调递增时间戳分配器 + +*周期性* watermark 生成方式的一个最简单特例就是你给定的数据源中数据的时间戳升序出现。在这种情况下,当前时间戳就可以充当 watermark,因为后续到达数据的时间戳不会比当前的小。 + +注意:在 Flink 应用程序中,如果是并行数据源,则只要求并行数据源中的每个*单分区数据源任务*时间戳递增。例如,设置每一个并行数据源实例都只读取一个 Kafka 分区,则时间戳只需在每个 Kafka 分区内递增即可。Flink 的 watermark 合并机制会在并行数据流进行分发(shuffle)、联合(union)、连接(connect)或合并(merge)时生成正确的 watermark。 + +{{< tabs "5fd544cc-922b-43e0-9f44-8619bf6424fd" >}} +{{< tab "Java" >}} +```java +WatermarkStrategy.forMonotonousTimestamps(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +WatermarkStrategy.forMonotonousTimestamps() +``` +{{< /tab >}} +{{< /tabs >}} + + + +## 数据之间存在最大固定延迟的时间戳分配器 + +另一个周期性 watermark 生成的典型例子是,watermark 滞后于数据流中最大(事件时间)时间戳一个固定的时间量。该示例可以覆盖的场景是你预先知道数据流中的数据可能遇到的最大延迟,例如,在测试场景下创建了一个自定义数据源,并且这个数据源的产生的数据的时间戳在一个固定范围之内。Flink 针对上述场景提供了 `boundedOutfordernessWatermarks` 生成器,该生成器将 `maxOutOfOrderness` 作为参数,该参数代表在计算给定窗口的结果时,允许元素被忽略计算之前延迟到达的最长时间。其中延迟时长就等于 `t_w - t` ,其中 `t` 代表元素的(事件时间)时间戳,`t_w` 代表前一个 watermark 对应的(事件时间)时间戳。如果 `lateness > 0`,则认为该元素迟到了,并且在计算相应窗口的结果时默认会被忽略。有关使用延迟元素的详细内容,请参阅有关[允许延迟]({{< ref "docs/dev/datastream/operators/windows" >}}#allowed-lateness)的文档。 + +{{< tabs "9ef0eae9-f6ea-49f6-ab4c-7347a8b49197" >}} +{{< tab "Java" >}} +```java +WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(10)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +WatermarkStrategy + .forBoundedOutOfOrderness(Duration.ofSeconds(10)) +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/event-time/generating_watermarks.md b/docs/content.zh/docs/dev/datastream/event-time/generating_watermarks.md new file mode 100644 index 0000000000000..8bcd0e6b54f21 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/event-time/generating_watermarks.md @@ -0,0 +1,403 @@ +--- +title: "生成 
Watermark" +weight: 2 +type: docs +aliases: + - /zh/dev/event_timestamps_watermarks.html + - /zh/apis/streaming/event_time.html + - /zh/apis/streaming/event_timestamps_watermarks.html +--- + + +# 生成 Watermark + +在本节中,你将了解 Flink 中用于处理**事件时间**的时间戳和 watermark 相关的 API。有关*事件时间*,*处理时间*和*摄取时间*的介绍,请参阅[事件时间概览]({{< ref "docs/concepts/time" >}})小节。 + + + +## Watermark 策略简介 + +为了使用*事件时间*语义,Flink 应用程序需要知道事件*时间戳*对应的字段,意味着数据流中的每个元素都需要拥有*可分配*的事件时间戳。其通常通过使用 `TimestampAssigner` API 从元素中的某个字段去访问/提取时间戳。 + +时间戳的分配与 watermark 的生成是齐头并进的,其可以告诉 Flink 应用程序事件时间的进度。其可以通过指定 `WatermarkGenerator` 来配置 watermark 的生成方式。 + +使用 Flink API 时需要设置一个同时包含 `TimestampAssigner` 和 `WatermarkGenerator` 的 `WatermarkStrategy`。`WatermarkStrategy` 工具类中也提供了许多常用的 watermark 策略,并且用户也可以在某些必要场景下构建自己的 watermark 策略。WatermarkStrategy 接口如下: + +```java +public interface WatermarkStrategy + extends TimestampAssignerSupplier, WatermarkGeneratorSupplier{ + + /** + * 根据策略实例化一个可分配时间戳的 {@link TimestampAssigner}。 + */ + @Override + TimestampAssigner createTimestampAssigner(TimestampAssignerSupplier.Context context); + + /** + * 根据策略实例化一个 watermark 生成器。 + */ + @Override + WatermarkGenerator createWatermarkGenerator(WatermarkGeneratorSupplier.Context context); +} +``` + +如上所述,通常情况下,你不用实现此接口,而是可以使用 `WatermarkStrategy` 工具类中通用的 watermark 策略,或者可以使用这个工具类将自定义的 `TimestampAssigner` 与 `WatermarkGenerator` 进行绑定。例如,你想要要使用有界无序(bounded-out-of-orderness)watermark 生成器和一个 lambda 表达式作为时间戳分配器,那么可以按照如下方式实现: + +{{< tabs "8f2bfdc8-b97e-48a0-bb38-de86e1070f81" >}} +{{< tab "Java" >}} +```java +WatermarkStrategy + .>forBoundedOutOfOrderness(Duration.ofSeconds(20)) + .withTimestampAssigner((event, timestamp) -> event.f0); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +WatermarkStrategy + .forBoundedOutOfOrderness[(Long, String)](Duration.ofSeconds(20)) + .withTimestampAssigner(new SerializableTimestampAssigner[(Long, String)] { + override def extractTimestamp(element: (Long, String), recordTimestamp: Long): Long = element._1 + }) +``` +{{< /tab >}} +{{< /tabs >}} + +其中 `TimestampAssigner` 的设置与否是可选的,大多数情况下,可以不用去特别指定。例如,当使用 Kafka 或 Kinesis 数据源时,你可以直接从 Kafka/Kinesis 数据源记录中获取到时间戳。 + +稍后我们将在[自定义 WatermarkGenerator](#writing-watermarkgenerators) 小节学习 WatermarkGenerator 接口。 + +
+{{< hint warning >}}
+注意:时间戳和 watermark 都是从 1970-01-01T00:00:00Z 起的 Java 纪元开始,并以毫秒为单位。
+{{< /hint >}}
    + + + +## 使用 Watermark 策略 + +`WatermarkStrategy` 可以在 Flink 应用程序中的两处使用,第一种是直接在数据源上使用,第二种是直接在非数据源的操作之后使用。 + +第一种方式相比会更好,因为数据源可以利用 watermark 生成逻辑中有关分片/分区(shards/partitions/splits)的信息。使用这种方式,数据源通常可以更精准地跟踪 watermark,整体 watermark 生成将更精确。直接在源上指定 `WatermarkStrategy` 意味着你必须使用特定数据源接口,参阅 [Watermark 策略与 Kafka 连接器](#watermark-策略与-kafka-连接器)以了解如何使用 Kafka Connector,以及有关每个分区的 watermark 是如何生成以及工作的。 + +仅当无法直接在数据源上设置策略时,才应该使用第二种方式(在任意转换操作之后设置 `WatermarkStrategy`): + +{{< tabs "0fced029-7487-414d-ad19-5641c02ff7bd" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStream stream = env.readFile( + myFormat, myFilePath, FileProcessingMode.PROCESS_CONTINUOUSLY, 100, + FilePathFilter.createDefaultFilter(), typeInfo); + +DataStream withTimestampsAndWatermarks = stream + .filter( event -> event.severity() == WARNING ) + .assignTimestampsAndWatermarks(); + +withTimestampsAndWatermarks + .keyBy( (event) -> event.getGroup() ) + .window(TumblingEventTimeWindows.of(Time.seconds(10))) + .reduce( (a, b) -> a.add(b) ) + .addSink(...); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment + +val stream: DataStream[MyEvent] = env.readFile( + myFormat, myFilePath, FileProcessingMode.PROCESS_CONTINUOUSLY, 100, + FilePathFilter.createDefaultFilter()) + +val withTimestampsAndWatermarks: DataStream[MyEvent] = stream + .filter( _.severity == WARNING ) + .assignTimestampsAndWatermarks() + +withTimestampsAndWatermarks + .keyBy( _.getGroup ) + .window(TumblingEventTimeWindows.of(Time.seconds(10))) + .reduce( (a, b) => a.add(b) ) + .addSink(...) +``` +{{< /tab >}} +{{< /tabs >}} + +使用 `WatermarkStrategy` 去获取流并生成带有时间戳的元素和 watermark 的新流时,如果原始流已经具有时间戳或 watermark,则新指定的时间戳分配器将覆盖原有的时间戳和 watermark。 + + + +## 处理空闲数据源 + +如果数据源中的某一个分区/分片在一段时间内未发送事件数据,则意味着 `WatermarkGenerator` 也不会获得任何新数据去生成 watermark。我们称这类数据源为*空闲输入*或*空闲源*。在这种情况下,当某些其他分区仍然发送事件数据的时候就会出现问题。由于下游算子 watermark 的计算方式是取所有不同的上游并行数据源 watermark 的最小值,则其 watermark 将不会发生变化。 + +为了解决这个问题,你可以使用 `WatermarkStrategy` 来检测空闲输入并将其标记为空闲状态。`WatermarkStrategy` 为此提供了一个工具接口: + +{{< tabs "962e09cc-a39d-4658-8530-bbc6aae20b5c" >}} +{{< tab "Java" >}} +```java +WatermarkStrategy + .>forBoundedOutOfOrderness(Duration.ofSeconds(20)) + .withIdleness(Duration.ofMinutes(1)); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +WatermarkStrategy + .forBoundedOutOfOrderness[(Long, String)](Duration.ofSeconds(20)) + .withIdleness(Duration.ofMinutes(1)) +``` +{{< /tab >}} +{{< /tabs >}} + + + + +## 自定义 WatermarkGenerator + +`TimestampAssigner` 是一个可以从事件数据中提取时间戳字段的简单函数,我们无需详细查看其实现。但是 `WatermarkGenerator` 的编写相对就要复杂一些了,我们将在接下来的两小节中介绍如何实现此接口。WatermarkGenerator 接口代码如下: + +```java +/** + * {@code WatermarkGenerator} 可以基于事件或者周期性的生成 watermark。 + * + *

    注意: WatermarkGenerator 将以前互相独立的 {@code AssignerWithPunctuatedWatermarks} + * 和 {@code AssignerWithPeriodicWatermarks} 一同包含了进来。 + */ +@Public +public interface WatermarkGenerator { + + /** + * 每来一条事件数据调用一次,可以检查或者记录事件的时间戳,或者也可以基于事件数据本身去生成 watermark。 + */ + void onEvent(T event, long eventTimestamp, WatermarkOutput output); + + /** + * 周期性的调用,也许会生成新的 watermark,也许不会。 + * + *

    调用此方法生成 watermark 的间隔时间由 {@link ExecutionConfig#getAutoWatermarkInterval()} 决定。 + */ + void onPeriodicEmit(WatermarkOutput output); +} +``` + +watermark 的生成方式本质上是有两种:*周期性生成*和*标记生成*。 + +周期性生成器通常通过 `onEvent()` 观察传入的事件数据,然后在框架调用 `onPeriodicEmit()` 时发出 watermark。 + +标记生成器将查看 `onEvent()` 中的事件数据,并等待检查在流中携带 watermark 的特殊标记事件或打点数据。当获取到这些事件数据时,它将立即发出 watermark。通常情况下,标记生成器不会通过 `onPeriodicEmit()` 发出 watermark。 + +接下来,我们将学习如何实现上述两类生成器。 + + + +### 自定义周期性 Watermark 生成器 + +周期性生成器会观察流事件数据并定期生成 watermark(其生成可能取决于流数据,或者完全基于处理时间)。 + +生成 watermark 的时间间隔(每 *n* 毫秒)可以通过 `ExecutionConfig.setAutoWatermarkInterval(...)` 指定。每次都会调用生成器的 `onPeriodicEmit()` 方法,如果返回的 watermark 非空且值大于前一个 watermark,则将发出新的 watermark。 + +如下是两个使用周期性 watermark 生成器的简单示例。注意:Flink 已经附带了 `BoundedOutOfOrdernessWatermarks`,它实现了 `WatermarkGenerator`,其工作原理与下面的 `BoundedOutOfOrdernessGenerator` 相似。可以在[这里]({{< ref "docs/dev/datastream/event-time/built_in" >}})参阅如何使用它的内容。 + +{{< tabs "24fa8f1b-702c-4d6e-88e5-afa94c31a8e4" >}} +{{< tab "Java" >}} +```java +/** + * 该 watermark 生成器可以覆盖的场景是:数据源在一定程度上乱序。 + * 即某个最新到达的时间戳为 t 的元素将在最早到达的时间戳为 t 的元素之后最多 n 毫秒到达。 + */ +public class BoundedOutOfOrdernessGenerator implements WatermarkGenerator { + + private final long maxOutOfOrderness = 3500; // 3.5 秒 + + private long currentMaxTimestamp; + + @Override + public void onEvent(MyEvent event, long eventTimestamp, WatermarkOutput output) { + currentMaxTimestamp = Math.max(currentMaxTimestamp, eventTimestamp); + } + + @Override + public void onPeriodicEmit(WatermarkOutput output) { + // 发出的 watermark = 当前最大时间戳 - 最大乱序时间 + output.emitWatermark(new Watermark(currentMaxTimestamp - maxOutOfOrderness - 1)); + } + +} + +/** + * 该生成器生成的 watermark 滞后于处理时间固定量。它假定元素会在有限延迟后到达 Flink。 + */ +public class TimeLagWatermarkGenerator implements WatermarkGenerator { + + private final long maxTimeLag = 5000; // 5 秒 + + @Override + public void onEvent(MyEvent event, long eventTimestamp, WatermarkOutput output) { + // 处理时间场景下不需要实现 + } + + @Override + public void onPeriodicEmit(WatermarkOutput output) { + output.emitWatermark(new Watermark(System.currentTimeMillis() - maxTimeLag)); + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +/** + * 该 watermark 生成器可以覆盖的场景是:数据源在一定程度上乱序。 + * 即某个最新到达的时间戳为 t 的元素将在最早到达的时间戳为 t 的元素之后最多 n 毫秒到达。 + */ +class BoundedOutOfOrdernessGenerator extends AssignerWithPeriodicWatermarks[MyEvent] { + + val maxOutOfOrderness = 3500L // 3.5 秒 + + var currentMaxTimestamp: Long = _ + + override def onEvent(element: MyEvent, eventTimestamp: Long): Unit = { + currentMaxTimestamp = max(eventTimestamp, currentMaxTimestamp) + } + + override def onPeriodicEmit(): Unit = { + // 发出的 watermark = 当前最大时间戳 - 最大乱序时间 + output.emitWatermark(new Watermark(currentMaxTimestamp - maxOutOfOrderness - 1)); + } +} + +/** + * 该生成器生成的 watermark 滞后于处理时间固定量。它假定元素会在有限延迟后到达 Flink。 + */ +class TimeLagWatermarkGenerator extends AssignerWithPeriodicWatermarks[MyEvent] { + + val maxTimeLag = 5000L // 5 秒 + + override def onEvent(element: MyEvent, eventTimestamp: Long): Unit = { + // 处理时间场景下不需要实现 + } + + override def onPeriodicEmit(): Unit = { + output.emitWatermark(new Watermark(System.currentTimeMillis() - maxTimeLag)); + } +} +``` +{{< /tab >}} +{{< /tabs >}} + + + +### 自定义标记 Watermark 生成器 + +标记 watermark 生成器观察流事件数据并在获取到带有 watermark 信息的特殊事件元素时发出 watermark。 + +如下是实现标记生成器的方法,当事件带有某个指定标记时,该生成器就会发出 watermark: + +{{< tabs "54b557f5-44cb-4fc4-80ee-a0906d8540aa" >}} +{{< tab "Java" >}} +```java +public class PunctuatedAssigner implements WatermarkGenerator { + + @Override + public void onEvent(MyEvent event, 
                                long eventTimestamp, WatermarkOutput output) {
+        if (event.hasWatermarkMarker()) {
+            output.emitWatermark(new Watermark(event.getWatermarkTimestamp()));
+        }
+    }
+
+    @Override
+    public void onPeriodicEmit(WatermarkOutput output) {
+        // onEvent 中已经实现
+    }
+}
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+class PunctuatedAssigner extends WatermarkGenerator[MyEvent] {
+
+  override def onEvent(event: MyEvent, eventTimestamp: Long, output: WatermarkOutput): Unit = {
+    if (event.hasWatermarkMarker()) {
+      output.emitWatermark(new Watermark(event.getWatermarkTimestamp()))
+    }
+  }
+
+  override def onPeriodicEmit(output: WatermarkOutput): Unit = {
+    // onEvent 中已经实现
+  }
+}
+```
+{{< /tab >}}
+{{< /tabs >}}
+

+{{< hint warning >}}
+注意:可以针对每个事件去生成 watermark。但是由于每个 watermark 都会在下游做一些计算,因此过多的 watermark 会降低程序性能。
+{{< /hint >}}
    + + + +## Watermark 策略与 Kafka 连接器 + +当使用 [Apache Kafka 连接器](connectors/kafka.html)作为数据源时,每个 Kafka 分区可能有一个简单的事件时间模式(递增的时间戳或有界无序)。然而,当使用 Kafka 数据源时,多个分区常常并行使用,因此交错来自各个分区的事件数据就会破坏每个分区的事件时间模式(这是 Kafka 消费客户端所固有的)。 + +在这种情况下,你可以使用 Flink 中可识别 Kafka 分区的 watermark 生成机制。使用此特性,将在 Kafka 消费端内部针对每个 Kafka 分区生成 watermark,并且不同分区 watermark 的合并方式与在数据流 shuffle 时的合并方式相同。 + +例如,如果每个 Kafka 分区中的事件时间戳严格递增,则使用[时间戳单调递增](event_timestamp_extractors.html#时间戳单调递增)按分区生成的 watermark 将生成完美的全局 watermark。注意,我们在示例中未使用 `TimestampAssigner`,而是使用了 Kafka 记录自身的时间戳。 + +下图展示了如何使用单 kafka 分区 watermark 生成机制,以及在这种情况下 watermark 如何通过 dataflow 传播。 + +{{< tabs "bd763159-5532-4f69-ae15-a4836886e4fe" >}} +{{< tab "Java" >}} +```java +FlinkKafkaConsumer kafkaSource = new FlinkKafkaConsumer<>("myTopic", schema, props); +kafkaSource.assignTimestampsAndWatermarks( + WatermarkStrategy. + .forBoundedOutOfOrderness(Duration.ofSeconds(20))); + +DataStream stream = env.addSource(kafkaSource); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val kafkaSource = new FlinkKafkaConsumer[MyType]("myTopic", schema, props) +kafkaSource.assignTimestampsAndWatermarks( + WatermarkStrategy + .forBoundedOutOfOrderness(Duration.ofSeconds(20))) + +val stream: DataStream[MyType] = env.addSource(kafkaSource) +``` +{{< /tab >}} +{{< /tabs >}} + +{{< img src="/fig/parallel_kafka_watermarks.svg" alt="Generating Watermarks with awareness for Kafka-partitions" class="center" width="80%" >}} + + + +## 算子处理 Watermark 的方式 + +一般情况下,在将 watermark 转发到下游之前,需要算子对其进行触发的事件完全进行处理。例如,`WindowOperator` 将首先计算该 watermark 触发的所有窗口数据,当且仅当由此 watermark 触发计算进而生成的所有数据被转发到下游之后,其才会被发送到下游。换句话说,由于此 watermark 的出现而产生的所有数据元素都将在此 watermark 之前发出。 + +相同的规则也适用于 `TwoInputStreamOperator`。但是,在这种情况下,算子当前的 watermark 会取其两个输入的最小值。 + +详细内容可查看对应算子的实现:`OneInputStreamOperator#processWatermark`、`TwoInputStreamOperator#processWatermark1` 和 `TwoInputStreamOperator#processWatermark2`。 + +## 可以弃用 AssignerWithPeriodicWatermarks 和 AssignerWithPunctuatedWatermarks 了 + +在 Flink 新的 `WatermarkStrategy`,`TimestampAssigner` 和 `WatermarkGenerator` 的抽象接口之前,Flink 使用的是 `AssignerWithPeriodicWatermarks` 和 `AssignerWithPunctuatedWatermarks`。你仍可以在 API 中看到它们,但建议使用新接口,因为其对时间戳和 watermark 等重点的抽象和分离很清晰,并且还统一了周期性和标记形式的 watermark 生成方式。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/execution_mode.md b/docs/content.zh/docs/dev/datastream/execution_mode.md new file mode 100644 index 0000000000000..969f1579f06dd --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/execution_mode.md @@ -0,0 +1,395 @@ +--- +title: "Execution Mode (Batch/Streaming)" +weight: 2 +type: docs +aliases: + - /zh/dev/datastream_execution_mode.html +--- + + +# Execution Mode (Batch/Streaming) + +The DataStream API supports different runtime execution modes from which you +can choose depending on the requirements of your use case and the +characteristics of your job. + +There is the "classic" execution behavior of the DataStream API, which we call +`STREAMING` execution mode. This should be used for unbounded jobs that require +continuous incremental processing and are expected to stay online indefinitely. + +Additionally, there is a batch-style execution mode that we call `BATCH` +execution mode. This executes jobs in a way that is more reminiscent of batch +processing frameworks such as MapReduce. This should be used for bounded jobs +for which you have a known fixed input and which do not run continuously. 
+ +Apache Flink's unified approach to stream and batch processing means that a +DataStream application executed over bounded input will produce the same +*final* results regardless of the configured execution mode. It is important to +note what *final* means here: a job executing in `STREAMING` mode might produce +incremental updates (think upserts in a database) while a `BATCH` job would +only produce one final result at the end. The final result will be the same if +interpreted correctly but the way to get there can be different. + +By enabling `BATCH` execution, we allow Flink to apply additional optimizations +that we can only do when we know that our input is bounded. For example, +different join/aggregation strategies can be used, in addition to a different +shuffle implementation that allows more efficient task scheduling and failure +recovery behavior. We will go into some of the details of the execution +behavior below. + +## When can/should I use BATCH execution mode? + +The `BATCH` execution mode can only be used for Jobs/Flink Programs that are +_bounded_. Boundedness is a property of a data source that tells us whether all +the input coming from that source is known before execution or whether new data +will show up, potentially indefinitely. A job, in turn, is bounded if all its +sources are bounded, and unbounded otherwise. + +`STREAMING` execution mode, on the other hand, can be used for both bounded and +unbounded jobs. + +As a rule of thumb, you should be using `BATCH` execution mode when your program +is bounded because this will be more efficient. You have to use `STREAMING` +execution mode when your program is unbounded because only this mode is general +enough to be able to deal with continuous data streams. + +One obvious outlier is when you want to use a bounded job to bootstrap some job +state that you then want to use in an unbounded job. For example, by running a +bounded job using `STREAMING` mode, taking a savepoint, and then restoring that +savepoint on an unbounded job. This is a very specific use case and one that +might soon become obsolete when we allow producing a savepoint as additional +output of a `BATCH` execution job. + +Another case where you might run a bounded job using `STREAMING` mode is when +writing tests for code that will eventually run with unbounded sources. For +testing it can be more natural to use a bounded source in those cases. + +## Configuring BATCH execution mode + +The execution mode can be configured via the `execution.runtime-mode` setting. +There are three possible values: + + - `STREAMING`: The classic DataStream execution mode (default) + - `BATCH`: Batch-style execution on the DataStream API + - `AUTOMATIC`: Let the system decide based on the boundedness of the sources + +This can be configured via command line parameters of `bin/flink run ...`, or +programmatically when creating/configuring the `StreamExecutionEnvironment`. + +Here's how you can configure the execution mode via the command line: + +```bash +$ bin/flink run -Dexecution.runtime-mode=BATCH examples/streaming/WordCount.jar +``` + +This example shows how you can configure the execution mode in code: + + ```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.setRuntimeMode(RuntimeExecutionMode.BATCH); + ``` + +{{< hint info >}} +We recommend users to **NOT** set the runtime mode in their program but to instead +set it using the command-line when submitting the application. 
Keeping the +application code configuration-free allows for more flexibility as the same +application can be executed in any execution mode. +{{< /hint >}} + +## Execution Behavior + +This section provides an overview of the execution behavior of `BATCH` +execution mode and contrasts it with `STREAMING` execution mode. For more +details, please refer to the FLIPs that introduced this feature: +[FLIP-134](https://cwiki.apache.org/confluence/x/4i94CQ) and +[FLIP-140](https://cwiki.apache.org/confluence/x/kDh4CQ). + +### Task Scheduling And Network Shuffle + +Flink jobs consist of different operations that are connected together in a +dataflow graph. The system decides how to schedule the execution of these +operations on different processes/machines (TaskManagers) and how data is +shuffled (sent) between them. + +Multiple operations/operators can be chained together using a feature called +[chaining]({{< ref "docs/dev/datastream/operators/overview" >}}#task-chaining-and-resource-groups). +A group of one or multiple (chained) +operators that Flink considers as a unit of scheduling is called a _task_. +Often the term _subtask_ is used to refer to the individual instances of tasks +that are running in parallel on multiple TaskManagers but we will only use the +term _task_ here. + +Task scheduling and network shuffles work differently for `BATCH` and +`STREAMING` execution mode. Mostly due to the fact that we know our input data +is bounded in `BATCH` execution mode, which allows Flink to use more efficient +data structures and algorithms. + +We will use this example to explain the differences in task scheduling and +network transfer: + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +DataStreamSource source = env.fromElements(...); + +source.name("source") + .map(...).name("map1") + .map(...).name("map2") + .rebalance() + .map(...).name("map3") + .map(...).name("map4") + .keyBy((value) -> value) + .map(...).name("map5") + .map(...).name("map6") + .sinkTo(...).name("sink"); +``` + +Operations that imply a 1-to-1 connection pattern between operations, such as +`map()`, `flatMap()`, or `filter()` can just forward data straight to the next +operation, which allows these operations to be chained together. This means +that Flink would not normally insert a network shuffle between them. + +Operation such as `keyBy()` or `rebalance()` on the other hand require data to +be shuffled between different parallel instances of tasks. This induces a +network shuffle. + +For the above example Flink would group operations together as tasks like this: + +- Task1: `source`, `map1`, and `map2` +- Task2: `map3`, `map4` +- Task3: `map5`, `map6`, and `sink` + +And we have a network shuffle between Tasks 1 and 2, and also Tasks 2 and 3. +This is a visual representation of that job: + +{{< img src="/fig/datastream-example-job-graph.svg" alt="Example Job Graph" >}} + +#### STREAMING Execution Mode + +In `STREAMING` execution mode, all tasks need to be online/running all the +time. This allows Flink to immediately process new records through the whole +pipeline, which we need for continuous and low-latency stream processing. This +also means that the TaskManagers that are allotted to a job need to have enough +resources to run all the tasks at the same time. + +Network shuffles are _pipelined_, meaning that records are immediately sent to +downstream tasks, with some buffering on the network layer. 
Again, this is +required because when processing a continuous stream of data there are no +natural points (in time) where data could be materialized between tasks (or +pipelines of tasks). This contrasts with `BATCH` execution mode where +intermediate results can be materialized, as explained below. + +#### BATCH Execution Mode + +In `BATCH` execution mode, the tasks of a job can be separated into stages that +can be executed one after another. We can do this because the input is bounded +and Flink can therefore fully process one stage of the pipeline before moving +on to the next. In the above example the job would have three stages that +correspond to the three tasks that are separated by the shuffle barriers. + +Instead of sending records immediately to downstream tasks, as explained above +for `STREAMING` mode, processing in stages requires Flink to materialize +intermediate results of tasks to some non-ephemeral storage which allows +downstream tasks to read them after upstream tasks have already gone off line. +This will increase the latency of processing but comes with other interesting +properties. For one, this allows Flink to backtrack to the latest available +results when a failure happens instead of restarting the whole job. Another +side effect is that `BATCH` jobs can execute on fewer resources (in terms of +available slots at TaskManagers) because the system can execute tasks +sequentially one after the other. + +TaskManagers will keep intermediate results at least as long as downstream +tasks have not consumed them. (Technically, they will be kept until the +consuming *pipelined regions* have produced their output.) After +that, they will be kept for as long as space allows in order to allow the +aforementioned backtracking to earlier results in case of a failure. + +### State Backends / State + +In `STREAMING` mode, Flink uses a [StateBackend]({{< ref "docs/dev/datastream/fault-tolerance/state_backends" >}}) to control how state is stored and how +checkpointing works. + +In `BATCH` mode, the configured state backend is ignored. Instead, the input of +a keyed operation is grouped by key (using sorting) and then we process all +records of a key in turn. This allows keeping only the state of only one key at +the same time. State for a given key will be discarded when moving on to the +next key. + +See [FLIP-140](https://cwiki.apache.org/confluence/x/kDh4CQ) for background +information on this. + +### Order of Processing + +The order in which records are processed in operators or user-defined functions (UDFs) can differ between `BATCH` and `STREAMING` execution. + +In `STREAMING` mode, user-defined functions should not make any assumptions about incoming records' order. +Data is processed as soon as it arrives. + +In `BATCH` execution mode, there are some operations where Flink guarantees order. +The ordering can be a side effect of the particular task scheduling, +network shuffle, and state backend (see above), or a conscious choice by the system. 
+ +There are three general types of input that we can differentiate: + +- _broadcast input_: input from a broadcast stream (see also [Broadcast + State]({{< ref "docs/dev/datastream/fault-tolerance/broadcast_state" >}})) +- _regular input_: input that is neither broadcast nor keyed +- _keyed input_: input from a `KeyedStream` + +Functions, or Operators, that consume multiple input types will process them in the following order: + +- broadcast inputs are processed first +- regular inputs are processed second +- keyed inputs are processed last + +For functions that consume from multiple regular or broadcast inputs — such as a `CoProcessFunction` — Flink has the right to process data from any input of that type in any order. + +For functions that consume from multiple keyed inputs — such as a `KeyedCoProcessFunction` — Flink processes all records for a single key from all keyed inputs before moving on to the next. + + +### Event Time / Watermarks + +When it comes to supporting [event time]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}), Flink’s +streaming runtime builds on the pessimistic assumption that events may come +out-of-order, _i.e._ an event with timestamp `t` may come after an event with +timestamp `t+1`. Because of this, the system can never be sure that no more +elements with timestamp `t < T` for a given timestamp `T` can come in the +future. To amortise the impact of this out-of-orderness on the final result +while making the system practical, in `STREAMING` mode, Flink uses a heuristic +called [Watermarks]({{< ref "docs/concepts/time" >}}#event-time-and-watermarks). +A watermark with timestamp `T` signals that no element with timestamp `t < T` will follow. + +In `BATCH` mode, where the input dataset is known in advance, there is no need +for such a heuristic as, at the very least, elements can be sorted by timestamp +so that they are processed in temporal order. For readers familiar with +streaming, in `BATCH` we can assume “perfect watermarks”. + +Given the above, in `BATCH` mode, we only need a `MAX_WATERMARK` at the end of +the input associated with each key, or at the end of input if the input stream +is not keyed. Based on this scheme, all registered timers will fire at the *end +of time* and user-defined `WatermarkAssigners` or `WatermarkGenerators` are +ignored. Specifying a `WatermarkStrategy` is still important, though, because +its `TimestampAssigner` will still be used to assign timestamps to records. + +### Processing Time + +Processing Time is the wall-clock time on the machine that a record is +processed, at the specific instance that the record is being processed. Based +on this definition, we see that the results of a computation that is based on +processing time are not reproducible. This is because the same record processed +twice will have two different timestamps. + +Despite the above, using processing time in `STREAMING` mode can be useful. The +reason has to do with the fact that streaming pipelines often ingest their +unbounded input in *real time* so there is a correlation between event time and +processing time. In addition, because of the above, in `STREAMING` mode `1h` in +event time can often be almost `1h` in processing time, or wall-clock time. So +using processing time can be used for early (incomplete) firings that give +hints about the expected results. + +This correlation does not exist in the batch world where the input dataset is +static and known in advance. 
Given this, in `BATCH` mode we allow users to +request the current processing time and register processing time timers, but, +as in the case of Event Time, all the timers are going to fire at the end of +the input. + +Conceptually, we can imagine that processing time does not advance during the +execution of a job and we fast-forward to the *end of time* when the whole +input is processed. + +### Failure Recovery + +In `STREAMING` execution mode, Flink uses checkpoints for failure recovery. +Take a look at the [checkpointing documentation]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) for hands-on documentation about this and +how to configure it. There is also a more introductory section about [fault +tolerance via state snapshots]({{< ref "docs/learn-flink/fault_tolerance" >}}) that +explains the concepts at a higher level. + +One of the characteristics of checkpointing for failure recovery is that Flink +will restart all the running tasks from a checkpoint in case of a failure. This +can be more costly than what we have to do in `BATCH` mode (as explained +below), which is one of the reasons that you should use `BATCH` execution mode +if your job allows it. + +In `BATCH` execution mode, Flink will try and backtrack to previous processing +stages for which intermediate results are still available. Potentially, only +the tasks that failed (or their predecessors in the graph) will have to be +restarted, which can improve processing efficiency and overall processing time +of the job compared to restarting all tasks from a checkpoint. + +## Important Considerations + +Compared to classic `STREAMING` execution mode, in `BATCH` mode some things +might not work as expected. Some features will work slightly differently while +others are not supported. + +Behavior Change in BATCH mode: + +* "Rolling" operations such as [reduce()]({{< ref "docs/dev/datastream/operators/overview" >}}#reduce) + or [sum()]({{< ref "docs/dev/datastream/operators/overview" >}}#aggregations) + emit an incremental update for every new record that arrives in `STREAMING` + mode. In `BATCH` mode, these operations are not "rolling". They emit only the + final result. + + +Unsupported in BATCH mode: + +* [Checkpointing]({{< ref "docs/concepts/stateful-stream-processing" >}}#stateful-stream-processing) + and any operations that depend on checkpointing do not work. +* [Iterations]({{< ref "docs/dev/datastream/operators/overview" >}}#iterate) + +Custom operators should be implemented with care, otherwise they might behave +improperly. See also additional explanations below for more details. + +### Checkpointing + +As explained [above](#failure-recovery), failure recovery for batch programs +does not use checkpointing. + +It is important to remember that because there are no checkpoints, certain +features such as {{< javadoc file="org/apache/flink/api/common/state/CheckpointListener.html" name="CheckpointListener">}} +and, as a result, Kafka's [EXACTLY_ONCE]({{< ref "docs/connectors/datastream/kafka" >}}#kafka-producers-and-fault-tolerance) mode or `StreamingFileSink`'s +[OnCheckpointRollingPolicy]({{< ref "docs/connectors/datastream/streamfile_sink" >}}#rolling-policy) +won't work. If you need a transactional sink that works in +`BATCH` mode make sure it uses the Unified Sink API as proposed in +[FLIP-143](https://cwiki.apache.org/confluence/x/KEJ4CQ). 
+ +You can still use all the [state primitives]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}#working-with-state), +it's just that the mechanism used for failure recovery will be different. + +### Writing Custom Operators + +{{< hint info >}} +**Note:** Custom operators are an advanced usage pattern of Apache Flink. For most +use-cases, consider using a (keyed-)process function instead. +{{< /hint >}} + +It is important to remember the assumptions made for `BATCH` execution mode +when writing a custom operator. Otherwise, an operator that works just fine for +`STREAMING` mode might produce wrong results in `BATCH` mode. Operators are +never scoped to a particular key which means they see some properties of +`BATCH` processing Flink tries to leverage. + +First of all you should not cache the last seen watermark within an operator. +In `BATCH` mode we process records key by key. As a result, the Watermark will +switch from `MAX_VALUE` to `MIN_VALUE` between each key. You should not assume +that the Watermark will always be ascending in an operator. For the same +reasons timers will fire first in key order and then in timestamp order within +each key. Moreover, operations that change a key manually are not supported. diff --git a/docs/content.zh/docs/dev/datastream/experimental.md b/docs/content.zh/docs/dev/datastream/experimental.md new file mode 100644 index 0000000000000..90212a5364c99 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/experimental.md @@ -0,0 +1,85 @@ +--- +title: "实验功能" +weight: 101 +type: docs +aliases: + - /zh/dev/stream/experimental.html +--- + + +# 实验功能 + +This section describes experimental features in the DataStream API. Experimental features are still evolving and can be either unstable, +incomplete, or subject to heavy change in future versions. + +Reinterpreting a pre-partitioned data stream as keyed stream +------------------------------------------------------------ + +We can re-interpret a pre-partitioned data stream as a keyed stream to avoid shuffling. + +{{< hint danger >}} +**WARNING**: The re-interpreted data stream **MUST** already be pre-partitioned in **EXACTLY** the same way Flink's keyBy would partition +the data in a shuffle w.r.t. key-group assignment. +{{< /hint >}} + +One use-case for this could be a materialized shuffle between two jobs: the first job performs a keyBy shuffle and materializes +each output into a partition. A second job has sources that, for each parallel instance, reads from the corresponding partitions +created by the first job. Those sources can now be re-interpreted as keyed streams, e.g. to apply windowing. Notice that this trick +makes the second job embarrassingly parallel, which can be helpful for a fine-grained recovery scheme. + +This re-interpretation functionality is exposed through `DataStreamUtils`: + +```java +static KeyedStream reinterpretAsKeyedStream( + DataStream stream, + KeySelector keySelector, + TypeInformation typeInfo) +``` + +Given a base stream, a key selector, and type information, +the method creates a keyed stream from the base stream. + +Code example: + +{{< tabs "0bf7bee6-23be-42b5-bf90-afab0a4f8dc2" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +DataStreamSource source = ... 
+DataStreamUtils.reinterpretAsKeyedStream(source, (in) -> in, TypeInformation.of(Integer.class)) + .window(TumblingEventTimeWindows.of(Time.seconds(1))) + .reduce((a, b) -> a + b) + .addSink(new DiscardingSink<>()); +env.execute(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment +env.setParallelism(1) +val source = ... +new DataStreamUtils(source).reinterpretAsKeyedStream((in) => in) + .window(TumblingEventTimeWindows.of(Time.seconds(1))) + .reduce((a, b) => a + b) + .addSink(new DiscardingSink[Int]) +env.execute() +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/_index.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/_index.md new file mode 100644 index 0000000000000..c6abd401c973f --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/_index.md @@ -0,0 +1,23 @@ +--- +title: 状态与容错 +bookCollapseSection: true +weight: 5 +--- + diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/broadcast_state.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/broadcast_state.md new file mode 100644 index 0000000000000..e6e5b12d24f0e --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/broadcast_state.md @@ -0,0 +1,235 @@ +--- +title: "Broadcast State 模式" +weight: 3 +type: docs +aliases: + - /zh/dev/stream/state/broadcast_state.html +--- + + +# Broadcast State 模式 + +你将在本节中了解到如何实际使用 broadcast state。想了解更多有状态流处理的概念,请参考 +[Stateful Stream Processing]({{< ref "docs/concepts/stateful-stream-processing" >}})。 + +## 提供的 API + +在这里我们使用一个例子来展现 broadcast state 提供的接口。假设存在一个序列,序列中的元素是具有不同颜色与形状的图形,我们希望在序列里相同颜色的图形中寻找满足一定顺序模式的图形对(比如在红色的图形里,有一个长方形跟着一个三角形)。 +同时,我们希望寻找的模式也会随着时间而改变。 + +在这个例子中,我们定义两个流,一个流包含`图形(Item)`,具有`颜色`和`形状`两个属性。另一个流包含特定的`规则(Rule)`,代表希望寻找的模式。 + +在`图形`流中,我们需要首先使用`颜色`将流进行进行分区(keyBy),这能确保相同颜色的图形会流转到相同的物理机上。 + +```java +// 将图形使用颜色进行划分 +KeyedStream colorPartitionedStream = itemStream + .keyBy(new KeySelector(){...}); +``` + +对于`规则`流,它应该被广播到所有的下游 task 中,下游 task 应当存储这些规则并根据它寻找满足规则的图形对。下面这段代码会完成: +i) 将`规则`广播给所有下游 task; +ii) 使用 `MapStateDescriptor` 来描述并创建 broadcast state 在下游的存储结构 + +```java + +// 一个 map descriptor,它描述了用于存储规则名称与规则本身的 map 存储结构 +MapStateDescriptor ruleStateDescriptor = new MapStateDescriptor<>( + "RulesBroadcastState", + BasicTypeInfo.STRING_TYPE_INFO, + TypeInformation.of(new TypeHint() {})); + +// 广播流,广播规则并且创建 broadcast state +BroadcastStream ruleBroadcastStream = ruleStream + .broadcast(ruleStateDescriptor); +``` + +最终,为了使用`规则`来筛选`图形`序列,我们需要: + 1. 将两个流关联起来 + 2. 完成我们的模式识别逻辑 + +为了关联一个非广播流(keyed 或者 non-keyed)与一个广播流(`BroadcastStream`),我们可以调用非广播流的方法 `connect()`,并将 `BroadcastStream` 当做参数传入。 +这个方法的返回参数是 `BroadcastConnectedStream`,具有类型方法 `process()`,传入一个特殊的 `CoProcessFunction` 来书写我们的模式识别逻辑。 +具体传入 `process()` 的是哪个类型取决于非广播流的类型: + - 如果流是一个 **keyed** 流,那就是 `KeyedBroadcastProcessFunction` 类型; + - 如果流是一个 **non-keyed** 流,那就是 `BroadcastProcessFunction` 类型。 + +在我们的例子中,`图形`流是一个 keyed stream,所以我们书写的代码如下: + +{{< hint warning >}} +`connect()` 方法需要由非广播流来进行调用,`BroadcastStream` 作为参数传入。 +{{< /hint >}} + +```java +DataStream output = colorPartitionedStream + .connect(ruleBroadcastStream) + .process( + + // KeyedBroadcastProcessFunction 中的类型参数表示: + // 1. key stream 中的 key 类型 + // 2. 非广播流中的元素类型 + // 3. 广播流中的元素类型 + // 4. 
结果的类型,在这里是 string + + new KeyedBroadcastProcessFunction() { + // 模式匹配逻辑 + } + ); +``` + +### BroadcastProcessFunction 和 KeyedBroadcastProcessFunction + +在传入的 `BroadcastProcessFunction` 或 `KeyedBroadcastProcessFunction` 中,我们需要实现两个方法。`processBroadcastElement()` 方法负责处理广播流中的元素,`processElement()` 负责处理非广播流中的元素。 +两个子类型定义如下: + +```java +public abstract class BroadcastProcessFunction extends BaseBroadcastProcessFunction { + + public abstract void processElement(IN1 value, ReadOnlyContext ctx, Collector out) throws Exception; + + public abstract void processBroadcastElement(IN2 value, Context ctx, Collector out) throws Exception; +} +``` + +```java +public abstract class KeyedBroadcastProcessFunction { + + public abstract void processElement(IN1 value, ReadOnlyContext ctx, Collector out) throws Exception; + + public abstract void processBroadcastElement(IN2 value, Context ctx, Collector out) throws Exception; + + public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception; +} +``` + +需要注意的是 `processBroadcastElement()` 负责处理广播流的元素,而 `processElement()` 负责处理另一个流的元素。两个方法的第二个参数(Context)不同,均有以下方法: + 1. 得到广播流的存储状态:`ctx.getBroadcastState(MapStateDescriptor stateDescriptor)` + 2. 查询元素的时间戳:`ctx.timestamp()` + 3. 查询目前的Watermark:`ctx.currentWatermark()` + 4. 目前的处理时间(processing time):`ctx.currentProcessingTime()` + 5. 产生旁路输出:`ctx.output(OutputTag outputTag, X value)` + +在 `getBroadcastState()` 方法中传入的 `stateDescriptor` 应该与调用 `.broadcast(ruleStateDescriptor)` 的参数相同。 + +这两个方法的区别在于对 broadcast state 的访问权限不同。在处理广播流元素这端,是**具有读写权限的**,而对于处理非广播流元素这端是**只读**的。 +这样做的原因是,Flink 中是不存在跨 task 通讯的。所以为了保证 broadcast state 在所有的并发实例中是一致的,我们在处理广播流元素的时候给予写权限,在所有的 task 中均可以看到这些元素,并且要求对这些元素处理是一致的, +那么最终所有 task 得到的 broadcast state 是一致的。 + +{{< hint warning >}} +`processBroadcastElement()` 的实现必须在所有的并发实例中具有确定性的结果。 +{{< /hint >}} + +同时,`KeyedBroadcastProcessFunction` 在 Keyed Stream 上工作,所以它提供了一些 `BroadcastProcessFunction` 没有的功能: + 1. `processElement()` 的参数 `ReadOnlyContext` 提供了方法能够访问 Flink 的定时器服务,可以注册事件定时器(event-time timer)或者处理时间的定时器(processing-time timer)。当定时器触发时,会调用 `onTimer()` 方法, + 提供了 `OnTimerContext`,它具有 `ReadOnlyContext` 的全部功能,并且提供: + - 查询当前触发的是一个事件还是处理时间的定时器 + - 查询定时器关联的key + 2. 
`processBroadcastElement()` 方法中的参数 `Context` 会提供方法 `applyToKeyedState(StateDescriptor stateDescriptor, KeyedStateFunction function)`。 + 这个方法使用一个 `KeyedStateFunction` 能够对 `stateDescriptor` 对应的 state 中**所有 key 的存储状态**进行某些操作。 + +{{< hint warning >}} +注册一个定时器只能在 `KeyedBroadcastProcessFunction` 的 `processElement()` 方法中进行。 + 在 `processBroadcastElement()` 方法中不能注册定时器,因为广播的元素中并没有关联的 key。 +{{< /hint >}} + +回到我们当前的例子中,`KeyedBroadcastProcessFunction` 应该实现如下: + +```java +new KeyedBroadcastProcessFunction() { + + // 存储部分匹配的结果,即匹配了一个元素,正在等待第二个元素 + // 我们用一个数组来存储,因为同时可能有很多第一个元素正在等待 + private final MapStateDescriptor> mapStateDesc = + new MapStateDescriptor<>( + "items", + BasicTypeInfo.STRING_TYPE_INFO, + new ListTypeInfo<>(Item.class)); + + // 与之前的 ruleStateDescriptor 相同 + private final MapStateDescriptor ruleStateDescriptor = + new MapStateDescriptor<>( + "RulesBroadcastState", + BasicTypeInfo.STRING_TYPE_INFO, + TypeInformation.of(new TypeHint() {})); + + @Override + public void processBroadcastElement(Rule value, + Context ctx, + Collector out) throws Exception { + ctx.getBroadcastState(ruleStateDescriptor).put(value.name, value); + } + + @Override + public void processElement(Item value, + ReadOnlyContext ctx, + Collector out) throws Exception { + + final MapState> state = getRuntimeContext().getMapState(mapStateDesc); + final Shape shape = value.getShape(); + + for (Map.Entry entry : + ctx.getBroadcastState(ruleStateDescriptor).immutableEntries()) { + final String ruleName = entry.getKey(); + final Rule rule = entry.getValue(); + + List stored = state.get(ruleName); + if (stored == null) { + stored = new ArrayList<>(); + } + + if (shape == rule.second && !stored.isEmpty()) { + for (Item i : stored) { + out.collect("MATCH: " + i + " - " + value); + } + stored.clear(); + } + + // 不需要额外的 else{} 段来考虑 rule.first == rule.second 的情况 + if (shape.equals(rule.first)) { + stored.add(value); + } + + if (stored.isEmpty()) { + state.remove(ruleName); + } else { + state.put(ruleName, stored); + } + } + } +} +``` + +## 重要注意事项 + +这里有一些 broadcast state 的重要注意事项,在使用它时需要时刻清楚: + + - **没有跨 task 通讯:**如上所述,这就是为什么**只有**在 `(Keyed)-BroadcastProcessFunction` 中处理广播流元素的方法里可以更改 broadcast state 的内容。 + 同时,用户需要保证所有 task 对于 broadcast state 的处理方式是一致的,否则会造成不同 task 读取 broadcast state 时内容不一致的情况,最终导致结果不一致。 + + - **broadcast state 在不同的 task 的事件顺序可能是不同的:**虽然广播流中元素的过程能够保证所有的下游 task 全部能够收到,但在不同 task 中元素的到达顺序可能不同。 + 所以 broadcast state 的更新*不能依赖于流中元素到达的顺序*。 + + - **所有的 task 均会对 broadcast state 进行 checkpoint:**虽然所有 task 中的 broadcast state 是一致的,但当 checkpoint 来临时所有 task 均会对 broadcast state 做 checkpoint。 + 这个设计是为了防止在作业恢复后读文件造成的文件热点。当然这种方式会造成 checkpoint 一定程度的写放大,放大倍数为 p(=并行度)。Flink 会保证在恢复状态/改变并发的时候数据**没有重复**且**没有缺失**。 + 在作业恢复时,如果与之前具有相同或更小的并发度,所有的 task 读取之前已经 checkpoint 过的 state。在增大并发的情况下,task 会读取本身的 state,多出来的并发(`p_new` - `p_old`)会使用轮询调度算法读取之前 task 的 state。 + + - **不使用 RocksDB state backend:** broadcast state 在运行时保存在内存中,需要保证内存充足。这一特性同样适用于所有其他 Operator State。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/checkpointing.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/checkpointing.md new file mode 100644 index 0000000000000..e7e53e660df63 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/checkpointing.md @@ -0,0 +1,189 @@ +--- +title: "Checkpointing" +weight: 4 +type: docs +aliases: + - /zh/dev/stream/state/checkpointing.html + - /zh/apis/streaming/fault_tolerance.html +--- + + +# Checkpointing + +Flink 中的每个方法或算子都能够是**有状态的**(阅读 [working with state](state.html) 了解更多)。 +状态化的方法在处理单个 元素/事件 
的时候存储数据,让状态成为使各个类型的算子更加精细的重要部分。 +为了让状态容错,Flink 需要为状态添加 **checkpoint(检查点)**。Checkpoint 使得 Flink 能够恢复状态和在流中的位置,从而向应用提供和无故障执行时一样的语义。 + +[容错文档]({{< ref "docs/learn-flink/fault_tolerance" >}}) 中介绍了 Flink 流计算容错机制内部的技术原理。 + + +## 前提条件 + +Flink 的 checkpoint 机制会和持久化存储进行交互,读写流与状态。一般需要: + + - 一个能够回放一段时间内数据的持久化数据源,例如持久化消息队列(例如 Apache Kafka、RabbitMQ、 Amazon Kinesis、 Google PubSub 等)或文件系统(例如 HDFS、 S3、 GFS、 NFS、 Ceph 等)。 + - 存放状态的持久化存储,通常为分布式文件系统(比如 HDFS、 S3、 GFS、 NFS、 Ceph 等)。 + +## 开启与配置 Checkpoint + +默认情况下 checkpoint 是禁用的。通过调用 `StreamExecutionEnvironment` 的 `enableCheckpointing(n)` 来启用 checkpoint,里面的 *n* 是进行 checkpoint 的间隔,单位毫秒。 + +Checkpoint 其他的属性包括: + + - *精确一次(exactly-once)对比至少一次(at-least-once)*:你可以选择向 `enableCheckpointing(long interval, CheckpointingMode mode)` 方法中传入一个模式来选择使用两种保证等级中的哪一种。 + 对于大多数应用来说,精确一次是较好的选择。至少一次可能与某些延迟超低(始终只有几毫秒)的应用的关联较大。 + + - *checkpoint 超时*:如果 checkpoint 执行的时间超过了该配置的阈值,还在进行中的 checkpoint 操作就会被抛弃。 + + - *checkpoints 之间的最小时间*:该属性定义在 checkpoint 之间需要多久的时间,以确保流应用在 checkpoint 之间有足够的进展。如果值设置为了 *5000*, + 无论 checkpoint 持续时间与间隔是多久,在前一个 checkpoint 完成时的至少五秒后会才开始下一个 checkpoint。 + + 往往使用“checkpoints 之间的最小时间”来配置应用会比 checkpoint 间隔容易很多,因为“checkpoints 之间的最小时间”在 checkpoint 的执行时间超过平均值时不会受到影响(例如如果目标的存储系统忽然变得很慢)。 + + 注意这个值也意味着并发 checkpoint 的数目是*一*。 + + - *并发 checkpoint 的数目*: 默认情况下,在上一个 checkpoint 未完成(失败或者成功)的情况下,系统不会触发另一个 checkpoint。这确保了拓扑不会在 checkpoint 上花费太多时间,从而影响正常的处理流程。 + 不过允许多个 checkpoint 并行进行是可行的,对于有确定的处理延迟(例如某方法所调用比较耗时的外部服务),但是仍然想进行频繁的 checkpoint 去最小化故障后重跑的 pipelines 来说,是有意义的。 + + 该选项不能和 "checkpoints 间的最小时间"同时使用。 + + - *externalized checkpoints*: 你可以配置周期存储 checkpoint 到外部系统中。Externalized checkpoints 将他们的元数据写到持久化存储上并且在 job 失败的时候*不会*被自动删除。 + 这种方式下,如果你的 job 失败,你将会有一个现有的 checkpoint 去恢复。更多的细节请看 [Externalized checkpoints 的部署文档]({{< ref "docs/ops/state/checkpoints" >}}#externalized-checkpoints)。 + + - *在 checkpoint 出错时使 task 失败或者继续进行 task*:他决定了在 task checkpoint 的过程中发生错误时,是否使 task 也失败,使失败是默认的行为。 + 或者禁用它时,这个任务将会简单的把 checkpoint 错误信息报告给 checkpoint coordinator 并继续运行。 + + - *优先从 checkpoint 恢复(prefer checkpoint for recovery)*:该属性确定 job 是否在最新的 checkpoint 回退,即使有更近的 savepoint 可用,这可以潜在地减少恢复时间(checkpoint 恢复比 savepoint 恢复更快)。 + +{{< tabs "5ef78d6e-3c62-43e9-b0a8-a987df37a8da" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +// 每 1000ms 开始一次 checkpoint +env.enableCheckpointing(1000); + +// 高级选项: + +// 设置模式为精确一次 (这是默认值) +env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + +// 确认 checkpoints 之间的时间会进行 500 ms +env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500); + +// Checkpoint 必须在一分钟内完成,否则就会被抛弃 +env.getCheckpointConfig().setCheckpointTimeout(60000); + +// 同一时间只允许一个 checkpoint 进行 +env.getCheckpointConfig().setMaxConcurrentCheckpoints(1); + +// 开启在 job 中止后仍然保留的 externalized checkpoints +env.getCheckpointConfig().enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); + +// 开启实验性的 unaligned checkpoints +env.getCheckpointConfig().enableUnalignedCheckpoints(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() + +// 每 1000ms 开始一次 checkpoint +env.enableCheckpointing(1000) + +// 高级选项: + +// 设置模式为精确一次 (这是默认值) +env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) + +// 确认 checkpoints 之间的时间会进行 500 ms +env.getCheckpointConfig.setMinPauseBetweenCheckpoints(500) + +// Checkpoint 必须在一分钟内完成,否则就会被抛弃 +env.getCheckpointConfig.setCheckpointTimeout(60000) + +// 如果 task 的 checkpoint 发生错误,会阻止 task 
失败,checkpoint 仅仅会被抛弃 +env.getCheckpointConfig.setFailTasksOnCheckpointingErrors(false) + +// 同一时间只允许一个 checkpoint 进行 +env.getCheckpointConfig.setMaxConcurrentCheckpoints(1) + +// 开启实验性的 unaligned checkpoints +env.getCheckpointConfig.enableUnalignedCheckpoints() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +env = StreamExecutionEnvironment.get_execution_environment() + +# 每 1000ms 开始一次 checkpoint +env.enable_checkpointing(1000) + +# 高级选项: + +# 设置模式为精确一次 (这是默认值) +env.get_checkpoint_config().set_checkpointing_mode(CheckpointingMode.EXACTLY_ONCE) + +# 确认 checkpoints 之间的时间会进行 500 ms +env.get_checkpoint_config().set_min_pause_between_checkpoints(500) + +# Checkpoint 必须在一分钟内完成,否则就会被抛弃 +env.get_checkpoint_config().set_checkpoint_timeout(60000) + +# 同一时间只允许一个 checkpoint 进行 +env.get_checkpoint_config().set_max_concurrent_checkpoints(1) + +# 开启在 job 中止后仍然保留的 externalized checkpoints +env.get_checkpoint_config().enable_externalized_checkpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) + +# 允许在有更近 savepoint 时回退到 checkpoint +env.get_checkpoint_config().set_prefer_checkpoint_for_recovery(True) +``` +{{< /tab >}} +{{< /tabs >}} + +### 相关的配置选项 + +更多的属性与默认值能在 `conf/flink-conf.yaml` 中设置(完整教程请阅读 [配置]({{< ref "docs/deployment/config" >}}))。 + +{{< generated/checkpointing_configuration >}} + +{{< top >}} + + +## 选择一个 State Backend + +Flink 的 [checkpointing 机制]({{< ref "docs/learn-flink/fault_tolerance" >}}) 会将 timer 以及 stateful 的 operator 进行快照,然后存储下来, +包括连接器(connectors),窗口(windows)以及任何用户[自定义的状态](state.html)。 +Checkpoint 存储在哪里取决于所配置的 **State Backend**(比如 JobManager memory、 file system、 database)。 + +默认情况下,状态是保持在 TaskManagers 的内存中,checkpoint 保存在 JobManager 的内存中。为了合适地持久化大体量状态, +Flink 支持各种各样的途径去存储 checkpoint 状态到其他的 state backends 上。通过 `StreamExecutionEnvironment.setStateBackend(…)` 来配置所选的 state backends。 + +阅读 [state backends]({{< ref "docs/ops/state/state_backends" >}}) 来查看在 job 范围和集群范围上可用的 state backends 与选项的更多细节。 + +## 迭代作业中的状态和 checkpoint + +Flink 现在为没有迭代(iterations)的作业提供一致性的处理保证。在迭代作业上开启 checkpoint 会导致异常。为了在迭代程序中强制进行 checkpoint,用户需要在开启 checkpoint 时设置一个特殊的标志: `env.enableCheckpointing(interval, CheckpointingMode.EXACTLY_ONCE, force = true)`。 + +请注意在环形边上游走的记录(以及与之相关的状态变化)在故障时会丢失。 + +{{< top >}} + diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/custom_serialization.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/custom_serialization.md new file mode 100644 index 0000000000000..905e5f0c46a97 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/custom_serialization.md @@ -0,0 +1,449 @@ +--- +title: "Custom State Serialization" +weight: 8 +type: docs +aliases: + - /zh/dev/stream/state/custom_serialization.html +--- + + +# Custom Serialization for Managed State + + +This page is targeted as a guideline for users who require the use of custom serialization for their state, covering +how to provide a custom state serializer as well as guidelines and best practices for implementing serializers that allow +state schema evolution. + +If you're simply using Flink's own serializers, this page is irrelevant and can be ignored. + +## Using custom state serializers + +When registering a managed operator or keyed state, a `StateDescriptor` is required +to specify the state's name, as well as information about the type of the state. The type information is used by Flink's +[type serialization framework](../../types_serialization.html) to create appropriate serializers for the state. 
+ +It is also possible to completely bypass this and let Flink use your own custom serializer to serialize managed states, +simply by directly instantiating the `StateDescriptor` with your own `TypeSerializer` implementation: + +{{< tabs "ee215ff6-2e21-4a40-a1b4-7f114560546f" >}} +{{< tab "Java" >}} +```java +public class CustomTypeSerializer extends TypeSerializer> {...}; + +ListStateDescriptor> descriptor = + new ListStateDescriptor<>( + "state-name", + new CustomTypeSerializer()); + +checkpointedState = getRuntimeContext().getListState(descriptor); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class CustomTypeSerializer extends TypeSerializer[(String, Integer)] {...} + +val descriptor = new ListStateDescriptor[(String, Integer)]( + "state-name", + new CustomTypeSerializer) +) + +checkpointedState = getRuntimeContext.getListState(descriptor) +``` +{{< /tab >}} +{{< /tabs >}} + +## State serializers and schema evolution + +This section explains the user-facing abstractions related to state serialization and schema evolution, and necessary +internal details about how Flink interacts with these abstractions. + +When restoring from savepoints, Flink allows changing the serializers used to read and write previously registered state, +so that users are not locked in to any specific serialization schema. When state is restored, a new serializer will be +registered for the state (i.e., the serializer that comes with the `StateDescriptor` used to access the state in the +restored job). This new serializer may have a different schema than that of the previous serializer. Therefore, when +implementing state serializers, besides the basic logic of reading / writing data, another important thing to keep in +mind is how the serialization schema can be changed in the future. + +When speaking of *schema*, in this context the term is interchangeable between referring to the *data model* of a state +type and the *serialized binary format* of a state type. The schema, generally speaking, can change for a few cases: + + 1. Data schema of the state type has evolved, i.e. adding or removing a field from a POJO that is used as state. + 2. Generally speaking, after a change to the data schema, the serialization format of the serializer will need to be upgraded. + 3. Configuration of the serializer has changed. + +In order for the new execution to have information about the *written schema* of state and detect whether or not the +schema has changed, upon taking a savepoint of an operator's state, a *snapshot* of the state serializer needs to be +written along with the state bytes. This is abstracted a `TypeSerializerSnapshot`, explained in the next subsection. + +### The `TypeSerializerSnapshot` abstraction + +```java +public interface TypeSerializerSnapshot { + int getCurrentVersion(); + void writeSnapshot(DataOuputView out) throws IOException; + void readSnapshot(int readVersion, DataInputView in, ClassLoader userCodeClassLoader) throws IOException; + TypeSerializerSchemaCompatibility resolveSchemaCompatibility(TypeSerializer newSerializer); + TypeSerializer restoreSerializer(); +} +``` +```java +public abstract class TypeSerializer { + + // ... + + public abstract TypeSerializerSnapshot snapshotConfiguration(); +} +``` + +A serializer's `TypeSerializerSnapshot` is a point-in-time information that serves as the single source of truth about +the state serializer's write schema, as well as any additional information mandatory to restore a serializer that +would be identical to the given point-in-time. 
The logic about what should be written and read at restore time +as the serializer snapshot is defined in the `writeSnapshot` and `readSnapshot` methods. + +Note that the snapshot's own write schema may also need to change over time (e.g. when you wish to add more information +about the serializer to the snapshot). To facilitate this, snapshots are versioned, with the current version +number defined in the `getCurrentVersion` method. On restore, when the serializer snapshot is read from savepoints, +the version of the schema in which the snapshot was written in will be provided to the `readSnapshot` method so that +the read implementation can handle different versions. + +At restore time, the logic that detects whether or not the new serializer's schema has changed should be implemented in +the `resolveSchemaCompatibility` method. When previous registered state is registered again with new serializers in the +restored execution of an operator, the new serializer is provided to the previous serializer's snapshot via this method. +This method returns a `TypeSerializerSchemaCompatibility` representing the result of the compatibility resolution, +which can be one of the following: + + 1. **`TypeSerializerSchemaCompatibility.compatibleAsIs()`**: this result signals that the new serializer is compatible, + meaning that the new serializer has identical schema with the previous serializer. It is possible that the new + serializer has been reconfigured in the `resolveSchemaCompatibility` method so that it is compatible. + 2. **`TypeSerializerSchemaCompatibility.compatibleAfterMigration()`**: this result signals that the new serializer has a + different serialization schema, and it is possible to migrate from the old schema by using the previous serializer + (which recognizes the old schema) to read bytes into state objects, and then rewriting the object back to bytes with + the new serializer (which recognizes the new schema). + 3. **`TypeSerializerSchemaCompatibility.incompatible()`**: this result signals that the new serializer has a + different serialization schema, but it is not possible to migrate from the old schema. + +The last bit of detail is how the previous serializer is obtained in the case that migration is required. +Another important role of a serializer's `TypeSerializerSnapshot` is that it serves as a factory to restore +the previous serializer. More specifically, the `TypeSerializerSnapshot` should implement the `restoreSerializer` method +to instantiate a serializer instance that recognizes the previous serializer's schema and configuration, and can therefore +safely read data written by the previous serializer. + +### How Flink interacts with the `TypeSerializer` and `TypeSerializerSnapshot` abstractions + +To wrap up, this section concludes how Flink, or more specifically the state backends, interact with the +abstractions. The interaction is slightly different depending on the state backend, but this is orthogonal +to the implementation of state serializers and their serializer snapshots. + +#### Off-heap state backends (e.g. `RocksDBStateBackend`) + + 1. **Register new state with a state serializer that has schema _A_** + - the registered `TypeSerializer` for the state is used to read / write state on every state access. + - State is written in schema *A*. + 2. **Take a savepoint** + - The serializer snapshot is extracted via the `TypeSerializer#snapshotConfiguration` method. 
+ - The serializer snapshot is written to the savepoint, as well as the already-serialized state bytes (with schema *A*). + 3. **Restored execution re-accesses restored state bytes with new state serializer that has schema _B_** + - The previous state serializer's snapshot is restored. + - State bytes are not deserialized on restore, only loaded back to the state backends (therefore, still in schema *A*). + - Upon receiving the new serializer, it is provided to the restored previous serializer's snapshot via the + `TypeSerializer#resolveSchemaCompatibility` to check for schema compatibility. + 4. **Migrate state bytes in backend from schema _A_ to schema _B_** + - If the compatibility resolution reflects that the schema has changed and migration is possible, schema migration is + performed. The previous state serializer which recognizes schema _A_ will be obtained from the serializer snapshot, via + `TypeSerializerSnapshot#restoreSerializer()`, and is used to deserialize state bytes to objects, which in turn + are re-written again with the new serializer, which recognizes schema _B_ to complete the migration. All entries + of the accessed state is migrated all-together before processing continues. + - If the resolution signals incompatibility, then the state access fails with an exception. + +#### Heap state backends (e.g. `MemoryStateBackend`, `FsStateBackend`) + + 1. **Register new state with a state serializer that has schema _A_** + - the registered `TypeSerializer` is maintained by the state backend. + 2. **Take a savepoint, serializing all state with schema _A_** + - The serializer snapshot is extracted via the `TypeSerializer#snapshotConfiguration` method. + - The serializer snapshot is written to the savepoint. + - State objects are now serialized to the savepoint, written in schema _A_. + 3. **On restore, deserialize state into objects in heap** + - The previous state serializer's snapshot is restored. + - The previous serializer, which recognizes schema _A_, is obtained from the serializer snapshot, via + `TypeSerializerSnapshot#restoreSerializer()`, and is used to deserialize state bytes to objects. + - From now on, all of the state is already deserialized. + 4. **Restored execution re-accesses previous state with new state serializer that has schema _B_** + - Upon receiving the new serializer, it is provided to the restored previous serializer's snapshot via the + `TypeSerializer#resolveSchemaCompatibility` to check for schema compatibility. + - If the compatibility check signals that migration is required, nothing happens in this case since for + heap backends, all state is already deserialized into objects. + - If the resolution signals incompatibility, then the state access fails with an exception. + 5. **Take another savepoint, serializing all state with schema _B_** + - Same as step 2., but now state bytes are all in schema _B_. + +## Predefined convenient `TypeSerializerSnapshot` classes + +Flink provides two abstract base `TypeSerializerSnapshot` classes that can be used for typical scenarios: +`SimpleTypeSerializerSnapshot` and `CompositeTypeSerializerSnapshot`. + +Serializers that provide these predefined snapshots as their serializer snapshot must always have their own, independent +subclass implementation. This corresponds to the best practice of not sharing snapshot classes +across different serializers, which is more thoroughly explained in the next section. 
+ +### Implementing a `SimpleTypeSerializerSnapshot` + +The `SimpleTypeSerializerSnapshot` is intended for serializers that do not have any state or configuration, +essentially meaning that the serialization schema of the serializer is solely defined by the serializer's class. + +There will only be two possible results of the compatibility resolution when using the `SimpleTypeSerializerSnapshot` +as your serializer's snapshot class: + + - `TypeSerializerSchemaCompatibility.compatibleAsIs()`, if the new serializer class remains identical, or + - `TypeSerializerSchemaCompatibility.incompatible()`, if the new serializer class is different from the previous one. + +Below is an example of how the `SimpleTypeSerializerSnapshot` is used, using Flink's `IntSerializer` as an example: 
    +```java +public class IntSerializerSnapshot extends SimpleTypeSerializerSnapshot<Integer> { + public IntSerializerSnapshot() { + super(() -> IntSerializer.INSTANCE); + } +} +``` +
    + +The `IntSerializer` has no state or configurations. Serialization format is solely defined by the serializer +class itself, and can only be read by another `IntSerializer`. Therefore, it suits the use case of the +`SimpleTypeSerializerSnapshot`. + +The base super constructor of the `SimpleTypeSerializerSnapshot` expects a `Supplier` of instances +of the corresponding serializer, regardless of whether the snapshot is currently being restored or being written during +snapshots. That supplier is used to create the restore serializer, as well as type checks to verify that the +new serializer is of the same expected serializer class. + +### Implementing a `CompositeTypeSerializerSnapshot` + +The `CompositeTypeSerializerSnapshot` is intended for serializers that rely on multiple nested serializers for serialization. + +Before further explanation, we call the serializer, which relies on multiple nested serializer(s), as the "outer" serializer in this context. +Examples for this could be `MapSerializer`, `ListSerializer`, `GenericArraySerializer`, etc. +Consider the `MapSerializer`, for example - the key and value serializers would be the nested serializers, +while `MapSerializer` itself is the "outer" serializer. + +In this case, the snapshot of the outer serializer should also contain snapshots of the nested serializers, so that +the compatibility of the nested serializers can be independently checked. When resolving the compatibility of the +outer serializer, the compatibility of each nested serializer needs to be considered. + +`CompositeTypeSerializerSnapshot` is provided to assist in the implementation of snapshots for these kind of +composite serializers. It deals with reading and writing the nested serializer snapshots, as well as resolving +the final compatibility result taking into account the compatibility of all nested serializers. + +Below is an example of how the `CompositeTypeSerializerSnapshot` is used, using Flink's `MapSerializer` as an example: +
    +```java +public class MapSerializerSnapshot<K, V> extends CompositeTypeSerializerSnapshot<Map<K, V>, MapSerializer<K, V>> { + + private static final int CURRENT_VERSION = 1; + + public MapSerializerSnapshot() { + super(MapSerializer.class); + } + + public MapSerializerSnapshot(MapSerializer<K, V> mapSerializer) { + super(mapSerializer); + } + + @Override + public int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected MapSerializer<K, V> createOuterSerializerWithNestedSerializers(TypeSerializer<?>[] nestedSerializers) { + TypeSerializer<K> keySerializer = (TypeSerializer<K>) nestedSerializers[0]; + TypeSerializer<V> valueSerializer = (TypeSerializer<V>) nestedSerializers[1]; + return new MapSerializer<>(keySerializer, valueSerializer); + } + + @Override + protected TypeSerializer<?>[] getNestedSerializers(MapSerializer<K, V> outerSerializer) { + return new TypeSerializer<?>[] { outerSerializer.getKeySerializer(), outerSerializer.getValueSerializer() }; + } +} +``` +
    + +When implementing a new serializer snapshot as a subclass of `CompositeTypeSerializerSnapshot`, +the following three methods must be implemented: + * `#getCurrentOuterSnapshotVersion()`: This method defines the version of + the current outer serializer snapshot's serialized binary format. + * `#getNestedSerializers(TypeSerializer)`: Given the outer serializer, returns its nested serializers. + * `#createOuterSerializerWithNestedSerializers(TypeSerializer[])`: + Given the nested serializers, create an instance of the outer serializer. + +The above example is a `CompositeTypeSerializerSnapshot` where there is no extra information to be snapshotted +apart from the nested serializers' snapshots. Therefore, its outer snapshot version can be expected to never +require an uptick. Some other serializers, however, contain some additional static configuration +that needs to be persisted along with the nested component serializer. An example for this would be Flink's +`GenericArraySerializer`, which contains as configuration the class of the array element type, besides +the nested element serializer. + +In these cases, an additional three methods need to be implemented on the `CompositeTypeSerializerSnapshot`: + * `#writeOuterSnapshot(DataOutputView)`: defines how the outer snapshot information is written. + * `#readOuterSnapshot(int, DataInputView, ClassLoader)`: defines how the outer snapshot information is read. + * `#resolveOuterSchemaCompatibility(TypeSerializer)`: checks the compatibility based on the outer snapshot information. + +By default, the `CompositeTypeSerializerSnapshot` assumes that there isn't any outer snapshot information to +read / write, and therefore has empty default implementations for the above methods. If the subclass +has outer snapshot information, then all three methods must be implemented. + +Below is an example of how the `CompositeTypeSerializerSnapshot` is used for composite serializer snapshots +that do have outer snapshot information, using Flink's `GenericArraySerializer` as an example: + +
    +```java +public final class GenericArraySerializerSnapshot<C> extends CompositeTypeSerializerSnapshot<C[], GenericArraySerializer<C>> { + + private static final int CURRENT_VERSION = 1; + + private Class<C> componentClass; + + public GenericArraySerializerSnapshot() { + super(GenericArraySerializer.class); + } + + public GenericArraySerializerSnapshot(GenericArraySerializer<C> genericArraySerializer) { + super(genericArraySerializer); + this.componentClass = genericArraySerializer.getComponentClass(); + } + + @Override + protected int getCurrentOuterSnapshotVersion() { + return CURRENT_VERSION; + } + + @Override + protected void writeOuterSnapshot(DataOutputView out) throws IOException { + out.writeUTF(componentClass.getName()); + } + + @Override + protected void readOuterSnapshot(int readOuterSnapshotVersion, DataInputView in, ClassLoader userCodeClassLoader) throws IOException { + this.componentClass = InstantiationUtil.resolveClassByName(in, userCodeClassLoader); + } + + @Override + protected OuterSchemaCompatibility resolveOuterSchemaCompatibility(GenericArraySerializer<C> newSerializer) { + return (this.componentClass == newSerializer.getComponentClass()) + ? OuterSchemaCompatibility.COMPATIBLE_AS_IS + : OuterSchemaCompatibility.INCOMPATIBLE; + } + + @Override + protected GenericArraySerializer<C> createOuterSerializerWithNestedSerializers(TypeSerializer<?>[] nestedSerializers) { + TypeSerializer<C> componentSerializer = (TypeSerializer<C>) nestedSerializers[0]; + return new GenericArraySerializer<>(componentClass, componentSerializer); + } + + @Override + protected TypeSerializer<?>[] getNestedSerializers(GenericArraySerializer<C> outerSerializer) { + return new TypeSerializer<?>[] { outerSerializer.getComponentSerializer() }; + } +} +``` +
    + +There are two important things to notice in the above code snippet. First of all, since this +`CompositeTypeSerializerSnapshot` implementation has outer snapshot information that is written as part of the snapshot, +the outer snapshot version, as defined by `getCurrentOuterSnapshotVersion()`, must be upticked whenever the +serialization format of the outer snapshot information changes. + +Second of all, notice how we avoid using Java serialization when writing the component class, by only writing +the classname and dynamically loading it when reading back the snapshot. Avoiding Java serialization for writing +contents of serializer snapshots is in general a good practice to follow. More details about this is covered in the +next section. + +## Implementation notes and best practices + +#### 1. Flink restores serializer snapshots by instantiating them with their classname + +A serializer's snapshot, being the single source of truth for how a registered state was serialized, serves as an +entry point to reading state in savepoints. In order to be able to restore and access previous state, the previous state +serializer's snapshot must be able to be restored. + +Flink restores serializer snapshots by first instantiating the `TypeSerializerSnapshot` with its classname (written +along with the snapshot bytes). Therefore, to avoid being subject to unintended classname changes or instantiation +failures, `TypeSerializerSnapshot` classes should: + + - avoid being implemented as anonymous classes or nested classes, + - have a public, nullary constructor for instantiation + +#### 2. Avoid sharing the same `TypeSerializerSnapshot` class across different serializers + +Since schema compatibility checks goes through the serializer snapshots, having multiple serializers returning +the same `TypeSerializerSnapshot` class as their snapshot would complicate the implementation for the +`TypeSerializerSnapshot#resolveSchemaCompatibility` and `TypeSerializerSnapshot#restoreSerializer()` method. + +This would also be a bad separation of concerns; a single serializer's serialization schema, +configuration, as well as how to restore it, should be consolidated in its own dedicated `TypeSerializerSnapshot` class. + +#### 3. Avoid using Java serialization for serializer snapshot content + +Java serialization should not be used at all when writing contents of a persisted serializer snapshot. +Take for example, a serializer which needs to persist a class of its target type as part of its snapshot. +Information about the class should be persisted by writing the class name, instead of directly serializing the class +using Java. When reading the snapshot, the class name is read, and used to dynamically load the class via the name. + +This practice ensures that serializer snapshots can always be safely read. In the above example, if the type class +was persisted using Java serialization, the snapshot may no longer be readable once the class implementation has changed +and is no longer binary compatible according to Java serialization specifics. + +## Migrating from deprecated serializer snapshot APIs before Flink 1.7 + +This section is a guide for API migration from serializers and serializer snapshots that existed before Flink 1.7. + +Before Flink 1.7, serializer snapshots were implemented as a `TypeSerializerConfigSnapshot` (which is now deprecated, +and will eventually be removed in the future to be fully replaced by the new `TypeSerializerSnapshot` interface). 
+Moreover, the responsibility of serializer schema compatibility checks lived within the `TypeSerializer`, +implemented in the `TypeSerializer#ensureCompatibility(TypeSerializerConfigSnapshot)` method. + +Another major difference between the new and old abstractions is that the deprecated `TypeSerializerConfigSnapshot` +did not have the capability of instantiating the previous serializer. Therefore, in the case where your serializer +still returns a subclass of `TypeSerializerConfigSnapshot` as its snapshot, the serializer instance itself will always +be written to savepoints using Java serialization so that the previous serializer may be available at restore time. +This is very undesirable, since whether or not restoring the job will be successful is susceptible to the availability +of the previous serializer's class, or in general, whether or not the serializer instance can be read back at restore +time using Java serialization. This means that you will be limited to the same serializer for your state, +which could be problematic once you want to upgrade serializer classes or perform schema migration. + +To be future-proof and have flexibility to migrate your state serializers and schema, it is highly recommended to +migrate from the old abstractions. The steps to do this are as follows: + + 1. Implement a new subclass of `TypeSerializerSnapshot`. This will be the new snapshot for your serializer. + 2. Return the new `TypeSerializerSnapshot` as the serializer snapshot for your serializer in the + `TypeSerializer#snapshotConfiguration()` method (a minimal sketch of steps 1 and 2 is shown after this list). + 3. Restore the job from the savepoint that existed before Flink 1.7, and then take a savepoint again. + Note that at this step, the old `TypeSerializerConfigSnapshot` of the serializer must still exist in the classpath, + and the implementation for the `TypeSerializer#ensureCompatibility(TypeSerializerConfigSnapshot)` method must not be + removed. The purpose of this process is to replace the `TypeSerializerConfigSnapshot` written in old savepoints + with the newly implemented `TypeSerializerSnapshot` for the serializer. + 4. Once you have a savepoint taken with Flink 1.7, the savepoint will contain `TypeSerializerSnapshot` as the + state serializer snapshot, and the serializer instance will no longer be written in the savepoint. + At this point, it is now safe to remove all implementations of the old abstraction (remove the old + `TypeSerializerConfigSnapshot` implementation as well as the + `TypeSerializer#ensureCompatibility(TypeSerializerConfigSnapshot)` from the serializer). 
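+
+To make steps 1 and 2 concrete, here is a minimal, non-authoritative sketch; the `MyState` type and the
+`MyStateSerializer` class are hypothetical, and the serializer is assumed to be stateless with a public nullary
+constructor (a serializer with nested serializers or configuration would instead build on `CompositeTypeSerializerSnapshot`):
+
+```java
+// Step 1: the new snapshot class for the hypothetical, stateless MyStateSerializer
+public class MyStateSerializerSnapshot extends SimpleTypeSerializerSnapshot<MyState> {
+    public MyStateSerializerSnapshot() {
+        super(MyStateSerializer::new);
+    }
+}
+
+// Step 2: inside the existing MyStateSerializer, return the new snapshot.
+// The old TypeSerializerConfigSnapshot subclass and ensureCompatibility() must remain
+// on the classpath until steps 3 and 4 have been completed.
+//
+// @Override
+// public TypeSerializerSnapshot<MyState> snapshotConfiguration() {
+//     return new MyStateSerializerSnapshot();
+// }
+```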
+ +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/queryable_state.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/queryable_state.md new file mode 100644 index 0000000000000..e930f0facd58e --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/queryable_state.md @@ -0,0 +1,270 @@ +--- +title: "Queryable State" +weight: 5 +type: docs +aliases: + - /zh/dev/stream/state/queryable_state.html +is_beta: true +--- + + +# Queryable State + +{{< hint warning >}} +目前 querable state 的客户端 API 还在不断演进,不保证现有接口的稳定性。在后续的 Flink 版本中有可能发生 API 变化。 +{{< /hint >}} + +简而言之, 这个特性将 Flink 的 managed keyed (partitioned) state +(参考 [Working with State]({{< ref "docs/dev/datastream/fault-tolerance/state" >}})) 暴露给外部,从而用户可以在 Flink 外部查询作业 state。 +在某些场景中,Queryable State 消除了对外部系统的分布式操作以及事务的需求,比如 KV 存储系统,而这些外部系统往往会成为瓶颈。除此之外,这个特性对于调试作业非常有用。 + +{{< hint warning >}} + 注意: 进行查询时,state 会在并发线程中被访问,但 state 不会进行同步和拷贝。这种设计是为了避免同步和拷贝带来的作业延时。对于使用 Java 堆内存的 state backend, + 比如 MemoryStateBackend 或者 FsStateBackend,它们获取状态时不会进行拷贝,而是直接引用状态对象,所以对状态的 read-modify-write 是不安全的,并且可能会因为并发修改导致查询失败。但 RocksDBStateBackend 是安全的,不会遇到上述问题。 +{{< /hint >}} + +## 架构 + +在展示如何使用 Queryable State 之前,先简单描述一下该特性的组成部分,主要包括以下三部分: + + 1. `QueryableStateClient`,默认运行在 Flink 集群外部,负责提交用户的查询请求; + 2. `QueryableStateClientProxy`,运行在每个 `TaskManager` 上(*即* Flink 集群内部),负责接收客户端的查询请求,从所负责的 Task Manager 获取请求的 state,并返回给客户端; + 3. `QueryableStateServer`, 运行在 `TaskManager` 上,负责服务本地存储的 state。 + +客户端连接到一个代理,并发送请求获取特定 `k` 对应的 state。 如 [Working with State]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}) 所述,keyed state 按照 +*Key Groups* 进行划分,每个 `TaskManager` 会分配其中的一些 key groups。代理会询问 `JobManager` 以找到 `k` 所属 key group 的 TaskManager。根据返回的结果, 代理将会向运行在 `TaskManager` 上的 `QueryableStateServer` 查询 `k` 对应的 state, 并将结果返回给客户端。 + +## 激活 Queryable State + +为了在 Flink 集群上使用 queryable state,需要进行以下操作: + + 1. 将 `flink-queryable-state-runtime{{ site.scala_version_suffix }}-{{site.version }}.jar` +从 [Flink distribution](https://flink.apache.org/downloads.html "Apache Flink: Downloads") 的 `opt/` 目录拷贝到 `lib/` 目录; + 2. 将参数 `queryable-state.enable` 设置为 `true`。详细信息以及其它配置可参考文档 [Configuration]({{< ref "docs/deployment/config" >}}#queryable-state)。 + +为了验证集群的 queryable state 已经被激活,可以检查任意 task manager 的日志中是否包含 "Started the Queryable State Proxy Server @ ..."。 + +## 将 state 设置为可查询的 + +激活集群的 queryable state 功能后,还要将 state 设置为可查询的才能对外可见,可以通过以下两种方式进行设置: + +* 创建 `QueryableStateStream`,它会作为一个 sink,并将输入数据转化为 queryable state; +* 通过 `stateDescriptor.setQueryable(String queryableStateName)` 将 state 描述符所表示的 keyed state 设置成可查询的。 + +接下来的部分将详细解释这两种方式。 + +### Queryable State Stream + +在 `KeyedStream` 上调用 `.asQueryableState(stateName, stateDescriptor)` 将会返回一个 `QueryableStateStream`, 它会将流数据转化为 queryable state。 +对应不同的 state 类型,`asQueryableState()` 有以下一些方法变体: + +```java +// ValueState +QueryableStateStream asQueryableState( + String queryableStateName, + ValueStateDescriptor stateDescriptor) + +// Shortcut for explicit ValueStateDescriptor variant +QueryableStateStream asQueryableState(String queryableStateName) + +// ReducingState +QueryableStateStream asQueryableState( + String queryableStateName, + ReducingStateDescriptor stateDescriptor) +``` + + +
    + 注意: 没有可查询的 ListState sink,因为这种情况下 list 会不断增长,并且可能不会被清理,最终会消耗大量的内存。 +
    + +返回的 `QueryableStateStream` 可以被视作一个sink,而且**不能再**被进一步转换。在内部实现上,一个 `QueryableStateStream` 被转换成一个 operator,使用输入的数据来更新 queryable state。state 如何更新是由 `asQueryableState` 提供的 `StateDescriptor` 来决定的。在下面的代码中, keyed stream 的所有数据将会通过 `ValueState.update(value)` 来更新状态: + +```java +stream.keyBy(value -> value.f0).asQueryableState("query-name") +``` + +这个行为类似于 Scala API 中的 `flatMapWithState`。 + +### Managed Keyed State + +operator 中的 Managed keyed state +(参考 [Using Managed Keyed State]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}#using-managed-keyed-state)) +可以通过 `StateDescriptor.setQueryable(String queryableStateName)` 将 state descriptor 设置成可查询的,从而使 state 可查询,如下所示: + +```java +ValueStateDescriptor> descriptor = + new ValueStateDescriptor<>( + "average", // the state name + TypeInformation.of(new TypeHint>() {})); // type information +descriptor.setQueryable("query-name"); // queryable state name +``` + +
    + 注意: 参数 queryableStateName 可以任意选取,并且只被用来进行查询,它可以和 state 的名称不同。 +
    + +这种方式不会限制 state 类型,即任意的 `ValueState`、`ReduceState`、`ListState`、`MapState`、`AggregatingState` 以及已弃用的 `FoldingState` +均可作为 queryable state。 + +## 查询 state + +目前为止,你已经激活了集群的 queryable state 功能,并且将一些 state 设置成了可查询的,接下来将会展示如何进行查询。 + +为了进行查询,可以使用辅助类 `QueryableStateClient`,这个类位于 `flink-queryable-state-client` 的 jar 中,在项目的 `pom.xml` 需要显示添加对 `flink-queryable-state-client` 和 `flink-core` 的依赖, 如下所示: + +
    +```xml + + org.apache.flink + flink-core + {{< version >}} + + + org.apache.flink + flink-queryable-state-client-java + {{< version >}} + +``` +
    + +关于依赖的更多信息, 可以参考如何 [配置 Flink 项目]({{< ref "docs/dev/datastream/project-configuration" >}}). + +`QueryableStateClient` 将提交你的请求到内部代理,代理会处理请求并返回结果。客户端的初始化只需要提供一个有效的 `TaskManager` 主机名 +(每个 task manager 上都运行着一个 queryable state 代理),以及代理监听的端口号。关于如何配置代理以及端口号可以参考 [Configuration Section](#configuration). + +```java +QueryableStateClient client = new QueryableStateClient(tmHostname, proxyPort); +``` + +客户端就绪后,为了查询类型为 `K` 的 key,以及类型为 `V` 的state,可以使用如下方法: + +```java +CompletableFuture getKvState( + JobID jobId, + String queryableStateName, + K key, + TypeInformation keyTypeInfo, + StateDescriptor stateDescriptor) +``` + +该方法会返回一个最终将包含 state 的 queryable state 实例,该实例可通过 JobID 和 queryableStateName 识别。在方法参数中,`key` 用来指定所要查询的状态所属的 key。 +`keyTypeInfo` 告诉 Flink 如何对 key 进行序列化和反序列化。`stateDescriptor` 包含了所请求 state 的必要信息,即 state 类型(`Value`,`Reduce` 等等), +以及如何对其进行序列化和反序列。 + +细心的读者会注意到返回的 future 包含类型为 `S` 的值,*即*一个存储实际值的 `State` 对象。它可以是Flink支持的任何类型的 state:`ValueState`、`ReduceState`、 +`ListState`、`MapState`、`AggregatingState` 以及弃用的 `FoldingState`。 + +
    + 注意: 这些 state 对象不允许对其中的 state 进行修改。你可以通过 valueState.get() 获取实际的 state, + 或者通过 mapState.entries() 遍历所有 ,但是不能修改它们。举例来说,对返回的 list state 调用 add() + 方法将会导致 UnsupportedOperationException。 +
    + +
    + 注意: 客户端是异步的,并且可能被多个线程共享。客户端不再使用后需要通过 QueryableStateClient.shutdown() + 来终止,从而释放资源。 +
    + +### 示例 + +下面的例子扩展自 `CountWindowAverage` +(参考 [Using Managed Keyed State]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}#using-managed-keyed-state)), +将其中的 state 设置成可查询的,并展示了如何进行查询: + +```java +public class CountWindowAverage extends RichFlatMapFunction, Tuple2> { + + private transient ValueState> sum; // a tuple containing the count and the sum + + @Override + public void flatMap(Tuple2 input, Collector> out) throws Exception { + Tuple2 currentSum = sum.value(); + currentSum.f0 += 1; + currentSum.f1 += input.f1; + sum.update(currentSum); + + if (currentSum.f0 >= 2) { + out.collect(new Tuple2<>(input.f0, currentSum.f1 / currentSum.f0)); + sum.clear(); + } + } + + @Override + public void open(Configuration config) { + ValueStateDescriptor> descriptor = + new ValueStateDescriptor<>( + "average", // the state name + TypeInformation.of(new TypeHint>() {})); // type information + descriptor.setQueryable("query-name"); + sum = getRuntimeContext().getState(descriptor); + } +} +``` + +上面的代码作为作业运行后,可以获取作业的 ID,然后可以通过下面的方式查询任何 key 下的 state。 + +```java +QueryableStateClient client = new QueryableStateClient(tmHostname, proxyPort); + +// the state descriptor of the state to be fetched. +ValueStateDescriptor> descriptor = + new ValueStateDescriptor<>( + "average", + TypeInformation.of(new TypeHint>() {})); + +CompletableFuture>> resultFuture = + client.getKvState(jobId, "query-name", key, BasicTypeInfo.LONG_TYPE_INFO, descriptor); + +// now handle the returned value +resultFuture.thenAccept(response -> { + try { + Tuple2 res = response.get(); + } catch (Exception e) { + e.printStackTrace(); + } +}); +``` + +## Configuration + +下面的配置会影响 queryable state 服务器端和客户端的行为,它们定义在 `QueryableStateOptions`。 + +### State Server +* `queryable-state.server.ports`: 服务器端口范围,如果同一台机器上运行了多个 task manager,可以避免端口冲突。指定的可以是一个具体的端口号,如 "9123", + 可以是一个端口范围,如 "50100-50200",或者可以是端口范围以及端口号的组合,如 "50100-50200,50300-50400,51234"。默认端口号是 9067。 +* `queryable-state.server.network-threads`: 服务器端 network (event loop) thread 的数量,用来接收查询请求 (如果设置为0,则线程数为 slot 数)。 +* `queryable-state.server.query-threads`: 服务器端处理查询请求的线程数 (如果设置为0,则线程数为 slot 数)。 + + +### Proxy +* `queryable-state.proxy.ports`: 代理的服务端口范围。如果同一台机器上运行了多个 task manager,可以避免端口冲突。指定的可以是一个具体的端口号,如 "9123", + 可以是一个端口范围,如"50100-50200",或者可以是端口范围以及端口号的组合,如 "50100-50200,50300-50400,51234"。默认端口号是 9069。 +* `queryable-state.proxy.network-threads`: 代理上 network (event loop) thread 的数量,用来接收查询请求 (如果设置为0,则线程数为 slot 数)。 +* `queryable-state.proxy.query-threads`: 代理上处理查询请求的线程数 (如果设置为0,则线程数为 slot 数)。 + +## 限制 + +* queryable state 的生命周期受限于作业的生命周期,*比如* tasks 在启动时注册可查询状态,并在退出时注销。在后续版本中,希望能够将其解耦 +从而允许 task 结束后依然能够查询 state,并且通过 state 备份来加速恢复。 +* 目前是通过 tell 来通知可用的 KvState。将来会使用 asks 和 acknowledgements 来提升稳定性。 +* 服务器端和客户端会记录请求的统计信息。因为统计信息目前不会暴露给外部,所以这个功能默认没有开启。如果将来支持通过 Metrics 系统发布这些数据,将开启统计功能。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/schema_evolution.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/schema_evolution.md new file mode 100644 index 0000000000000..fef9d655f5aef --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/schema_evolution.md @@ -0,0 +1,105 @@ +--- +title: "状态数据结构升级" +weight: 7 +type: docs +aliases: + - /zh/dev/stream/state/schema_evolution.html +--- + + +# 状态数据结构升级 + +Apache Flink 流应用通常被设计为永远或者长时间运行。 +与所有长期运行的服务一样,应用程序需要随着业务的迭代而进行调整。 +应用所处理的数据 schema 也会随着进行变化。 + +此页面概述了如何升级状态类型的数据 schema 。 +目前对不同类型的状态结构(`ValueState`、`ListState` 等)有不同的限制 + +请注意,此页面的信息只与 Flink 自己生成的状态序列化器相关 [类型序列化框架]({{< ref 
"docs/dev/serialization/types_serialization" >}})。 +也就是说,在声明状态时,状态描述符不可以配置为使用特定的 TypeSerializer 或 TypeInformation , +在这种情况下,Flink 会推断状态类型的信息: + +```java +ListStateDescriptor descriptor = + new ListStateDescriptor<>( + "state-name", + MyPojoType.class); + +checkpointedState = getRuntimeContext().getListState(descriptor); +``` + +在内部,状态是否可以进行升级取决于用于读写持久化状态字节的序列化器。 +简而言之,状态数据结构只有在其序列化器正确支持时才能升级。 +这一过程是被 Flink 的类型序列化框架生成的序列化器透明处理的([下面]({{< ref "docs/dev/datastream/fault-tolerance/schema_evolution" >}}#数据结构升级支持的数据类型) 列出了当前的支持范围)。 + +如果你想要为你的状态类型实现自定义的 `TypeSerializer` 并且想要学习如何实现支持状态数据结构升级的序列化器, +可以参考 [自定义状态序列化器]({{< ref "docs/dev/datastream/fault-tolerance/custom_serialization" >}})。 +本文档也包含一些用于支持状态数据结构升级的状态序列化器与 Flink 状态后端存储相互作用的必要内部细节。 + +## 升级状态数据结构 + +为了对给定的状态类型进行升级,你需要采取以下几个步骤: + + 1. 对 Flink 流作业进行 savepoint 操作。 + 2. 升级程序中的状态类型(例如:修改你的 Avro 结构)。 + 3. 从 savepoint 恢复作业。当第一次访问状态数据时,Flink 会判断状态数据 schema 是否已经改变,并进行必要的迁移。 + +用来适应状态结构的改变而进行的状态迁移过程是自动发生的,并且状态之间是互相独立的。 +Flink 内部是这样来进行处理的,首先会检查新的序列化器相对比之前的序列化器是否有不同的状态结构;如果有, +那么之前的序列化器用来读取状态数据字节到对象,然后使用新的序列化器将对象回写为字节。 + +更多的迁移过程细节不在本文档谈论的范围;可以参考[文档]({{< ref "docs/dev/datastream/fault-tolerance/custom_serialization" >}})。 + +## 数据结构升级支持的数据类型 + +目前,仅支持 POJO 和 Avro 类型的 schema 升级 +因此,如果你比较关注于状态数据结构的升级,那么目前来看强烈推荐使用 Pojo 或者 Avro 状态数据类型。 + +我们有计划支持更多的复合类型;更多的细节可以参考 [FLINK-10896](https://issues.apache.org/jira/browse/FLINK-10896)。 + +### POJO 类型 + +Flink 基于下面的规则来支持 [POJO 类型]({{< ref "docs/dev/serialization/types_serialization" >}}#pojo-类型的规则)结构的升级: + + 1. 可以删除字段。一旦删除,被删除字段的前值将会在将来的 checkpoints 以及 savepoints 中删除。 + 2. 可以添加字段。新字段会使用类型对应的默认值进行初始化,比如 [Java 类型](https://docs.oracle.com/javase/tutorial/java/nutsandbolts/datatypes.html)。 + 3. 不可以修改字段的声明类型。 + 4. 不可以改变 POJO 类型的类名,包括类的命名空间。 + +需要注意,只有从 1.8.0 及以上版本的 Flink 生产的 savepoint 进行恢复时,POJO 类型的状态才可以进行升级。 +对 1.8.0 版本之前的 Flink 是没有办法进行 POJO 类型升级的。 + +### Avro 类型 + +Flink 完全支持 Avro 状态类型的升级,只要数据结构的修改是被 +[Avro 的数据结构解析规则](http://avro.apache.org/docs/current/spec.html#Schema+Resolution)认为兼容的即可。 + +一个例外是如果新的 Avro 数据 schema 生成的类无法被重定位或者使用了不同的命名空间,在作业恢复时状态数据会被认为是不兼容的。 + +{% warn Attention %} Schema evolution of keys is not supported. + +Example: RocksDB state backend relies on binary objects identity, rather than `hashCode` method implementation. Any changes to the keys object structure could lead to non deterministic behaviour. + +{% warn Attention %} **Kryo** cannot be used for schema evolution. + +When Kryo is used, there is no possibility for the framework to verify if any incompatible changes have been made. + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/state.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/state.md new file mode 100644 index 0000000000000..66b4d7838b0b9 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/state.md @@ -0,0 +1,880 @@ +--- +title: "Working with State" +weight: 2 +type: docs +aliases: + - /zh/dev/stream/state/state.html + - /zh/apis/streaming/state.html +--- + + +# Working with State + +In this section you will learn about the APIs that Flink provides for writing +stateful programs. Please take a look at [Stateful Stream +Processing]({{< ref "docs/concepts/stateful-stream-processing" >}}) +to learn about the concepts behind stateful stream processing. + +## Keyed DataStream + +If you want to use keyed state, you first need to specify a key on a +`DataStream` that should be used to partition the state (and also the records +in the stream themselves). 
You can specify a key using `keyBy(KeySelector)` +in Java/Scala API or `key_by(KeySelector)` in Python API on a `DataStream`. +This will yield a `KeyedStream`, which then allows operations that use keyed state. + +A key selector function takes a single record as input and returns the key for +that record. The key can be of any type and **must** be derived from +deterministic computations. + +The data model of Flink is not based on key-value pairs. Therefore, you do not +need to physically pack the data set types into keys and values. Keys are +"virtual": they are defined as functions over the actual data to guide the +grouping operator. + +The following example shows a key selector function that simply returns the +field of an object: + +{{< tabs "9730828c-2f0f-48c8-9a5c-4ec415d0c492" >}} +{{< tab "Java" >}} +```java +// some ordinary POJO +public class WC { + public String word; + public int count; + + public String getWord() { return word; } +} +DataStream words = // [...] +KeyedStream keyed = words + .keyBy(WC::getWord); +``` + +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// some ordinary case class +case class WC(word: String, count: Int) +val words: DataStream[WC] = // [...] +val keyed = words.keyBy( _.word ) +``` +{{< /tab >}} + +{{< tab "Python" >}} +```python +words = # type: DataStream[Row] +keyed = words.key_by(lambda row: row[0]) +``` +{{< /tab >}} +{{< /tabs >}} + +#### Tuple Keys and Expression Keys + +Flink also has two alternative ways of defining keys: tuple keys and expression +keys in the Java/Scala API(still not supported in the Python API). With this you can +specify keys using tuple field indices or expressions +for selecting fields of objects. We don't recommend using these today but you +can refer to the Javadoc of DataStream to learn about them. Using a KeySelector +function is strictly superior: with Java lambdas they are easy to use and they +have potentially less overhead at runtime. 
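+
+As a brief illustration of that flexibility, the sketch below (using a hypothetical `Event` POJO with `getCountry()`
+and `getAge()` accessors) derives a composite key from two fields with a single `KeySelector`:
+
+```java
+DataStream<Event> events = // [...]
+
+// a composite key built from two fields; no tuple or expression keys required
+KeyedStream<Event, Tuple2<String, Integer>> keyedEvents = events
+    .keyBy(new KeySelector<Event, Tuple2<String, Integer>>() {
+        @Override
+        public Tuple2<String, Integer> getKey(Event event) {
+            return Tuple2.of(event.getCountry(), event.getAge());
+        }
+    });
+```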
+ +{{< top >}} + +## 使用 Keyed State + +keyed state 接口提供不同类型状态的访问接口,这些状态都作用于当前输入数据的 key 下。换句话说,这些状态仅可在 `KeyedStream` +上使用,在Java/Scala API上可以通过 `stream.keyBy(...)` 得到 `KeyedStream`,在Python API上可以通过 `stream.key_by(...)` 得到 `KeyedStream`。 + +接下来,我们会介绍不同类型的状态,然后介绍如何使用他们。所有支持的状态类型如下所示: + +* `ValueState`: 保存一个可以更新和检索的值(如上所述,每个值都对应到当前的输入数据的 key,因此算子接收到的每个 key 都可能对应一个值)。 +这个值可以通过 `update(T)` 进行更新,通过 `T value()` 进行检索。 + + +* `ListState`: 保存一个元素的列表。可以往这个列表中追加数据,并在当前的列表上进行检索。可以通过 + `add(T)` 或者 `addAll(List)` 进行添加元素,通过 `Iterable get()` 获得整个列表。还可以通过 `update(List)` 覆盖当前的列表。 + +* `ReducingState`: 保存一个单值,表示添加到状态的所有值的聚合。接口与 `ListState` 类似,但使用 `add(T)` 增加元素,会使用提供的 `ReduceFunction` 进行聚合。 + +* `AggregatingState`: 保留一个单值,表示添加到状态的所有值的聚合。和 `ReducingState` 相反的是, 聚合类型可能与 添加到状态的元素的类型不同。 +接口与 `ListState` 类似,但使用 `add(IN)` 添加的元素会用指定的 `AggregateFunction` 进行聚合。 + +* `MapState`: 维护了一个映射列表。 你可以添加键值对到状态中,也可以获得反映当前所有映射的迭代器。使用 `put(UK,UV)` 或者 `putAll(Map)` 添加映射。 + 使用 `get(UK)` 检索特定 key。 使用 `entries()`,`keys()` 和 `values()` 分别检索映射、键和值的可迭代视图。你还可以通过 `isEmpty()` 来判断是否包含任何键值对。 + +所有类型的状态还有一个`clear()` 方法,清除当前 key 下的状态数据,也就是当前输入元素的 key。 + +请牢记,这些状态对象仅用于与状态交互。状态本身不一定存储在内存中,还可能在磁盘或其他位置。 +另外需要牢记的是从状态中获取的值取决于输入元素所代表的 key。 因此,在不同 key 上调用同一个接口,可能得到不同的值。 + +你必须创建一个 `StateDescriptor`,才能得到对应的状态句柄。 这保存了状态名称(正如我们稍后将看到的,你可以创建多个状态,并且它们必须具有唯一的名称以便可以引用它们), +状态所持有值的类型,并且可能包含用户指定的函数,例如`ReduceFunction`。 根据不同的状态类型,可以创建`ValueStateDescriptor`,`ListStateDescriptor`, +`AggregatingStateDescriptor`, `ReducingStateDescriptor` 或 `MapStateDescriptor`。 + +状态通过 `RuntimeContext` 进行访问,因此只能在 *rich functions* 中使用。请参阅[这里]({{< ref "docs/dev/datastream/user_defined_functions" >}}#rich-functions)获取相关信息, +但是我们很快也会看到一个例子。`RichFunction` 中 `RuntimeContext` 提供如下方法: + +* `ValueState getState(ValueStateDescriptor)` +* `ReducingState getReducingState(ReducingStateDescriptor)` +* `ListState getListState(ListStateDescriptor)` +* `AggregatingState getAggregatingState(AggregatingStateDescriptor)` +* `MapState getMapState(MapStateDescriptor)` + +下面是一个 `FlatMapFunction` 的例子,展示了如何将这些部分组合起来: + +{{< tabs "76a14a59-71da-4619-a2f8-463a58515e5e" >}} +{{< tab "Java" >}} +```java +public class CountWindowAverage extends RichFlatMapFunction, Tuple2> { + + /** + * The ValueState handle. The first field is the count, the second field a running sum. 
+ */ + private transient ValueState> sum; + + @Override + public void flatMap(Tuple2 input, Collector> out) throws Exception { + + // access the state value + Tuple2 currentSum = sum.value(); + + // update the count + currentSum.f0 += 1; + + // add the second field of the input value + currentSum.f1 += input.f1; + + // update the state + sum.update(currentSum); + + // if the count reaches 2, emit the average and clear the state + if (currentSum.f0 >= 2) { + out.collect(new Tuple2<>(input.f0, currentSum.f1 / currentSum.f0)); + sum.clear(); + } + } + + @Override + public void open(Configuration config) { + ValueStateDescriptor> descriptor = + new ValueStateDescriptor<>( + "average", // the state name + TypeInformation.of(new TypeHint>() {}), // type information + Tuple2.of(0L, 0L)); // default value of the state, if nothing was set + sum = getRuntimeContext().getState(descriptor); + } +} + +// this can be used in a streaming program like this (assuming we have a StreamExecutionEnvironment env) +env.fromElements(Tuple2.of(1L, 3L), Tuple2.of(1L, 5L), Tuple2.of(1L, 7L), Tuple2.of(1L, 4L), Tuple2.of(1L, 2L)) + .keyBy(value -> value.f0) + .flatMap(new CountWindowAverage()) + .print(); + +// the printed output will be (1,4) and (1,5) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class CountWindowAverage extends RichFlatMapFunction[(Long, Long), (Long, Long)] { + + private var sum: ValueState[(Long, Long)] = _ + + override def flatMap(input: (Long, Long), out: Collector[(Long, Long)]): Unit = { + + // access the state value + val tmpCurrentSum = sum.value + + // If it hasn't been used before, it will be null + val currentSum = if (tmpCurrentSum != null) { + tmpCurrentSum + } else { + (0L, 0L) + } + + // update the count + val newSum = (currentSum._1 + 1, currentSum._2 + input._2) + + // update the state + sum.update(newSum) + + // if the count reaches 2, emit the average and clear the state + if (newSum._1 >= 2) { + out.collect((input._1, newSum._2 / newSum._1)) + sum.clear() + } + } + + override def open(parameters: Configuration): Unit = { + sum = getRuntimeContext.getState( + new ValueStateDescriptor[(Long, Long)]("average", createTypeInformation[(Long, Long)]) + ) + } +} + + +object ExampleCountWindowAverage extends App { + val env = StreamExecutionEnvironment.getExecutionEnvironment + + env.fromCollection(List( + (1L, 3L), + (1L, 5L), + (1L, 7L), + (1L, 4L), + (1L, 2L) + )).keyBy(_._1) + .flatMap(new CountWindowAverage()) + .print() + // the printed output will be (1,4) and (1,5) + + env.execute("ExampleKeyedState") +} +``` +{{< /tab >}} + +{{< tab "Python" >}} +```python +from pyflink.common.typeinfo import Types +from pyflink.datastream import StreamExecutionEnvironment, FlatMapFunction, RuntimeContext +from pyflink.datastream.state import ValueStateDescriptor + +class CountWindowAverage(FlatMapFunction): + + def __init__(self): + self.sum = None + + def open(self, runtime_context: RuntimeContext): + descriptor = ValueStateDescriptor( + "average", # the state name + Types.TUPLE([Types.LONG(), Types.LONG()]) # type information + ) + self.sum = runtime_context.get_state(descriptor) + + def flat_map(self, value): + # access the state value + current_sum = self.sum.value() + if current_sum is None: + current_sum = (0, 0) + + # update the count + current_sum = (current_sum[0] + 1, current_sum[1] + value[1]) + + # update the state + self.sum.update(current_sum) + + # if the count reaches 2, emit the average and clear the state + if current_sum[0] >= 2: + self.sum.clear() + yield value[0], 
int(current_sum[1] / current_sum[0]) + + +env = StreamExecutionEnvironment.get_execution_environment() +env.from_collection([(1, 3), (1, 5), (1, 7), (1, 4), (1, 2)]) \ + .key_by(lambda row: row[0]) \ + .flat_map(CountWindowAverage()) \ + .print() + +env.execute() + +# the printed output will be (1,4) and (1,5) +``` +{{< /tab >}} +{{< /tabs >}} + +这个例子实现了一个简单的计数窗口。 我们把元组的第一个元素当作 key(在示例中都 key 都是 "1")。 该函数将出现的次数以及总和存储在 "ValueState" 中。 +一旦出现次数达到 2,则将平均值发送到下游,并清除状态重新开始。 请注意,我们会为每个不同的 key(元组中第一个元素)保存一个单独的值。 + +### 状态有效期 (TTL) + +任何类型的 keyed state 都可以有 *有效期* (TTL)。如果配置了 TTL 且状态值已过期,则会尽最大可能清除对应的值,这会在后面详述。 + +所有状态类型都支持单元素的 TTL。 这意味着列表元素和映射元素将独立到期。 + +在使用状态 TTL 前,需要先构建一个`StateTtlConfig` 配置对象。 然后把配置传递到 state descriptor 中启用 TTL 功能: + +{{< tabs "b1c41e38-ec86-4c56-a6f6-de5c5817bd6c" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.state.StateTtlConfig; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.common.time.Time; + +StateTtlConfig ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite) + .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired) + .build(); + +ValueStateDescriptor stateDescriptor = new ValueStateDescriptor<>("text state", String.class); +stateDescriptor.enableTimeToLive(ttlConfig); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.StateTtlConfig +import org.apache.flink.api.common.state.ValueStateDescriptor +import org.apache.flink.api.common.time.Time + +val ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite) + .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired) + .build + +val stateDescriptor = new ValueStateDescriptor[String]("text state", classOf[String]) +stateDescriptor.enableTimeToLive(ttlConfig) +``` +{{< /tab >}} +{{< /tabs >}} + +TTL 配置有以下几个选项: +`newBuilder` 的第一个参数表示数据的有效期,是必选项。 + +TTL 的更新策略(默认是 `OnCreateAndWrite`): + + - `StateTtlConfig.UpdateType.OnCreateAndWrite` - 仅在创建和写入时更新 + - `StateTtlConfig.UpdateType.OnReadAndWrite` - 读取时也更新 + +数据在过期但还未被清理时的可见性配置如下(默认为 `NeverReturnExpired`): + + - `StateTtlConfig.StateVisibility.NeverReturnExpired` - 不返回过期数据 + - `StateTtlConfig.StateVisibility.ReturnExpiredIfNotCleanedUp` - 会返回过期但未清理的数据 + +`NeverReturnExpired` 情况下,过期数据就像不存在一样,不管是否被物理删除。这对于不能访问过期数据的场景下非常有用,比如敏感数据。 +`ReturnExpiredIfNotCleanedUp` 在数据被物理删除前都会返回。 + +**注意:** + +- 状态上次的修改时间会和数据一起保存在 state backend 中,因此开启该特性会增加状态数据的存储。 +Heap state backend 会额外存储一个包括用户状态以及时间戳的 Java 对象,RocksDB state backend 会在每个状态值(list 或者 map 的每个元素)序列化后增加 8 个字节。 + +- 暂时只支持基于 *processing time* 的 TTL。 + +- 尝试从 checkpoint/savepoint 进行恢复时,TTL 的状态(是否开启)必须和之前保持一致,否则会遇到 "StateMigrationException"。 + +- TTL 的配置并不会保存在 checkpoint/savepoint 中,仅对当前 Job 有效。 + +- 当前开启 TTL 的 map state 仅在用户值序列化器支持 null 的情况下,才支持用户值为 null。如果用户值序列化器不支持 null, +可以用 `NullableSerializer` 包装一层。 + +- State TTL 当前在 PyFlink DataStream API 中还不支持。 + +#### 过期数据的清理 + +默认情况下,过期数据会在读取的时候被删除,例如 `ValueState#value`,同时会有后台线程定期清理(如果 StateBackend 支持的话)。可以通过 `StateTtlConfig` 配置关闭后台清理: + + +{{< tabs "99c1d874-3d6d-41d9-b58a-bda678fedc70" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.state.StateTtlConfig; + +StateTtlConfig ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .disableCleanupInBackground() + .build(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.StateTtlConfig + +val ttlConfig = StateTtlConfig + 
.newBuilder(Time.seconds(1)) + .disableCleanupInBackground + .build +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +State TTL 当前在 PyFlink DataStream API 中还不支持。 +``` +{{< /tab >}} +{{< /tabs >}} + +可以按照如下所示配置更细粒度的后台清理策略。当前的实现中 `HeapStateBackend` 依赖增量数据清理,`RocksDBStateBackend` 利用压缩过滤器进行后台清理。 + +#### 全量快照时进行清理 + +另外,你可以启用全量快照时进行清理的策略,这可以减少整个快照的大小。当前实现中不会清理本地的状态,但从上次快照恢复时,不会恢复那些已经删除的过期数据。 +该策略可以通过 `StateTtlConfig` 配置进行配置: + +{{< tabs "77959bcd-25cb-476a-893f-53424a723f0e" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.state.StateTtlConfig; +import org.apache.flink.api.common.time.Time; + +StateTtlConfig ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupFullSnapshot() + .build(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.StateTtlConfig +import org.apache.flink.api.common.time.Time + +val ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupFullSnapshot + .build +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +State TTL 当前在 PyFlink DataStream API 中还不支持。 +``` +{{< /tab >}} +{{< /tabs >}} + +这种策略在 `RocksDBStateBackend` 的增量 checkpoint 模式下无效。 + +**注意:** +- 这种清理方式可以在任何时候通过 `StateTtlConfig` 启用或者关闭,比如在从 savepoint 恢复时。 + +##### 增量数据清理 + +另外可以选择增量式清理状态数据,在状态访问或/和处理时进行。如果某个状态开启了该清理策略,则会在存储后端保留一个所有状态的惰性全局迭代器。 +每次触发增量清理时,从迭代器中选择已经过期的数进行清理。 + +该特性可以通过 `StateTtlConfig` 进行配置: + +{{< tabs "97f3b853-06df-43c6-a4a1-b50c796bdb52" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.state.StateTtlConfig; + StateTtlConfig ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupIncrementally(10, true) + .build(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.StateTtlConfig +val ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupIncrementally(10, true) + .build +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +State TTL 当前在 PyFlink DataStream API 中还不支持。 +``` +{{< /tab >}} +{{< /tabs >}} + +该策略有两个参数。 第一个是每次清理时检查状态的条目数,在每个状态访问时触发。第二个参数表示是否在处理每条记录时触发清理。 +Heap backend 默认会检查 5 条状态,并且关闭在每条记录时触发清理。 + +**注意:** +- 如果没有 state 访问,也没有处理数据,则不会清理过期数据。 +- 增量清理会增加数据处理的耗时。 +- 现在仅 Heap state backend 支持增量清除机制。在 RocksDB state backend 上启用该特性无效。 +- 如果 Heap state backend 使用同步快照方式,则会保存一份所有 key 的拷贝,从而防止并发修改问题,因此会增加内存的使用。但异步快照则没有这个问题。 +- 对已有的作业,这个清理方式可以在任何时候通过 `StateTtlConfig` 启用或禁用该特性,比如从 savepoint 重启后。 + +##### 在 RocksDB 压缩时清理 + +如果使用 RocksDB state backend,则会启用 Flink 为 RocksDB 定制的压缩过滤器。RocksDB 会周期性的对数据进行合并压缩从而减少存储空间。 +Flink 提供的 RocksDB 压缩过滤器会在压缩时过滤掉已经过期的状态数据。 + +该特性可以通过 `StateTtlConfig` 进行配置: + +{{< tabs "1a8a996b-f030-4e0d-9e76-1df6ee3006a1" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.common.state.StateTtlConfig; + +StateTtlConfig ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupInRocksdbCompactFilter(1000) + .build(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.StateTtlConfig + +val ttlConfig = StateTtlConfig + .newBuilder(Time.seconds(1)) + .cleanupInRocksdbCompactFilter(1000) + .build +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +State TTL 当前在 PyFlink DataStream API 中还不支持。 +``` +{{< /tab >}} +{{< /tabs >}} + +Flink 处理一定条数的状态数据后,会使用当前时间戳来检测 RocksDB 中的状态是否已经过期, +你可以通过 `StateTtlConfig.newBuilder(...).cleanupInRocksdbCompactFilter(long queryTimeAfterNumEntries)` 方法指定处理状态的条数。 +时间戳更新的越频繁,状态的清理越及时,但由于压缩会有调用 JNI 的开销,因此会影响整体的压缩性能。 +RocksDB backend 的默认后台清理策略会每处理 1000 条数据进行一次。 + +你还可以通过配置开启 RocksDB 过滤器的 debug 日志: 
+`log4j.logger.org.rocksdb.FlinkCompactionFilter=DEBUG` + +**注意:** +- 压缩时调用 TTL 过滤器会降低速度。TTL 过滤器需要解析上次访问的时间戳,并对每个将参与压缩的状态进行是否过期检查。 +对于集合型状态类型(比如 list 和 map),会对集合中每个元素进行检查。 +- 对于元素序列化后长度不固定的列表状态,TTL 过滤器需要在每次 JNI 调用过程中,额外调用 Flink 的 java 序列化器, +从而确定下一个未过期数据的位置。 +- 对已有的作业,这个清理方式可以在任何时候通过 `StateTtlConfig` 启用或禁用该特性,比如从 savepoint 重启后。 + +### DataStream 状态相关的 Scala API + +除了上面描述的接口之外,Scala API 还在 `KeyedStream` 上对 `map()` 和 `flatMap()` 访问 `ValueState` 提供了一个更便捷的接口。 +用户函数能够通过 `Option` 获取当前 `ValueState` 的值,并且返回即将保存到状态的值。 + +```scala +val stream: DataStream[(String, Int)] = ... + +val counts: DataStream[(String, Int)] = stream + .keyBy(_._1) + .mapWithState((in: (String, Int), count: Option[Int]) => + count match { + case Some(c) => ( (in._1, c), Some(c + in._2) ) + case None => ( (in._1, 0), Some(in._2) ) + }) +``` + +## Operator State + +*Operator State* (or *non-keyed state*) is state that is bound to one +parallel operator instance. The [Kafka Connector]({{< ref "docs/connectors/datastream/kafka" >}}) is a good motivating example for the use of +Operator State in Flink. Each parallel instance of the Kafka consumer maintains +a map of topic partitions and offsets as its Operator State. + +The Operator State interfaces support redistributing state among parallel +operator instances when the parallelism is changed. There are different schemes +for doing this redistribution. + +In a typical stateful Flink Application you don't need operators state. It is +mostly a special type of state that is used in source/sink implementations and +scenarios where you don't have a key by which state can be partitioned. + +**Notes:** Operator state is still not supported in Python DataStream API. + +## Broadcast State + +*Broadcast State* is a special type of *Operator State*. It was introduced to +support use cases where records of one stream need to be broadcasted to all +downstream tasks, where they are used to maintain the same state among all +subtasks. This state can then be accessed while processing records of a second +stream. As an example where broadcast state can emerge as a natural fit, one +can imagine a low-throughput stream containing a set of rules which we want to +evaluate against all elements coming from another stream. Having the above type +of use cases in mind, broadcast state differs from the rest of operator states +in that: + + 1. it has a map format, + 2. it is only available to specific operators that have as inputs a + *broadcasted* stream and a *non-broadcasted* one, and + 3. such an operator can have *multiple broadcast states* with different names. + +**Notes:** Broadcast state is still not supported in Python DataStream API. 
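+
+The following is a minimal, non-authoritative Java sketch of the rules-versus-events pattern described above; the
+`Rule` and `Event` types, their accessors, and the `ruleStream`/`eventStream` variables are hypothetical:
+
+```java
+// descriptor for the broadcast state: a map from rule name to rule
+MapStateDescriptor<String, Rule> ruleStateDescriptor =
+    new MapStateDescriptor<>("rules", String.class, Rule.class);
+
+// broadcast the low-throughput rule stream to all downstream tasks
+BroadcastStream<Rule> ruleBroadcastStream = ruleStream.broadcast(ruleStateDescriptor);
+
+// connect the non-broadcasted event stream with the broadcasted rules
+eventStream
+    .connect(ruleBroadcastStream)
+    .process(new BroadcastProcessFunction<Event, Rule, String>() {
+
+        @Override
+        public void processElement(Event event, ReadOnlyContext ctx, Collector<String> out) throws Exception {
+            // the event side has read-only access to the broadcast state
+            for (Map.Entry<String, Rule> entry : ctx.getBroadcastState(ruleStateDescriptor).immutableEntries()) {
+                if (entry.getValue().matches(event)) {   // matches(...) stands in for user-defined rule logic
+                    out.collect(entry.getKey());
+                }
+            }
+        }
+
+        @Override
+        public void processBroadcastElement(Rule rule, Context ctx, Collector<String> out) throws Exception {
+            // the broadcast side may update the state; every parallel subtask sees the same updates
+            ctx.getBroadcastState(ruleStateDescriptor).put(rule.getName(), rule);
+        }
+    });
+```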
+ +{{< top >}} + +## 使用 Operator State + +用户可以通过实现 `CheckpointedFunction` 接口来使用 operator state。 + +#### CheckpointedFunction + +`CheckpointedFunction` 接口提供了访问 non-keyed state 的方法,需要实现如下两个方法: + +```java +void snapshotState(FunctionSnapshotContext context) throws Exception; + +void initializeState(FunctionInitializationContext context) throws Exception; +``` + +进行 checkpoint 时会调用 `snapshotState()`。 用户自定义函数初始化时会调用 `initializeState()`,初始化包括第一次自定义函数初始化和从之前的 checkpoint 恢复。 +因此 `initializeState()` 不仅是定义不同状态类型初始化的地方,也需要包括状态恢复的逻辑。 + +当前 operator state 以 list 的形式存在。这些状态是一个 *可序列化* 对象的集合 `List`,彼此独立,方便在改变并发后进行状态的重新分派。 +换句话说,这些对象是重新分配 non-keyed state 的最细粒度。根据状态的不同访问方式,有如下几种重新分配的模式: + + - **Even-split redistribution:** 每个算子都保存一个列表形式的状态集合,整个状态由所有的列表拼接而成。当作业恢复或重新分配的时候,整个状态会按照算子的并发度进行均匀分配。 + 比如说,算子 A 的并发读为 1,包含两个元素 `element1` 和 `element2`,当并发读增加为 2 时,`element1` 会被分到并发 0 上,`element2` 则会被分到并发 1 上。 + + - **Union redistribution:** 每个算子保存一个列表形式的状态集合。整个状态由所有的列表拼接而成。当作业恢复或重新分配时,每个算子都将获得所有的状态数据。 + Do not use this feature if your list may have high cardinality. Checkpoint metadata will store an offset to each list entry, which could lead to RPC framesize or out-of-memory errors. + +下面的例子中的 `SinkFunction` 在 `CheckpointedFunction` 中进行数据缓存,然后统一发送到下游,这个例子演示了列表状态数据的 event-split redistribution。 + +{{< tabs "03fecab3-b48b-4d06-86ed-8769708ae7ca" >}} +{{< tab "Java" >}} +```java +public class BufferingSink + implements SinkFunction>, + CheckpointedFunction { + + private final int threshold; + + private transient ListState> checkpointedState; + + private List> bufferedElements; + + public BufferingSink(int threshold) { + this.threshold = threshold; + this.bufferedElements = new ArrayList<>(); + } + + @Override + public void invoke(Tuple2 value, Context contex) throws Exception { + bufferedElements.add(value); + if (bufferedElements.size() == threshold) { + for (Tuple2 element: bufferedElements) { + // send it to the sink + } + bufferedElements.clear(); + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + checkpointedState.clear(); + for (Tuple2 element : bufferedElements) { + checkpointedState.add(element); + } + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + ListStateDescriptor> descriptor = + new ListStateDescriptor<>( + "buffered-elements", + TypeInformation.of(new TypeHint>() {})); + + checkpointedState = context.getOperatorStateStore().getListState(descriptor); + + if (context.isRestored()) { + for (Tuple2 element : checkpointedState.get()) { + bufferedElements.add(element); + } + } + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class BufferingSink(threshold: Int = 0) + extends SinkFunction[(String, Int)] + with CheckpointedFunction { + + @transient + private var checkpointedState: ListState[(String, Int)] = _ + + private val bufferedElements = ListBuffer[(String, Int)]() + + override def invoke(value: (String, Int), context: Context): Unit = { + bufferedElements += value + if (bufferedElements.size == threshold) { + for (element <- bufferedElements) { + // send it to the sink + } + bufferedElements.clear() + } + } + + override def snapshotState(context: FunctionSnapshotContext): Unit = { + checkpointedState.clear() + for (element <- bufferedElements) { + checkpointedState.add(element) + } + } + + override def initializeState(context: FunctionInitializationContext): Unit = { + val descriptor = new ListStateDescriptor[(String, Int)]( + "buffered-elements", + TypeInformation.of(new TypeHint[(String, 
Int)]() {}) + ) + + checkpointedState = context.getOperatorStateStore.getListState(descriptor) + + if(context.isRestored) { + for(element <- checkpointedState.get()) { + bufferedElements += element + } + } + } + +} +``` +{{< /tab >}} +{{< /tabs >}} + +`initializeState` 方法接收一个 `FunctionInitializationContext` 参数,会用来初始化 non-keyed state 的 "容器"。这些容器是一个 `ListState` +用于在 checkpoint 时保存 non-keyed state 对象。 + +注意这些状态是如何初始化的,和 keyed state 类似,`StateDescriptor` 会包括状态名字、以及状态类型相关信息。 + + +{{< tabs "9f372f5f-ad80-4b2c-a318-fcbdb19c7d2a" >}} +{{< tab "Java" >}} +```java +ListStateDescriptor> descriptor = + new ListStateDescriptor<>( + "buffered-elements", + TypeInformation.of(new TypeHint>() {})); + +checkpointedState = context.getOperatorStateStore().getListState(descriptor); +``` + +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val descriptor = new ListStateDescriptor[(String, Long)]( + "buffered-elements", + TypeInformation.of(new TypeHint[(String, Long)]() {}) +) + +checkpointedState = context.getOperatorStateStore.getListState(descriptor) + +``` +{{< /tab >}} +{{< /tabs >}} + +调用不同的获取状态对象的接口,会使用不同的状态分配算法。比如 `getUnionListState(descriptor)` 会使用 union redistribution 算法, +而 `getListState(descriptor)` 则简单的使用 even-split redistribution 算法。 + +当初始化好状态对象后,我们通过 `isRestored()` 方法判断是否从之前的故障中恢复回来,如果该方法返回 `true` 则表示从故障中进行恢复,会执行接下来的恢复逻辑。 + +正如代码所示,`BufferingSink` 中初始化时,恢复回来的 `ListState` 的所有元素会添加到一个局部变量中,供下次 `snapshotState()` 时使用。 +然后清空 `ListState`,再把当前局部变量中的所有元素写入到 checkpoint 中。 + +另外,我们同样可以在 `initializeState()` 方法中使用 `FunctionInitializationContext` 初始化 keyed state。 + +### 带状态的 Source Function + +带状态的数据源比其他的算子需要注意更多东西。为了保证更新状态以及输出的原子性(用于支持 exactly-once 语义),用户需要在发送数据前获取数据源的全局锁。 + +{{< tabs "0d664c7a-c695-4306-b562-e0cb36ae9efa" >}} +{{< tab "Java" >}} +```java +public static class CounterSource + extends RichParallelSourceFunction + implements CheckpointedFunction { + + /** current offset for exactly once semantics */ + private Long offset = 0L; + + /** flag for job cancellation */ + private volatile boolean isRunning = true; + + /** 存储 state 的变量. 
*/ + private ListState state; + + @Override + public void run(SourceContext ctx) { + final Object lock = ctx.getCheckpointLock(); + + while (isRunning) { + // output and state update are atomic + synchronized (lock) { + ctx.collect(offset); + offset += 1; + } + } + } + + @Override + public void cancel() { + isRunning = false; + } + + @Override + public void initializeState(FunctionInitializationContext context) throws Exception { + state = context.getOperatorStateStore().getListState(new ListStateDescriptor<>( + "state", + LongSerializer.INSTANCE)); + + // 从我们已保存的状态中恢复 offset 到内存中,在进行任务恢复的时候也会调用此初始化状态的方法 + for (Long l : state.get()) { + offset = l; + } + } + + @Override + public void snapshotState(FunctionSnapshotContext context) throws Exception { + state.clear(); + state.add(offset); + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +class CounterSource + extends RichParallelSourceFunction[Long] + with CheckpointedFunction { + + @volatile + private var isRunning = true + + private var offset = 0L + private var state: ListState[Long] = _ + + override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { + val lock = ctx.getCheckpointLock + + while (isRunning) { + // output and state update are atomic + lock.synchronized({ + ctx.collect(offset) + + offset += 1 + }) + } + } + + override def cancel(): Unit = isRunning = false + + override def initializeState(context: FunctionInitializationContext): Unit = { + state = context.getOperatorStateStore.getListState( + new ListStateDescriptor[Long]("state", classOf[Long])) + + for (l <- state.get().asScala) { + offset = l + } + } + + override def snapshotState(context: FunctionSnapshotContext): Unit = { + state.clear() + state.add(offset) + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +希望订阅 checkpoint 成功消息的算子,可以参考 `org.apache.flink.api.common.state.CheckpointListener` 接口。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/fault-tolerance/state_backends.md b/docs/content.zh/docs/dev/datastream/fault-tolerance/state_backends.md new file mode 100644 index 0000000000000..6882198173131 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/fault-tolerance/state_backends.md @@ -0,0 +1,60 @@ +--- +title: "State Backends" +weight: 6 +type: docs +aliases: + - /zh/dev/stream/state/state_backends.html +--- + + +# State Backends + +Flink 提供了多种 state backends,它用于指定状态的存储方式和位置。 + +状态可以位于 Java 的堆或堆外内存。取决于你的 state backend,Flink 也可以自己管理应用程序的状态。 +为了让应用程序可以维护非常大的状态,Flink 可以自己管理内存(如果有必要可以溢写到磁盘)。 +默认情况下,所有 Flink Job 会使用配置文件 *flink-conf.yaml* 中指定的 state backend。 + +但是,配置文件中指定的默认 state backend 会被 Job 中指定的 state backend 覆盖,如下所示。 + +关于可用的 state backend 更多详细信息,包括其优点、限制和配置参数等,请参阅[部署和运维]({{< ref "docs/ops/state/state_backends" >}})的相应部分。 + +{{< tabs "03941da4-5c40-4bb8-97ce-dd14c08bb9a9" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +env.setStateBackend(...); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +env.setStateBackend(...) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +env = StreamExecutionEnvironment.get_execution_environment() +env.set_state_backend(...) 
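+# 一个最小示例(注意:这里假设所用的 PyFlink 版本在 pyflink.datastream 中提供 FsStateBackend,
+# 实际可用的 state backend 类请以“部署和运维”文档为准):
+# env.set_state_backend(FsStateBackend("file:///checkpoint-dir"))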
+
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/dev/datastream/java_lambdas.md b/docs/content.zh/docs/dev/datastream/java_lambdas.md
new file mode 100644
index 0000000000000..d04596f08428e
--- /dev/null
+++ b/docs/content.zh/docs/dev/datastream/java_lambdas.md
@@ -0,0 +1,148 @@
+---
+title: "Java Lambda Expressions"
+weight: 301
+type: docs
+bookToc: false
+aliases:
+  - /zh/dev/java_lambdas.html
+---
+
+
+# Java Lambda Expressions
+
+Java 8 introduced several new language features designed for faster and clearer coding. With the most important feature,
+the so-called "Lambda Expressions", it opened the door to functional programming. Lambda expressions allow for implementing and
+passing functions in a straightforward way without having to declare additional (anonymous) classes.
+
+{{< hint info >}}
+Flink supports the usage of lambda expressions for all operators of the Java API, however, whenever a lambda expression uses Java generics you need to declare type information *explicitly*.
+{{< /hint >}}
+
+This document shows how to use lambda expressions and describes current
+limitations. For a general introduction to the Flink API, please refer to the
+[DataStream API overview]({{< ref "docs/dev/datastream/overview" >}}).
+
+### Examples and Limitations
+
+The following example illustrates how to implement a simple, inline `map()` function that squares its input using a lambda expression.
+The types of the input `i` and output parameters of the `map()` function need not be declared as they are inferred by the Java compiler.
+
+```java
+env.fromElements(1, 2, 3)
+// returns the squared i
+.map(i -> i*i)
+.print();
+```
+
+Flink can automatically extract the result type information from the implementation of the method signature `OUT map(IN value)` because `OUT` is not generic but `Integer`.
+
+Unfortunately, functions such as `flatMap()` with a signature `void flatMap(IN value, Collector<OUT> out)` are compiled into `void flatMap(IN value, Collector out)` by the Java compiler. This makes it impossible for Flink to infer the type information for the output type automatically.
+
+Flink will most likely throw an exception similar to the following:
+
+```
+org.apache.flink.api.common.functions.InvalidTypesException: The generic type parameters of 'Collector' are missing.
+    In many cases lambda methods don't provide enough information for automatic type extraction when Java generics are involved.
+    An easy workaround is to use an (anonymous) class instead that implements the 'org.apache.flink.api.common.functions.FlatMapFunction' interface.
+    Otherwise the type has to be specified explicitly using type information.
+```
+
+In this case, the type information needs to be *specified explicitly*, otherwise the output will be treated as type `Object`, which leads to inefficient serialization.
+
+```java
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.util.Collector;
+
+DataSet<Integer> input = env.fromElements(1, 2, 3);
+
+// collector type must be declared
+input.flatMap((Integer number, Collector<String> out) -> {
+    StringBuilder builder = new StringBuilder();
+    for(int i = 0; i < number; i++) {
+        builder.append("a");
+        out.collect(builder.toString());
+    }
+})
+// provide type information explicitly
+.returns(Types.STRING)
+// prints "a", "a", "aa", "a", "aa", "aaa"
+.print();
+```
+
+Similar problems occur when using a `map()` function with a generic return type.
A method signature `Tuple2<Integer, Integer> map(Integer value)` is erased to `Tuple2 map(Integer value)` in the example below.
+
+```java
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.java.tuple.Tuple2;
+
+env.fromElements(1, 2, 3)
+    .map(i -> Tuple2.of(i, i)) // no information about fields of Tuple2
+    .print();
+```
+
+In general, those problems can be solved in multiple ways:
+
+```java
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.api.java.tuple.Tuple2;
+
+// use the explicit ".returns(...)"
+env.fromElements(1, 2, 3)
+    .map(i -> Tuple2.of(i, i))
+    .returns(Types.TUPLE(Types.INT, Types.INT))
+    .print();
+
+// use a class instead
+env.fromElements(1, 2, 3)
+    .map(new MyTuple2Mapper())
+    .print();
+
+public static class MyTuple2Mapper implements MapFunction<Integer, Tuple2<Integer, Integer>> {
+    @Override
+    public Tuple2<Integer, Integer> map(Integer i) {
+        return Tuple2.of(i, i);
+    }
+}
+
+// use an anonymous class instead
+env.fromElements(1, 2, 3)
+    .map(new MapFunction<Integer, Tuple2<Integer, Integer>>() {
+        @Override
+        public Tuple2<Integer, Integer> map(Integer i) {
+            return Tuple2.of(i, i);
+        }
+    })
+    .print();
+
+// or in this example use a tuple subclass instead
+env.fromElements(1, 2, 3)
+    .map(i -> new DoubleTuple(i, i))
+    .print();
+
+public static class DoubleTuple extends Tuple2<Integer, Integer> {
+    public DoubleTuple(int f0, int f1) {
+        this.f0 = f0;
+        this.f1 = f1;
+    }
+}
+```
+
+{{< top >}}
diff --git a/docs/content.zh/docs/dev/datastream/operators/_index.md b/docs/content.zh/docs/dev/datastream/operators/_index.md
new file mode 100644
index 0000000000000..a7a6856130ac8
--- /dev/null
+++ b/docs/content.zh/docs/dev/datastream/operators/_index.md
@@ -0,0 +1,23 @@
+---
+title: 算子
+bookCollapseSection: true
+weight: 7
+---
+
diff --git a/docs/content.zh/docs/dev/datastream/operators/asyncio.md b/docs/content.zh/docs/dev/datastream/operators/asyncio.md
new file mode 100644
index 0000000000000..fc7fb521bb855
--- /dev/null
+++ b/docs/content.zh/docs/dev/datastream/operators/asyncio.md
@@ -0,0 +1,237 @@
+---
+title: "异步 I/O"
+weight: 5
+type: docs
+aliases:
+  - /zh/dev/stream/operators/asyncio.html
+---
+
+
+# 用于外部数据访问的异步 I/O
+
+本文讲解 Flink 用于访问外部数据存储的异步 I/O API。
+对于不熟悉异步或者事件驱动编程的用户,建议先储备一些关于 Future 和事件驱动编程的知识。
+
+提示:这篇文档 [FLIP-12: 异步 I/O 的设计和实现](https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=65870673) 介绍了关于设计和实现异步 I/O 功能的细节。
+
+## 对于异步 I/O 操作的需求
+
+在与外部系统交互(用数据库中的数据扩充流数据)的时候,需要考虑与外部系统的通信延迟对整个流处理应用的影响。
+
+简单地访问外部数据库的数据,比如使用 `MapFunction`,通常意味着**同步**交互:
+`MapFunction` 向数据库发送一个请求然后一直等待,直到收到响应。在许多情况下,等待占据了函数运行的大部分时间。
+
+与数据库异步交互是指一个并行函数实例可以并发地处理多个请求和接收多个响应。这样,函数在等待的时间可以发送其他请求和接收其他响应。至少等待的时间可以被多个请求摊分。大多数情况下,异步交互可以大幅度提高流处理的吞吐量。
+
+{{< img src="/fig/async_io.svg" width="50%" >}}
+
+*注意:*仅仅提高 `MapFunction` 的并行度(parallelism)在有些情况下也可以提升吞吐量,但是这样做通常会导致非常高的资源消耗:更多的并行 `MapFunction` 实例意味着更多的 Task、更多的线程、更多的 Flink 内部网络连接、更多的与数据库的网络连接、更多的缓冲和更多程序内部协调的开销。
+
+
+## 先决条件
+
+如上节所述,正确地实现数据库(或键/值存储)的异步 I/O 交互需要支持异步请求的数据库客户端。许多主流数据库都提供了这样的客户端。
+
+如果没有这样的客户端,可以通过创建多个客户端并使用线程池处理同步调用的方法,将同步客户端转换为有限并发的客户端。然而,这种方法通常比正规的异步客户端效率低。
+
+
+## 异步 I/O API
+
+Flink 的异步 I/O API 允许用户在流处理中使用异步请求客户端。API 处理与数据流的集成,同时还能处理好顺序、事件时间和容错等。
+
+在具备异步数据库客户端的基础上,实现数据流转换操作与数据库的异步 I/O 交互需要以下三部分:
+
+- 实现分发请求的 `AsyncFunction`
+- 获取数据库交互的结果并发送给 `ResultFuture` 的 *回调* 函数
+- 将异步 I/O 操作作为 `DataStream` 的一次转换操作应用于 `DataStream`。
+
+下面是基本的代码模板:
+
+{{< tabs "b9213242-26c1-4416-95c2-076a23777eec" >}}
+{{< tab "Java" >}}
+```java
+// 这个例子使用 Java 8 的 Future 接口(与 Flink 的 Future 相同)实现了异步请求和回调。
+
+/**
+ * 实现 'AsyncFunction' 用于发送请求和设置回调。
+ */
+class AsyncDatabaseRequest extends RichAsyncFunction<String, Tuple2<String, String>> {
+
+    
/** 能够利用回调函数并发发送请求的数据库客户端 */
+    private transient DatabaseClient client;
+
+    @Override
+    public void open(Configuration parameters) throws Exception {
+        client = new DatabaseClient(host, port, credentials);
+    }
+
+    @Override
+    public void close() throws Exception {
+        client.close();
+    }
+
+    @Override
+    public void asyncInvoke(String key, final ResultFuture<Tuple2<String, String>> resultFuture) throws Exception {
+
+        // 发送异步请求,接收 future 结果
+        final Future<String> result = client.query(key);
+
+        // 设置客户端完成请求后要执行的回调函数
+        // 回调函数只是简单地把结果发给 future
+        CompletableFuture.supplyAsync(new Supplier<String>() {
+
+            @Override
+            public String get() {
+                try {
+                    return result.get();
+                } catch (InterruptedException | ExecutionException e) {
+                    // 显式地处理异常。
+                    return null;
+                }
+            }
+        }).thenAccept( (String dbResult) -> {
+            resultFuture.complete(Collections.singleton(new Tuple2<>(key, dbResult)));
+        });
+    }
+}
+
+// 创建初始 DataStream
+DataStream<String> stream = ...;
+
+// 应用异步 I/O 转换操作
+DataStream<Tuple2<String, String>> resultStream =
+    AsyncDataStream.unorderedWait(stream, new AsyncDatabaseRequest(), 1000, TimeUnit.MILLISECONDS, 100);
+
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+/**
+ * 实现 'AsyncFunction' 用于发送请求和设置回调。
+ */
+class AsyncDatabaseRequest extends AsyncFunction[String, (String, String)] {
+
+  /** 能够利用回调函数并发发送请求的数据库客户端 */
+  lazy val client: DatabaseClient = new DatabaseClient(host, port, credentials)
+
+  /** 用于 future 回调的上下文环境 */
+  implicit lazy val executor: ExecutionContext = ExecutionContext.fromExecutor(Executors.directExecutor())
+
+
+  override def asyncInvoke(str: String, resultFuture: ResultFuture[(String, String)]): Unit = {
+
+    // 发送异步请求,接收 future 结果
+    val resultFutureRequested: Future[String] = client.query(str)
+
+    // 设置客户端完成请求后要执行的回调函数
+    // 回调函数只是简单地把结果发给 future
+    resultFutureRequested.onSuccess {
+      case result: String => resultFuture.complete(Iterable((str, result)))
+    }
+  }
+}
+
+// 创建初始 DataStream
+val stream: DataStream[String] = ...
+ +// 应用异步 I/O 转换操作 +val resultStream: DataStream[(String, String)] = + AsyncDataStream.unorderedWait(stream, new AsyncDatabaseRequest(), 1000, TimeUnit.MILLISECONDS, 100) + +``` +{{< /tab >}} +{{< /tabs >}} + +**重要提示**: 第一次调用 `ResultFuture.complete` 后 `ResultFuture` 就完成了。 +后续的 `complete` 调用都将被忽略。 + +下面两个参数控制异步操作: + + - **Timeout**: 超时参数定义了异步请求发出多久后未得到响应即被认定为失败。 它可以防止一直等待得不到响应的请求。 + + - **Capacity**: 容量参数定义了可以同时进行的异步请求数。 + 即使异步 I/O 通常带来更高的吞吐量,执行异步 I/O 操作的算子仍然可能成为流处理的瓶颈。 限制并发请求的数量可以确保算子不会持续累积待处理的请求进而造成积压,而是在容量耗尽时触发反压。 + + +### 超时处理 + +当异步 I/O 请求超时的时候,默认会抛出异常并重启作业。 +如果你想处理超时,可以重写 `AsyncFunction#timeout` 方法。 + +### 结果的顺序 + +`AsyncFunction` 发出的并发请求经常以不确定的顺序完成,这取决于请求得到响应的顺序。 +Flink 提供两种模式控制结果记录以何种顺序发出。 + + - **无序模式**: 异步请求一结束就立刻发出结果记录。 + 流中记录的顺序在经过异步 I/O 算子之后发生了改变。 + 当使用 *处理时间* 作为基本时间特征时,这个模式具有最低的延迟和最少的开销。 + 此模式使用 `AsyncDataStream.unorderedWait(...)` 方法。 + + - **有序模式**: 这种模式保持了流的顺序。发出结果记录的顺序与触发异步请求的顺序(记录输入算子的顺序)相同。为了实现这一点,算子将缓冲一个结果记录直到这条记录前面的所有记录都发出(或超时)。由于记录或者结果要在 checkpoint 的状态中保存更长的时间,所以与无序模式相比,有序模式通常会带来一些额外的延迟和 checkpoint 开销。此模式使用 `AsyncDataStream.orderedWait(...)` 方法。 + + +### 事件时间 + +当流处理应用使用[事件时间]({{< ref "docs/concepts/time" >}})时,异步 I/O 算子会正确处理 watermark。对于两种顺序模式,这意味着以下内容: + + - **无序模式**: Watermark 既不超前于记录也不落后于记录,即 watermark 建立了*顺序的边界*。 + 只有连续两个 watermark 之间的记录是无序发出的。 + 在一个 watermark 后面生成的记录只会在这个 watermark 发出以后才发出。 + 在一个 watermark 之前的所有输入的结果记录全部发出以后,才会发出这个 watermark。 + + 这意味着存在 watermark 的情况下,*无序模式* 会引入一些与*有序模式* 相同的延迟和管理开销。开销大小取决于 watermark 的频率。 + + - **有序模式**: 连续两个 watermark 之间的记录顺序也被保留了。开销与使用*处理时间* 相比,没有显著的差别。 + + +请记住,*摄入时间* 是一种特殊的*事件时间*,它基于数据源的处理时间自动生成 watermark。 + + +### 容错保证 + +异步 I/O 算子提供了完全的精确一次容错保证。它将在途的异步请求的记录保存在 checkpoint 中,在故障恢复时重新触发请求。 + + +### 实现提示 + +在实现使用 *Executor*(或者 Scala 中的 *ExecutionContext*)和回调的 *Futures* 时,建议使用 `DirectExecutor`,因为通常回调的工作量很小,`DirectExecutor` 避免了额外的线程切换开销。回调通常只是把结果发送给 `ResultFuture`,也就是把它添加进输出缓冲。从这里开始,包括发送记录和与 chenkpoint 交互在内的繁重逻辑都将在专有的线程池中进行处理。 + +`DirectExecutor` 可以通过 `org.apache.flink.runtime.concurrent.Executors.directExecutor()` 或 +`com.google.common.util.concurrent.MoreExecutors.directExecutor()` 获得。 + + +### 警告 + +**Flink 不以多线程方式调用 AsyncFunction** + +我们想在这里明确指出一个经常混淆的地方:`AsyncFunction` 不是以多线程方式调用的。 +只有一个 `AsyncFunction` 实例,它被流中相应分区内的每个记录顺序地调用。除非 `asyncInvoke(...)` 方法快速返回并且依赖于(客户端的)回调, 否则无法实现正确的异步 I/O。 + +例如,以下情况导致阻塞的 `asyncInvoke(...)` 函数,从而使异步行为无效: + + - 使用同步数据库客户端,它的查询方法调用在返回结果前一直被阻塞。 + - 在 `asyncInvoke(...)` 方法内阻塞等待异步客户端返回的 future 类型对象 + +**目前,出于一致性的原因,AsyncFunction 的算子(异步等待算子)必须位于算子链的头部** + +根据 `FLINK-13063` 给出的原因,目前我们必须断开异步等待算子的算子链以防止潜在的一致性问题。这改变了先前支持的算子链的行为。需要旧有行为并接受可能违反一致性保证的用户可以实例化并手工将异步等待算子添加到作业图中并将链策略设置回通过异步等待算子的 `ChainingStrategy.ALWAYS` 方法进行链接。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/operators/joining.md b/docs/content.zh/docs/dev/datastream/operators/joining.md new file mode 100644 index 0000000000000..4889e65d59ae9 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/operators/joining.md @@ -0,0 +1,290 @@ +--- +title: "Joining" +weight: 3 +type: docs +aliases: + - /zh/dev/stream/operators/joining.html +--- + + +# Joining + +## Window Join + +A window join joins the elements of two streams that share a common key and lie in the same window. These windows can be defined by using a [window assigner]({{< ref "docs/dev/datastream/operators/windows" >}}#window-assigners) and are evaluated on elements from both of the streams. + +The elements from both sides are then passed to a user-defined `JoinFunction` or `FlatJoinFunction` where the user can emit results that meet the join criteria. 
+ +The general usage can be summarized as follows: + +```java +stream.join(otherStream) + .where() + .equalTo() + .window() + .apply() +``` + +Some notes on semantics: +- The creation of pairwise combinations of elements of the two streams behaves like an inner-join, meaning elements from one stream will not be emitted if they don't have a corresponding element from the other stream to be joined with. +- Those elements that do get joined will have as their timestamp the largest timestamp that still lies in the respective window. For example a window with `[5, 10)` as its boundaries would result in the joined elements having 9 as their timestamp. + +In the following section we are going to give an overview over how different kinds of window joins behave using some exemplary scenarios. + +### Tumbling Window Join + +When performing a tumbling window join, all elements with a common key and a common tumbling window are joined as pairwise combinations and passed on to a `JoinFunction` or `FlatJoinFunction`. Because this behaves like an inner join, elements of one stream that do not have elements from another stream in their tumbling window are not emitted! + +{{< img src="/fig/tumbling-window-join.svg" width="80%" >}} + +As illustrated in the figure, we define a tumbling window with the size of 2 milliseconds, which results in windows of the form `[0,1], [2,3], ...`. The image shows the pairwise combinations of all elements in each window which will be passed on to the `JoinFunction`. Note that in the tumbling window `[6,7]` nothing is emitted because no elements exist in the green stream to be joined with the orange elements ⑥ and ⑦. + +{{< tabs "a8e08868-40d6-4719-b554-e2cabf2e1f6f" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +DataStream orangeStream = ... +DataStream greenStream = ... + +orangeStream.join(greenStream) + .where() + .equalTo() + .window(TumblingEventTimeWindows.of(Time.milliseconds(2))) + .apply (new JoinFunction (){ + @Override + public String join(Integer first, Integer second) { + return first + "," + second; + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +val orangeStream: DataStream[Integer] = ... +val greenStream: DataStream[Integer] = ... + +orangeStream.join(greenStream) + .where(elem => /* select key */) + .equalTo(elem => /* select key */) + .window(TumblingEventTimeWindows.of(Time.milliseconds(2))) + .apply { (e1, e2) => e1 + "," + e2 } +``` + +{{< /tab >}} +{{< /tabs >}} + +### Sliding Window Join + +When performing a sliding window join, all elements with a common key and common sliding window are joined as pairwise combinations and passed on to the `JoinFunction` or `FlatJoinFunction`. Elements of one stream that do not have elements from the other stream in the current sliding window are not emitted! Note that some elements might be joined in one sliding window but not in another! + +{{< img src="/fig/sliding-window-join.svg" width="80%" >}} + +In this example we are using sliding windows with a size of two milliseconds and slide them by one millisecond, resulting in the sliding windows `[-1, 0],[0,1],[1,2],[2,3], …`. 
The joined elements below the x-axis are the ones that are passed to the `JoinFunction` for each sliding window. Here you can also see how for example the orange ② is joined with the green ③ in the window `[2,3]`, but is not joined with anything in the window `[1,2]`. + +{{< tabs "a3d3218b-dd25-4428-bfbb-d02522d95661" >}} +{{< tab "Java" >}} + +```java +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +DataStream orangeStream = ... +DataStream greenStream = ... + +orangeStream.join(greenStream) + .where() + .equalTo() + .window(SlidingEventTimeWindows.of(Time.milliseconds(2) /* size */, Time.milliseconds(1) /* slide */)) + .apply (new JoinFunction (){ + @Override + public String join(Integer first, Integer second) { + return first + "," + second; + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +val orangeStream: DataStream[Integer] = ... +val greenStream: DataStream[Integer] = ... + +orangeStream.join(greenStream) + .where(elem => /* select key */) + .equalTo(elem => /* select key */) + .window(SlidingEventTimeWindows.of(Time.milliseconds(2) /* size */, Time.milliseconds(1) /* slide */)) + .apply { (e1, e2) => e1 + "," + e2 } +``` +{{< /tab >}} +{{< /tabs >}} + +### Session Window Join + +When performing a session window join, all elements with the same key that when _"combined"_ fulfill the session criteria are joined in pairwise combinations and passed on to the `JoinFunction` or `FlatJoinFunction`. Again this performs an inner join, so if there is a session window that only contains elements from one stream, no output will be emitted! + +{{< img src="/fig/session-window-join.svg" width="80%" >}} + +Here we define a session window join where each session is divided by a gap of at least 1ms. There are three sessions, and in the first two sessions the joined elements from both streams are passed to the `JoinFunction`. In the third session there are no elements in the green stream, so ⑧ and ⑨ are not joined! + +{{< tabs "0e75f447-e1f7-4f38-b68c-de42ddd33512" >}} +{{< tab "Java" >}} + +```java +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +DataStream orangeStream = ... +DataStream greenStream = ... + +orangeStream.join(greenStream) + .where() + .equalTo() + .window(EventTimeSessionWindows.withGap(Time.milliseconds(1))) + .apply (new JoinFunction (){ + @Override + public String join(Integer first, Integer second) { + return first + "," + second; + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} + +```scala +import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows; +import org.apache.flink.streaming.api.windowing.time.Time; + +... + +val orangeStream: DataStream[Integer] = ... +val greenStream: DataStream[Integer] = ... 
+
+orangeStream.join(greenStream)
+    .where(elem => /* select key */)
+    .equalTo(elem => /* select key */)
+    .window(EventTimeSessionWindows.withGap(Time.milliseconds(1)))
+    .apply { (e1, e2) => e1 + "," + e2 }
+```
+
+{{< /tab >}}
+{{< /tabs >}}
+
+## Interval Join
+
+The interval join joins elements of two streams (we'll call them A & B for now) that share a common key and where elements of stream B have timestamps that lie in a relative time interval to timestamps of elements in stream A.
+
+This can also be expressed more formally as
+`b.timestamp ∈ [a.timestamp + lowerBound; a.timestamp + upperBound]` or
+`a.timestamp + lowerBound <= b.timestamp <= a.timestamp + upperBound`
+
+where a and b are elements of A and B that share a common key. Both the lower and upper bound can be either negative or positive as long as the lower bound is always smaller than or equal to the upper bound. The interval join currently only performs inner joins.
+
+When a pair of elements is passed to the `ProcessJoinFunction`, it will be assigned the larger timestamp of the two elements, which can be accessed via the `ProcessJoinFunction.Context`.
+
+{{< hint info >}}
+The interval join currently only supports event time.
+{{< /hint >}}
+
+{{< img src="/fig/interval-join.svg" width="80%" >}}
+
+In the example above, we join two streams 'orange' and 'green' with a lower bound of -2 milliseconds and an upper bound of +1 millisecond. By default, these boundaries are inclusive, but `.lowerBoundExclusive()` and `.upperBoundExclusive()` can be applied to change the behaviour.
+
+Using the more formal notation again this will translate to
+
+`orangeElem.ts + lowerBound <= greenElem.ts <= orangeElem.ts + upperBound`
+
+as indicated by the triangles.
+
+{{< tabs "63cebeb2-5869-4d2e-998d-d77fb466e2e6" >}}
+{{< tab "Java" >}}
+
+```java
+import org.apache.flink.api.java.functions.KeySelector;
+import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
+import org.apache.flink.streaming.api.windowing.time.Time;
+
+...
+
+DataStream<Integer> orangeStream = ...
+DataStream<Integer> greenStream = ...
+
+orangeStream
+    .keyBy(<KeySelector>)
+    .intervalJoin(greenStream.keyBy(<KeySelector>))
+    .between(Time.milliseconds(-2), Time.milliseconds(1))
+    .process (new ProcessJoinFunction<Integer, Integer, String>() {
+
+        @Override
+        public void processElement(Integer first, Integer second, Context ctx, Collector<String> out) {
+            out.collect(first + "," + second);
+        }
+    });
+```
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
+import org.apache.flink.streaming.api.windowing.time.Time;
+
+...
+
+val orangeStream: DataStream[Integer] = ...
+val greenStream: DataStream[Integer] = ...
+ +orangeStream + .keyBy(elem => /* select key */) + .intervalJoin(greenStream.keyBy(elem => /* select key */)) + .between(Time.milliseconds(-2), Time.milliseconds(1)) + .process(new ProcessJoinFunction[Integer, Integer, String] { + override def processElement(left: Integer, right: Integer, ctx: ProcessJoinFunction[Integer, Integer, String]#Context, out: Collector[String]): Unit = { + out.collect(left + "," + right); + } + }); + }); +``` + +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/operators/overview.md b/docs/content.zh/docs/dev/datastream/operators/overview.md new file mode 100644 index 0000000000000..c26ac984feb5e --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/operators/overview.md @@ -0,0 +1,757 @@ +--- +title: 概览 +weight: 1 +type: docs +aliases: + - /zh/dev/stream/operators/ +--- + + +# 算子 + +用户通过算子能将一个或多个 DataStream 转换成新的 DataStream,在应用程序中可以将多个数据转换算子合并成一个复杂的数据流拓扑。 + +这部分内容将描述 Flink DataStream API 中基本的数据转换API,数据转换后各种数据分区方式,以及算子的链接策略。 + +## 数据流转换 + +### Map +#### DataStream → DataStream + +Takes one element and produces one element. A map function that doubles the values of the input stream: + +{{< tabs mapfunc >}} +{{< tab "Java">}} +```java +DataStream dataStream = //... +dataStream.map(new MapFunction() { + @Override + public Integer map(Integer value) throws Exception { + return 2 * value; + } +}); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream.map { x => x * 2 } +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=[1, 2, 3, 4, 5]) +data_stream.map(lambda x: 2 * x, output_type=Types.INT()) +``` +{{< /tab >}} +{{< /tabs>}} + +### FlatMap +#### DataStream → DataStream + +Takes one element and produces zero, one, or more elements. A flatmap function that splits sentences to words: + +{{< tabs flatmapfunc >}} +{{< tab "Java">}} +```java +dataStream.flatMap(new FlatMapFunction() { + @Override + public void flatMap(String value, Collector out) + throws Exception { + for(String word: value.split(" ")){ + out.collect(word); + } + } +}); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream.flatMap { str => str.split(" ") } +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=['hello apache flink', 'streaming compute']) +data_stream.flat_map(lambda x: x.split(' '), output_type=Types.STRING()) +``` +{{< /tab >}} +{{< /tabs>}} + +### Filter +#### DataStream → DataStream + +Evaluates a boolean function for each element and retains those for which the function returns true. A filter that filters out zero values: + +{{< tabs filterfunc >}} +{{< tab "Java">}} +```java +dataStream.filter(new FilterFunction() { + @Override + public boolean filter(Integer value) throws Exception { + return value != 0; + } +}); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream.filter { _ != 0 } +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=[0, 1, 2, 3, 4, 5]) +data_stream.filter(lambda x: x != 0) +``` +{{< /tab >}} +{{< /tabs>}} + +### KeyBy +#### DataStream → KeyedStream + +Logically partitions a stream into disjoint partitions. All records with the same key are assigned to the same partition. Internally, _keyBy()_ is implemented with hash partitioning. There are different ways to [specify keys]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}#keyed-datastream). 
+ +{{< tabs keybyfunc >}} +{{< tab "Java">}} +```java +dataStream.keyBy(value -> value.getSomeKey()); +dataStream.keyBy(value -> value.f0); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream.keyBy(_.someKey) +dataStream.keyBy(_._1) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=[(1, 'a'), (2, 'a'), (3, 'b')]) +data_stream.key_by(lambda x: x[1], key_type=Types.STRING()) // Key by the result of KeySelector +``` +{{< /tab >}} +{{< /tabs>}} + +{{< hint warning >}} +A type **cannot be a key if**: + +1. it is a POJO type but does not override the `hashCode()` method and relies on the `Object.hashCode()` implementation. +2. it is an array of any type. +{{< /hint >}} + +### Reduce +#### KeyedStream → DataStream + +A "rolling" reduce on a keyed data stream. Combines the current element with the last reduced value and emits the new value. + +A reduce function that creates a stream of partial sums: + +{{< tabs globalreduce >}} +{{< tab "Java">}} +```java +keyedStream.reduce(new ReduceFunction() { + @Override + public Integer reduce(Integer value1, Integer value2) + throws Exception { + return value1 + value2; + } +}); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +keyedStream.reduce { _ + _ } +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=[(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')], type_info=Types.TUPLE([Types.INT(), Types.STRING()])) +data_stream.key_by(lambda x: x[1]).reduce(lambda a, b: (a[0] + b[0], b[1])) +``` +{{< /tab >}} +{{< /tabs>}} + +### Window +#### KeyedStream → WindowedStream + +Windows can be defined on already partitioned KeyedStreams. Windows group the data in each key according to some characteristic (e.g., the data that arrived within the last 5 seconds). +See [windows](windows.html) for a complete description of windows. + +{{< tabs window >}} +{{< tab "Java">}} +```java +dataStream + .keyBy(value -> value.f0) + .window(TumblingEventTimeWindows.of(Time.seconds(5))); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream + .keyBy(_._1) + .window(TumblingEventTimeWindows.of(Time.seconds(5))) +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### WindowAll +#### DataStreamStream → AllWindowedStream + +Windows can be defined on regular DataStreams. Windows group all the stream events according to some characteristic (e.g., the data that arrived within the last 5 seconds). See [windows](windows.html) for a complete description of windows. + +{{< hint warning >}} +This is in many cases a non-parallel transformation. All records will be gathered in one task for the windowAll operator. +{{< /hint >}} + +{{< tabs windowAll >}} +{{< tab "Java">}} +```java +dataStream + .windowAll(TumblingEventTimeWindows.of(Time.seconds(5))); +``` +{{< /tab >}} +{{< tab "Scala">}} +```scala +dataStream + .windowAll(TumblingEventTimeWindows.of(Time.seconds(5))) +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### Window Apply +#### WindowedStream → DataStream +#### AllWindowedStream → DataStream + +Applies a general function to the window as a whole. Below is a function that manually sums the elements of a window. + +{{< hint info >}} +If you are using a windowAll transformation, you need to use an `AllWindowFunction` instead. 
+{{< /hint >}} + +{{< tabs windowapply >}} +{{< tab "Java">}} +```java +windowedStream.apply(new WindowFunction, Integer, Tuple, Window>() { + public void apply (Tuple tuple, + Window window, + Iterable> values, + Collector out) throws Exception { + int sum = 0; + for (value t: values) { + sum += t.f1; + } + out.collect (new Integer(sum)); + } +}); + +// applying an AllWindowFunction on non-keyed window stream +allWindowedStream.apply (new AllWindowFunction, Integer, Window>() { + public void apply (Window window, + Iterable> values, + Collector out) throws Exception { + int sum = 0; + for (value t: values) { + sum += t.f1; + } + out.collect (new Integer(sum)); + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +windowedStream.apply { WindowFunction } + +// applying an AllWindowFunction on non-keyed window stream +allWindowedStream.apply { AllWindowFunction } +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### WindowReduce +#### WindowedStream → DataStream + +Applies a functional reduce function to the window and returns the reduced value. + +{{< tabs windowreduce >}} +{{< tab "Java" >}} +```java +windowedStream.reduce (new ReduceFunction>() { + public Tuple2 reduce(Tuple2 value1, Tuple2 value2) throws Exception { + return new Tuple2(value1.f0, value1.f1 + value2.f1); + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +windowedStream.reduce { _ + _ } +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### Union +#### DataStream\* → DataStream + +Union of two or more data streams creating a new stream containing all the elements from all the streams. Note: If you union a data stream with itself you will get each element twice in the resulting stream. + +{{< tabs union >}} +{{< tab "Java" >}} +```java +dataStream.union(otherStream1, otherStream2, ...); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.union(otherStream1, otherStream2, ...); +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream.union(otherStream1, otherStream2, ...) +``` +{{< /tab >}} +{{< /tabs>}} + +### Window Join +#### DataStream,DataStream → DataStream + +Join two data streams on a given key and a common window. + +{{< tabs windowjoin >}} +{{< tab "Java" >}} +```java +dataStream.join(otherStream) + .where().equalTo() + .window(TumblingEventTimeWindows.of(Time.seconds(3))) + .apply (new JoinFunction () {...}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.join(otherStream) + .where().equalTo() + .window(TumblingEventTimeWindows.of(Time.seconds(3))) + .apply { ... } +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### Interval Join +#### KeyedStream,KeyedStream → DataStream + +Join two elements e1 and e2 of two keyed streams with a common key over a given time interval, so that `e1.timestamp + lowerBound <= e2.timestamp <= e1.timestamp + upperBound`. 
+ +{{< tabs intervaljoin >}} +{{< tab "Java" >}} +```java +// this will join the two streams so that +// key1 == key2 && leftTs - 2 < rightTs < leftTs + 2 +keyedStream.intervalJoin(otherKeyedStream) + .between(Time.milliseconds(-2), Time.milliseconds(2)) // lower and upper bound + .upperBoundExclusive(true) // optional + .lowerBoundExclusive(true) // optional + .process(new IntervalJoinFunction() {...}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// this will join the two streams so that +// key1 == key2 && leftTs - 2 < rightTs < leftTs + 2 +keyedStream.intervalJoin(otherKeyedStream) + .between(Time.milliseconds(-2), Time.milliseconds(2)) + // lower and upper bound + .upperBoundExclusive(true) // optional + .lowerBoundExclusive(true) // optional + .process(new IntervalJoinFunction() {...}) +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### Window CoGroup +#### DataStream,DataStream → DataStream + +Cogroups two data streams on a given key and a common window. + +{{< tabs windowcogroup >}} +{{< tab "Java" >}} +```java +dataStream.coGroup(otherStream) + .where(0).equalTo(1) + .window(TumblingEventTimeWindows.of(Time.seconds(3))) + .apply (new CoGroupFunction () {...}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.coGroup(otherStream) + .where(0).equalTo(1) + .window(TumblingEventTimeWindows.of(Time.seconds(3))) + .apply {} +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +### Connect +#### DataStream,DataStream → ConnectedStream + +"Connects" two data streams retaining their types. Connect allowing for shared state between the two streams. + +{{< tabs connect >}} +{{< tab "Java" >}} +```java +DataStream someStream = //... +DataStream otherStream = //... + +ConnectedStreams connectedStreams = someStream.connect(otherStream); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +someStream : DataStream[Int] = ... +otherStream : DataStream[String] = ... + +val connectedStreams = someStream.connect(otherStream) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +stream_1 = ... +stream_2 = ... 
+connected_streams = stream_1.connect(stream_2) +``` +{{< /tab >}} +{{< /tabs>}} + +### CoMap, CoFlatMap +#### ConnectedStream → DataStream + +Similar to map and flatMap on a connected data stream + +{{< tabs comap >}} +{{< tab "Java" >}} +```java +connectedStreams.map(new CoMapFunction() { + @Override + public Boolean map1(Integer value) { + return true; + } + + @Override + public Boolean map2(String value) { + return false; + } +}); +connectedStreams.flatMap(new CoFlatMapFunction() { + + @Override + public void flatMap1(Integer value, Collector out) { + out.collect(value.toString()); + } + + @Override + public void flatMap2(String value, Collector out) { + for (String word: value.split(" ")) { + out.collect(word); + } + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +connectedStreams.map( + (_ : Int) => true, + (_ : String) => false +) +connectedStreams.flatMap( + (_ : Int) => true, + (_ : String) => false +) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +class MyCoMapFunction(CoMapFunction): + + def map1(self, value): + return value[0] + 1, value[1] + + def map2(self, value): + return value[0], value[1] + 'flink' + +class MyCoFlatMapFunction(CoFlatMapFunction): + + def flat_map1(self, value) + for i in range(value[0]): + yield i + + def flat_map2(self, value): + yield value[0] + 1 + +connectedStreams.map(MyCoMapFunction()) +connectedStreams.flat_map(MyCoFlatMapFunction()) +``` +{{< /tab >}} +{{< /tabs>}} + +### Iterate +#### DataStream → IterativeStream → ConnectedStream + +Creates a "feedback" loop in the flow, by redirecting the output of one operator to some previous operator. This is especially useful for defining algorithms that continuously update a model. The following code starts with a stream and applies the iteration body continuously. Elements that are greater than 0 are sent back to the feedback channel, and the rest of the elements are forwarded downstream. + +{{< tabs iterate >}} +{{< tab "Java" >}} +```java +IterativeStream iteration = initialStream.iterate(); +DataStream iterationBody = iteration.map (/*do something*/); +DataStream feedback = iterationBody.filter(new FilterFunction(){ + @Override + public boolean filter(Long value) throws Exception { + return value > 0; + } +}); +iteration.closeWith(feedback); +DataStream output = iterationBody.filter(new FilterFunction(){ + @Override + public boolean filter(Long value) throws Exception { + return value <= 0; + } +}); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +initialStream.iterate { + iteration => { + val iterationBody = iteration.map {/*do something*/} + (iterationBody.filter(_ > 0), iterationBody.filter(_ <= 0)) + } +} +``` +{{< /tab >}} +{{< tab "Python" >}} +This feature is not yet supported in Python +{{< /tab >}} +{{< /tabs>}} + +## 物理分区 + +Flink 也提供以下方法让用户根据需要在数据转换完成后对数据分区进行更细粒度的配置。 + +### Custom Partitioning +#### DataStream → DataStream + +Uses a user-defined Partitioner to select the target task for each element. 
+ +{{< tabs custompartitioning >}} +{{< tab "Java" >}} +```java +dataStream.partitionCustom(partitioner, "someKey"); +dataStream.partitionCustom(partitioner, 0); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.partitionCustom(partitioner, "someKey") +dataStream.partitionCustom(partitioner, 0) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream = env.from_collection(collection=[(2, 'a'), (2, 'a'), (3, 'b')]) +data_stream.partition_custom(lambda key, num_partition: key % partition, lambda x: x[0]) +``` +{{< /tab >}} +{{< /tabs>}} + +### Random Partitioning +#### DataStream → DataStream + +Partitions elements randomly according to a uniform distribution. + +{{< tabs shuffle >}} +{{< tab "Java" >}} +```java +dataStream.shuffle(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.shuffle() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream.shuffle() +``` +{{< /tab >}} +{{< /tabs>}} + + +### Rescaling +#### DataStream → DataStream + +Partitions elements, round-robin, to a subset of downstream operations. This is useful if you want to have pipelines where you, for example, fan out from each parallel instance of a source to a subset of several mappers to distribute load but don't want the full rebalance that rebalance() would incur. This would require only local data transfers instead of transferring data over network, depending on other configuration values such as the number of slots of TaskManagers. + +The subset of downstream operations to which the upstream operation sends elements depends on the degree of parallelism of both the upstream and downstream operation. For example, if the upstream operation has parallelism 2 and the downstream operation has parallelism 6, then one upstream operation would distribute elements to three downstream operations while the other upstream operation would distribute to the other three downstream operations. If, on the other hand, the downstream operation has parallelism 2 while the upstream operation has parallelism 6 then three upstream operations would distribute to one downstream operation while the other three upstream operations would distribute to the other downstream operation. + +In cases where the different parallelisms are not multiples of each other one or several downstream operations will have a differing number of inputs from upstream operations. + +Please see this figure for a visualization of the connection pattern in the above example: + +{{< img src="/fig/rescale.svg" alt="Checkpoint barriers in data streams" >}} + +{{< tabs rescale >}} +{{< tab "Java" >}} +```java +dataStream.rescale(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.rescale() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream.rescale() +``` +{{< /tab >}} +{{< /tabs>}} + +### Broadcasting +#### DataStream → DataStream + +Broadcasts elements to every partition. 
+ +{{< tabs broadcast >}} +{{< tab "Java" >}} +```java +dataStream.broadcast(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +dataStream.broadcast() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +data_stream.broadcast() +``` +{{< /tab >}} +{{< /tabs>}} + +## 算子链和资源组 + +将两个算子链接在一起能使得它们在同一个线程中执行,从而提升性能。Flink 默认会将能链接的算子尽可能地进行链接(例如, 两个 map 转换操作)。此外, Flink 还提供了对链接更细粒度控制的 API 以满足更多需求: + +如果想对整个作业禁用算子链,可以调用 `StreamExecutionEnvironment.disableOperatorChaining()`。下列方法还提供了更细粒度的控制。需要注 意的是, 这些方法只能在 `DataStream` 转换操作后才能被调用,因为它们只对前一次数据转换生效。例如,可以 `someStream.map(...).startNewChain()` 这样调用,而不能 someStream.startNewChain()这样。 + +一个资源组对应着 Flink 中的一个 slot 槽,更多细节请看slots 槽。 你可以根据需要手动地将各个算子隔离到不同的 slot 中。 + +### Start New Chain + +Begin a new chain, starting with this operator. +The two mappers will be chained, and filter will not be chained to the first mapper. + +{{< tabs startnewchain >}} +{{< tab "Java" >}} +```java +someStream.filter(...).map(...).startNewChain().map(...); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +someStream.filter(...).map(...).startNewChain().map(...) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +some_stream.filter(...).map(...).start_new_chain().map(...) +``` +{{< /tab >}} +{{< /tabs>}} + +### Disable Chaining + +Do not chain the map operator. + +{{< tabs disablechaining >}} +{{< tab "Java" >}} +```java +someStream.map(...).disableChaining(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +someStream.map(...).disableChaining() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +some_stream.map(...).disable_chaining() +``` +{{< /tab >}} +{{< /tabs>}} + +### Set Slot Sharing Group + +Set the slot sharing group of an operation. Flink will put operations with the same slot sharing group into the same slot while keeping operations that don't have the slot sharing group in other slots. This can be used to isolate slots. The slot sharing group is inherited from input operations if all input operations are in the same slot sharing group. The name of the default slot sharing group is "default", operations can explicitly be put into this group by calling slotSharingGroup("default"). + +{{< tabs slotsharing >}} +{{< tab "Java" >}} +```java +someStream.filter(...).slotSharingGroup("name"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +someStream.filter(...).slotSharingGroup("name") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +some_stream.filter(...).slot_sharing_group("name") +``` +{{< /tab >}} +{{< /tabs>}} diff --git a/docs/content.zh/docs/dev/datastream/operators/process_function.md b/docs/content.zh/docs/dev/datastream/operators/process_function.md new file mode 100644 index 0000000000000..38426b50dee85 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/operators/process_function.md @@ -0,0 +1,509 @@ +--- +title: "Process Function" +weight: 4 +type: docs +aliases: + - /zh/dev/stream/operators/process_function.html +--- + + +# Process Function + +## The ProcessFunction + +The `ProcessFunction` is a low-level stream processing operation, giving access to the basic building blocks of +all (acyclic) streaming applications: + + - events (stream elements) + - state (fault-tolerant, consistent, only on keyed stream) + - timers (event time and processing time, only on keyed stream) + +The `ProcessFunction` can be thought of as a `FlatMapFunction` with access to keyed state and timers. It handles events +by being invoked for each event received in the input stream(s). 
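+
+As a minimal sketch of this shape (the class name and the 60-second timer below are illustrative assumptions, not part of Flink), such a function overrides `processElement(...)` for the per-record logic and `onTimer(...)` for timer callbacks; a complete, runnable example follows further below:
+
+```java
+import org.apache.flink.streaming.api.functions.ProcessFunction;
+import org.apache.flink.util.Collector;
+
+public class MyProcessFunction extends ProcessFunction<String, String> {
+
+    @Override
+    public void processElement(String value, Context ctx, Collector<String> out) throws Exception {
+        // like a FlatMapFunction: emit zero, one, or more records per input element
+        out.collect(value);
+        // additionally: read the element's timestamp and register a timer
+        // (timers and keyed state require the function to be applied on a keyed stream, see below)
+        ctx.timerService().registerEventTimeTimer(ctx.timestamp() + 60_000);
+    }
+
+    @Override
+    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
+        // called when a registered timer fires; state is again scoped to the timer's key
+    }
+}
+```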
+ +For fault-tolerant state, the `ProcessFunction` gives access to Flink's [keyed state]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}), accessible via the +`RuntimeContext`, similar to the way other stateful functions can access keyed state. + +The timers allow applications to react to changes in processing time and in [event time]({{< ref "docs/concepts/time" >}}). +Every call to the function `processElement(...)` gets a `Context` object which gives access to the element's +event time timestamp, and to the *TimerService*. The `TimerService` can be used to register callbacks for future +event-/processing-time instants. With event-time timers, the `onTimer(...)` method is called when the current watermark is advanced up to or beyond the timestamp of the timer, while with processing-time timers, `onTimer(...)` is called when wall clock time reaches the specified time. During that call, all states are again scoped to the key with which the timer was created, allowing +timers to manipulate keyed state. + +{{< hint info >}} +If you want to access keyed state and timers you have +to apply the `ProcessFunction` on a keyed stream: +{{< /hint >}} + +```java +stream.keyBy(...).process(new MyProcessFunction()) +``` + +## Low-level Joins + +To realize low-level operations on two inputs, applications can use `CoProcessFunction` or `KeyedCoProcessFunction`. This +function is bound to two different inputs and gets individual calls to `processElement1(...)` and +`processElement2(...)` for records from the two different inputs. + +Implementing a low level join typically follows this pattern: + + - Create a state object for one input (or both) + - Update the state upon receiving elements from its input + - Upon receiving elements from the other input, probe the state and produce the joined result + +For example, you might be joining customer data to financial trades, +while keeping state for the customer data. If you care about having +complete and deterministic joins in the face of out-of-order events, +you can use a timer to evaluate and emit the join for a trade when the +watermark for the customer data stream has passed the time of that +trade. + +## Example + +In the following example a `KeyedProcessFunction` maintains counts per key, and emits a key/count pair whenever a minute passes (in event time) without an update for that key: + + - The count, key, and last-modification-timestamp are stored in a `ValueState`, which is implicitly scoped by key. + - For each record, the `KeyedProcessFunction` increments the counter and sets the last-modification timestamp + - The function also schedules a callback one minute into the future (in event time) + - Upon each callback, it checks the callback's event time timestamp against the last-modification time of the stored count + and emits the key/count if they match (i.e., no further update occurred during that minute) + +{{< hint info >}} +This simple example could have been implemented with +session windows. We use `KeyedProcessFunction` here to illustrate the basic pattern it provides. 
+{{< /hint >}} + +{{< tabs "6c8c009c-4c12-4338-9eeb-3be83cfa9e37" >}} +{{< tab "Java" >}} + +```java +import org.apache.flink.api.common.state.ValueState; +import org.apache.flink.api.common.state.ValueStateDescriptor; +import org.apache.flink.api.java.tuple.Tuple; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.functions.KeyedProcessFunction; +import org.apache.flink.streaming.api.functions.KeyedProcessFunction.Context; +import org.apache.flink.streaming.api.functions.KeyedProcessFunction.OnTimerContext; +import org.apache.flink.util.Collector; + + +// the source data stream +DataStream> stream = ...; + +// apply the process function onto a keyed stream +DataStream> result = stream + .keyBy(value -> value.f0) + .process(new CountWithTimeoutFunction()); + +/** + * The data type stored in the state + */ +public class CountWithTimestamp { + + public String key; + public long count; + public long lastModified; +} + +/** + * The implementation of the ProcessFunction that maintains the count and timeouts + */ +public class CountWithTimeoutFunction + extends KeyedProcessFunction, Tuple2> { + + /** The state that is maintained by this process function */ + private ValueState state; + + @Override + public void open(Configuration parameters) throws Exception { + state = getRuntimeContext().getState(new ValueStateDescriptor<>("myState", CountWithTimestamp.class)); + } + + @Override + public void processElement( + Tuple2 value, + Context ctx, + Collector> out) throws Exception { + + // retrieve the current count + CountWithTimestamp current = state.value(); + if (current == null) { + current = new CountWithTimestamp(); + current.key = value.f0; + } + + // update the state's count + current.count++; + + // set the state's timestamp to the record's assigned event time timestamp + current.lastModified = ctx.timestamp(); + + // write the state back + state.update(current); + + // schedule the next timer 60 seconds from the current event time + ctx.timerService().registerEventTimeTimer(current.lastModified + 60000); + } + + @Override + public void onTimer( + long timestamp, + OnTimerContext ctx, + Collector> out) throws Exception { + + // get the state for the key that scheduled the timer + CountWithTimestamp result = state.value(); + + // check if this is an outdated timer or the latest timer + if (timestamp == result.lastModified + 60000) { + // emit the state on timeout + out.collect(new Tuple2(result.key, result.count)); + } + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.common.state.ValueState +import org.apache.flink.api.common.state.ValueStateDescriptor +import org.apache.flink.api.java.tuple.Tuple +import org.apache.flink.streaming.api.functions.KeyedProcessFunction +import org.apache.flink.util.Collector + +// the source data stream +val stream: DataStream[Tuple2[String, String]] = ... 
+ +// apply the process function onto a keyed stream +val result: DataStream[Tuple2[String, Long]] = stream + .keyBy(_._1) + .process(new CountWithTimeoutFunction()) + +/** + * The data type stored in the state + */ +case class CountWithTimestamp(key: String, count: Long, lastModified: Long) + +/** + * The implementation of the ProcessFunction that maintains the count and timeouts + */ +class CountWithTimeoutFunction extends KeyedProcessFunction[Tuple, (String, String), (String, Long)] { + + /** The state that is maintained by this process function */ + lazy val state: ValueState[CountWithTimestamp] = getRuntimeContext + .getState(new ValueStateDescriptor[CountWithTimestamp]("myState", classOf[CountWithTimestamp])) + + + override def processElement( + value: (String, String), + ctx: KeyedProcessFunction[Tuple, (String, String), (String, Long)]#Context, + out: Collector[(String, Long)]): Unit = { + + // initialize or retrieve/update the state + val current: CountWithTimestamp = state.value match { + case null => + CountWithTimestamp(value._1, 1, ctx.timestamp) + case CountWithTimestamp(key, count, lastModified) => + CountWithTimestamp(key, count + 1, ctx.timestamp) + } + + // write the state back + state.update(current) + + // schedule the next timer 60 seconds from the current event time + ctx.timerService.registerEventTimeTimer(current.lastModified + 60000) + } + + override def onTimer( + timestamp: Long, + ctx: KeyedProcessFunction[Tuple, (String, String), (String, Long)]#OnTimerContext, + out: Collector[(String, Long)]): Unit = { + + state.value match { + case CountWithTimestamp(key, count, lastModified) if (timestamp == lastModified + 60000) => + out.collect((key, count)) + case _ => + } + } +} +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +import datetime + +from pyflink.common import Row, WatermarkStrategy +from pyflink.common.typeinfo import Types +from pyflink.common.watermark_strategy import TimestampAssigner +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.datastream.functions import KeyedProcessFunction, RuntimeContext +from pyflink.datastream.state import ValueStateDescriptor +from pyflink.table import StreamTableEnvironment + + +class CountWithTimeoutFunction(KeyedProcessFunction): + + def __init__(self): + self.state = None + + def open(self, runtime_context: RuntimeContext): + self.state = runtime_context.get_state(ValueStateDescriptor( + "my_state", Types.ROW([Types.STRING(), Types.LONG(), Types.LONG()]))) + + def process_element(self, value, ctx: 'KeyedProcessFunction.Context'): + # retrieve the current count + current = self.state.value() + if current is None: + current = Row(value.f1, 0, 0) + + # update the state's count + current[1] += 1 + + # set the state's timestamp to the record's assigned event time timestamp + current[2] = ctx.timestamp() + + # write the state back + self.state.update(current) + + # schedule the next timer 60 seconds from the current event time + ctx.timer_service().register_event_time_timer(current[2] + 60000) + + def on_timer(self, timestamp: int, ctx: 'KeyedProcessFunction.OnTimerContext'): + # get the state for the key that scheduled the timer + result = self.state.value() + + # check if this is an outdated timer or the latest timer + if timestamp == result[2] + 60000: + # emit the state on timeout + yield result[0], result[1] + + +class MyTimestampAssigner(TimestampAssigner): + + def __init__(self): + self.epoch = datetime.datetime.utcfromtimestamp(0) + + def extract_timestamp(self, value, record_timestamp) -> 
int: + return int((value[0] - self.epoch).total_seconds() * 1000) + + +if __name__ == '__main__': + env = StreamExecutionEnvironment.get_execution_environment() + t_env = StreamTableEnvironment.create(stream_execution_environment=env) + + t_env.execute_sql(""" + CREATE TABLE my_source ( + a TIMESTAMP(3), + b VARCHAR, + c VARCHAR + ) WITH ( + 'connector' = 'datagen', + 'rows-per-second' = '10' + ) + """) + + stream = t_env.to_append_stream( + t_env.from_path('my_source'), + Types.ROW([Types.SQL_TIMESTAMP(), Types.STRING(), Types.STRING()])) + watermarked_stream = stream.assign_timestamps_and_watermarks( + WatermarkStrategy.for_monotonous_timestamps() + .with_timestamp_assigner(MyTimestampAssigner())) + + # apply the process function onto a keyed stream + result = watermarked_stream.key_by(lambda value: value[1]) \ + .process(CountWithTimeoutFunction()) \ + .print() + env.execute() +``` +{{< /tab >}} +{{< /tabs >}} + + +{{< hint warning >}} +Before Flink 1.4.0, when called from a processing-time timer, the `ProcessFunction.onTimer()` method sets +the current processing time as event-time timestamp. This behavior is very subtle and might not be noticed by users. Well, it's +harmful because processing-time timestamps are indeterministic and not aligned with watermarks. Besides, user-implemented logic +depends on this wrong timestamp highly likely is unintendedly faulty. So we've decided to fix it. Upon upgrading to 1.4.0, Flink jobs +that are using this incorrect event-time timestamp will fail, and users should adapt their jobs to the correct logic. +{{< /hint >}} + +## The KeyedProcessFunction + +`KeyedProcessFunction`, as an extension of `ProcessFunction`, gives access to the key of timers in its `onTimer(...)` +method. + +{{< tabs "f8b6791f-023f-4e56-a6e4-8541dd0b3e1b" >}} +{{< tab "Java" >}} +```java +@Override +public void onTimer(long timestamp, OnTimerContext ctx, Collector out) throws Exception { + K key = ctx.getCurrentKey(); + // ... +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +override def onTimer(timestamp: Long, ctx: OnTimerContext, out: Collector[OUT]): Unit = { + var key = ctx.getCurrentKey + // ... +} +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +def on_timer(self, timestamp: int, ctx: 'KeyedProcessFunction.OnTimerContext'): + key = ctx.get_current_key() + # ... +``` +{{< /tab >}} +{{< /tabs >}} + +## Timers + +Both types of timers (processing-time and event-time) are internally maintained by the `TimerService` and enqueued for execution. + +The `TimerService` deduplicates timers per key and timestamp, i.e., there is at most one timer per key and timestamp. If multiple timers are registered for the same timestamp, the `onTimer()` method will be called just once. + +Flink synchronizes invocations of `onTimer()` and `processElement()`. Hence, users do not have to worry about concurrent modification of state. + +### Fault Tolerance + +Timers are fault tolerant and checkpointed along with the state of the application. +In case of a failure recovery or when starting an application from a savepoint, the timers are restored. + +{{< hint info >}} +Checkpointed processing-time timers that were supposed to fire before their restoration, will fire immediately. +This might happen when an application recovers from a failure or when it is started from a savepoint. 
+{{< /hint >}} + +{{< hint info >}} +Timers are always asynchronously checkpointed, except for the combination of RocksDB backend / with incremental snapshots / with heap-based timers (will be resolved with `FLINK-10026`). +Notice that large numbers of timers can increase the checkpointing time because timers are part of the checkpointed state. See the "Timer Coalescing" section for advice on how to reduce the number of timers. +{{< /hint >}} + +### Timer Coalescing + +Since Flink maintains only one timer per key and timestamp, you can reduce the number of timers by reducing the timer resolution to coalesce them. + +For a timer resolution of 1 second (event or processing time), you +can round down the target time to full seconds. Timers will fire at most 1 second earlier but not later than requested with millisecond accuracy. +As a result, there are at most one timer per key and second. + +{{< tabs "aa23eeb6-d15f-44f2-85ab-d130a4202d57" >}} +{{< tab "Java" >}} +```java +long coalescedTime = ((ctx.timestamp() + timeout) / 1000) * 1000; +ctx.timerService().registerProcessingTimeTimer(coalescedTime); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val coalescedTime = ((ctx.timestamp + timeout) / 1000) * 1000 +ctx.timerService.registerProcessingTimeTimer(coalescedTime) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +coalesced_time = ((ctx.timestamp() + timeout) // 1000) * 1000 +ctx.timer_service().register_processing_time_timer(coalesced_time) +``` +{{< /tab >}} +{{< /tabs >}} + +Since event-time timers only fire with watermarks coming in, you may also schedule and coalesce +these timers with the next watermark by using the current one: + +{{< tabs "ef74a1da-c4cd-4fab-8035-d29ffd7039d4" >}} +{{< tab "Java" >}} +```java +long coalescedTime = ctx.timerService().currentWatermark() + 1; +ctx.timerService().registerEventTimeTimer(coalescedTime); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val coalescedTime = ctx.timerService.currentWatermark + 1 +ctx.timerService.registerEventTimeTimer(coalescedTime) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +coalesced_time = ctx.timer_service().current_watermark() + 1 +ctx.timer_service().register_event_time_timer(coalesced_time) +``` +{{< /tab >}} +{{< /tabs >}} + +Timers can also be stopped and removed as follows: + +Stopping a processing-time timer: + +{{< tabs "5d0d1344-6f51-44f8-b500-ebe863cedba4" >}} +{{< tab "Java" >}} +```java +long timestampOfTimerToStop = ... +ctx.timerService().deleteProcessingTimeTimer(timestampOfTimerToStop); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val timestampOfTimerToStop = ... +ctx.timerService.deleteProcessingTimeTimer(timestampOfTimerToStop) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +timestamp_of_timer_to_stop = ... +ctx.timer_service().delete_processing_time_timer(timestamp_of_timer_to_stop) +``` +{{< /tab >}} +{{< /tabs >}} + +Stopping an event-time timer: + +{{< tabs "581e5996-503c-452e-8b2a-a4daeaf4ac88" >}} +{{< tab "Java" >}} +```java +long timestampOfTimerToStop = ... +ctx.timerService().deleteEventTimeTimer(timestampOfTimerToStop); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val timestampOfTimerToStop = ... +ctx.timerService.deleteEventTimeTimer(timestampOfTimerToStop) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +timestamp_of_timer_to_stop = ... +ctx.timer_service().delete_event_time_timer(timestamp_of_timer_to_stop) +``` +{{< /tab >}} +{{< /tabs >}} + +{{< hint info >}} +Stopping a timer has no effect if no such timer with the given timestamp is registered. 
+{{< /hint >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/datastream/operators/windows.md b/docs/content.zh/docs/dev/datastream/operators/windows.md new file mode 100644 index 0000000000000..0c5caff486b17 --- /dev/null +++ b/docs/content.zh/docs/dev/datastream/operators/windows.md @@ -0,0 +1,1320 @@ +--- +title: 窗口 +weight: 2 +type: docs +aliases: + - /zh/dev/stream/operators/windows.html +--- + + +# 窗口 + +Windows are at the heart of processing infinite streams. Windows split the stream into "buckets" of finite size, +over which we can apply computations. This document focuses on how windowing is performed in Flink and how the +programmer can benefit to the maximum from its offered functionality. + +The general structure of a windowed Flink program is presented below. The first snippet refers to *keyed* streams, +while the second to *non-keyed* ones. As one can see, the only difference is the `keyBy(...)` call for the keyed streams +and the `window(...)` which becomes `windowAll(...)` for non-keyed streams. This is also going to serve as a roadmap +for the rest of the page. + +**Keyed Windows** + + stream + .keyBy(...) <- keyed versus non-keyed windows + .window(...) <- required: "assigner" + [.trigger(...)] <- optional: "trigger" (else default trigger) + [.evictor(...)] <- optional: "evictor" (else no evictor) + [.allowedLateness(...)] <- optional: "lateness" (else zero) + [.sideOutputLateData(...)] <- optional: "output tag" (else no side output for late data) + .reduce/aggregate/apply() <- required: "function" + [.getSideOutput(...)] <- optional: "output tag" + +**Non-Keyed Windows** + + stream + .windowAll(...) <- required: "assigner" + [.trigger(...)] <- optional: "trigger" (else default trigger) + [.evictor(...)] <- optional: "evictor" (else no evictor) + [.allowedLateness(...)] <- optional: "lateness" (else zero) + [.sideOutputLateData(...)] <- optional: "output tag" (else no side output for late data) + .reduce/aggregate/apply() <- required: "function" + [.getSideOutput(...)] <- optional: "output tag" + +In the above, the commands in square brackets ([...]) are optional. This reveals that Flink allows you to customize your +windowing logic in many different ways so that it best fits your needs. + + + +## Window Lifecycle + +In a nutshell, a window is **created** as soon as the first element that should belong to this window arrives, and the +window is **completely removed** when the time (event or processing time) passes its end timestamp plus the user-specified +`allowed lateness` (see [Allowed Lateness](#allowed-lateness)). Flink guarantees removal only for time-based +windows and not for other types, *e.g.* global windows (see [Window Assigners](#window-assigners)). For example, with an +event-time-based windowing strategy that creates non-overlapping (or tumbling) windows every 5 minutes and has an allowed +lateness of 1 min, Flink will create a new window for the interval between `12:00` and `12:05` when the first element with +a timestamp that falls into this interval arrives, and it will remove it when the watermark passes the `12:06` +timestamp. + +In addition, each window will have a `Trigger` (see [Triggers](#triggers)) and a function (`ProcessWindowFunction`, `ReduceFunction`, +or `AggregateFunction`) (see [Window Functions](#window-functions)) attached to it. 
The function will contain the computation to
+be applied to the contents of the window, while the `Trigger` specifies the conditions under which the window is
+considered ready for the function to be applied. A triggering policy might be something like "when the number of elements
+in the window is more than 4", or "when the watermark passes the end of the window". A trigger can also decide to
+purge a window's contents any time between its creation and removal. Purging in this case only refers to the elements
+in the window, and *not* the window metadata. This means that new data can still be added to that window.
+
+Apart from the above, you can specify an `Evictor` (see [Evictors](#evictors)) which will be able to remove
+elements from the window after the trigger fires and before and/or after the function is applied.
+
+In the following, we go into more detail for each of the components above. We start with the required parts in the above
+snippet (see [Keyed vs Non-Keyed Windows](#keyed-vs-non-keyed-windows), [Window Assigners](#window-assigners), and
+[Window Functions](#window-functions)) before moving to the optional ones.
+
+## Keyed vs Non-Keyed Windows
+
+The first thing to specify is whether your stream should be keyed or not. This has to be done before defining the window.
+Using `keyBy(...)` will split your infinite stream into logical keyed streams. If `keyBy(...)` is not called, your
+stream is not keyed.
+
+In the case of keyed streams, any attribute of your incoming events can be used as a key
+(more details [here]({{< ref "docs/dev/datastream/fault-tolerance/state" >}}#keyed-datastream)). Having a keyed stream will
+allow your windowed computation to be performed in parallel by multiple tasks, as each logical keyed stream can be processed
+independently from the rest. All elements referring to the same key will be sent to the same parallel task.
+
+In the case of non-keyed streams, your original stream will not be split into multiple logical streams and all the windowing logic
+will be performed by a single task, *i.e.* with a parallelism of 1.
+
+## Window Assigners
+
+After specifying whether your stream is keyed or not, the next step is to define a *window assigner*.
+The window assigner defines how elements are assigned to windows. This is done by specifying the `WindowAssigner`
+of your choice in the `window(...)` (for *keyed* streams) or the `windowAll()` (for *non-keyed* streams) call.
+
+A `WindowAssigner` is responsible for assigning each incoming element to one or more windows. Flink comes
+with pre-defined window assigners for the most common use cases, namely *tumbling windows*,
+*sliding windows*, *session windows* and *global windows*. You can also implement a custom window assigner by
+extending the `WindowAssigner` class. All built-in window assigners (except the global
+windows) assign elements to windows based on time, which can either be processing time or event
+time. Please take a look at our section on [event time]({{< ref "docs/concepts/time" >}}) to learn
+about the difference between processing time and event time and how timestamps and watermarks are generated.
+
+Time-based windows have a *start timestamp* (inclusive) and an *end timestamp* (exclusive)
+that together describe the size of the window. In code, Flink uses `TimeWindow` when working with
+time-based windows, which has methods for querying the start and end timestamps and also an
+additional method `maxTimestamp()` that returns the largest allowed timestamp for a given window.
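+
+To make these interval semantics concrete, the following is a minimal sketch (not part of the original example set;
+the element type `Long`, the class name, and the emitted string are assumptions) that simply emits the boundaries of
+each window it processes. For a `TimeWindow`, `maxTimestamp()` is `getEnd() - 1`, i.e. the last timestamp that still
+belongs to the window.
+
+```java
+import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
+import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
+import org.apache.flink.util.Collector;
+
+// Sketch: inspect window boundaries inside a ProcessWindowFunction.
+public class WindowBoundsFunction
+        extends ProcessWindowFunction<Long, String, String, TimeWindow> {
+
+    @Override
+    public void process(
+            String key,
+            Context context,
+            Iterable<Long> elements,
+            Collector<String> out) {
+        TimeWindow window = context.window();
+        out.collect(
+            "key=" + key
+                + " start=" + window.getStart()              // inclusive
+                + " end=" + window.getEnd()                  // exclusive
+                + " maxTimestamp=" + window.maxTimestamp()); // == getEnd() - 1
+    }
+}
+```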
+ +In the following, we show how Flink's pre-defined window assigners work and how they are used +in a DataStream program. The following figures visualize the workings of each assigner. The purple circles +represent elements of the stream, which are partitioned by some key (in this case *user 1*, *user 2* and *user 3*). +The x-axis shows the progress of time. + +### Tumbling Windows + +A *tumbling windows* assigner assigns each element to a window of a specified *window size*. +Tumbling windows have a fixed size and do not overlap. For example, if you specify a tumbling +window with a size of 5 minutes, the current window will be evaluated and a new window will be +started every five minutes as illustrated by the following figure. + +{{< img src="/fig/tumbling-windows.svg" alt="Tumbling Windows" >}} + +The following code snippets show how to use tumbling windows. + +{{< tabs "cb126c86-cbcc-4d11-bfac-ccf663073c38" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +// tumbling event-time windows +input + .keyBy() + .window(TumblingEventTimeWindows.of(Time.seconds(5))) + .(); + +// tumbling processing-time windows +input + .keyBy() + .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) + .(); + +// daily tumbling event-time windows offset by -8 hours. +input + .keyBy() + .window(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))) + .(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[T] = ... + +// tumbling event-time windows +input + .keyBy() + .window(TumblingEventTimeWindows.of(Time.seconds(5))) + .() + +// tumbling processing-time windows +input + .keyBy() + .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) + .() + +// daily tumbling event-time windows offset by -8 hours. +input + .keyBy() + .window(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))) + .() +``` +{{< /tab >}} +{{< /tabs >}} + +Time intervals can be specified by using one of `Time.milliseconds(x)`, `Time.seconds(x)`, +`Time.minutes(x)`, and so on. + +As shown in the last example, tumbling window assigners also take an optional `offset` +parameter that can be used to change the alignment of windows. For example, without offsets +hourly tumbling windows are aligned with epoch, that is you will get windows such as +`1:00:00.000 - 1:59:59.999`, `2:00:00.000 - 2:59:59.999` and so on. If you want to change +that you can give an offset. With an offset of 15 minutes you would, for example, get +`1:15:00.000 - 2:14:59.999`, `2:15:00.000 - 3:14:59.999` etc. +An important use case for offsets is to adjust windows to timezones other than UTC-0. +For example, in China you would have to specify an offset of `Time.hours(-8)`. + +### Sliding Windows + +The *sliding windows* assigner assigns elements to windows of fixed length. Similar to a tumbling +windows assigner, the size of the windows is configured by the *window size* parameter. +An additional *window slide* parameter controls how frequently a sliding window is started. Hence, +sliding windows can be overlapping if the slide is smaller than the window size. In this case elements +are assigned to multiple windows. + +For example, you could have windows of size 10 minutes that slides by 5 minutes. With this you get every +5 minutes a window that contains the events that arrived during the last 10 minutes as depicted by the +following figure. + +{{< img src="/fig/sliding-windows.svg" alt="sliding windows" >}} + +The following code snippets show how to use sliding windows. 
+ +{{< tabs "89a9d497-6404-4333-b7bc-c8a465620279" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +// sliding event-time windows +input + .keyBy() + .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5))) + .(); + +// sliding processing-time windows +input + .keyBy() + .window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5))) + .(); + +// sliding processing-time windows offset by -8 hours +input + .keyBy() + .window(SlidingProcessingTimeWindows.of(Time.hours(12), Time.hours(1), Time.hours(-8))) + .(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[T] = ... + +// sliding event-time windows +input + .keyBy() + .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5))) + .() + +// sliding processing-time windows +input + .keyBy() + .window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5))) + .() + +// sliding processing-time windows offset by -8 hours +input + .keyBy() + .window(SlidingProcessingTimeWindows.of(Time.hours(12), Time.hours(1), Time.hours(-8))) + .() +``` +{{< /tab >}} +{{< /tabs >}} + +Time intervals can be specified by using one of `Time.milliseconds(x)`, `Time.seconds(x)`, +`Time.minutes(x)`, and so on. + +As shown in the last example, sliding window assigners also take an optional `offset` parameter +that can be used to change the alignment of windows. For example, without offsets hourly windows +sliding by 30 minutes are aligned with epoch, that is you will get windows such as +`1:00:00.000 - 1:59:59.999`, `1:30:00.000 - 2:29:59.999` and so on. If you want to change that +you can give an offset. With an offset of 15 minutes you would, for example, get +`1:15:00.000 - 2:14:59.999`, `1:45:00.000 - 2:44:59.999` etc. +An important use case for offsets is to adjust windows to timezones other than UTC-0. +For example, in China you would have to specify an offset of `Time.hours(-8)`. + +### Session Windows + +The *session windows* assigner groups elements by sessions of activity. Session windows do not overlap and +do not have a fixed start and end time, in contrast to *tumbling windows* and *sliding windows*. Instead a +session window closes when it does not receive elements for a certain period of time, *i.e.*, when a gap of +inactivity occurred. A session window assigner can be configured with either a static *session gap* or with a +*session gap extractor* function which defines how long the period of inactivity is. When this period expires, +the current session closes and subsequent elements are assigned to a new session window. + +{{< img src="/fig/session-windows.svg" alt="session windows" >}} + +The following code snippets show how to use session windows. 
+ +{{< tabs "9178a99c-4a54-491a-8182-499d23a7432c" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +// event-time session windows with static gap +input + .keyBy() + .window(EventTimeSessionWindows.withGap(Time.minutes(10))) + .(); + +// event-time session windows with dynamic gap +input + .keyBy() + .window(EventTimeSessionWindows.withDynamicGap((element) -> { + // determine and return session gap + })) + .(); + +// processing-time session windows with static gap +input + .keyBy() + .window(ProcessingTimeSessionWindows.withGap(Time.minutes(10))) + .(); + +// processing-time session windows with dynamic gap +input + .keyBy() + .window(ProcessingTimeSessionWindows.withDynamicGap((element) -> { + // determine and return session gap + })) + .(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[T] = ... + +// event-time session windows with static gap +input + .keyBy() + .window(EventTimeSessionWindows.withGap(Time.minutes(10))) + .() + +// event-time session windows with dynamic gap +input + .keyBy() + .window(EventTimeSessionWindows.withDynamicGap(new SessionWindowTimeGapExtractor[String] { + override def extract(element: String): Long = { + // determine and return session gap + } + })) + .() + +// processing-time session windows with static gap +input + .keyBy() + .window(ProcessingTimeSessionWindows.withGap(Time.minutes(10))) + .() + + +// processing-time session windows with dynamic gap +input + .keyBy() + .window(DynamicProcessingTimeSessionWindows.withDynamicGap(new SessionWindowTimeGapExtractor[String] { + override def extract(element: String): Long = { + // determine and return session gap + } + })) + .() +``` +{{< /tab >}} +{{< /tabs >}} + +Static gaps can be specified by using one of `Time.milliseconds(x)`, `Time.seconds(x)`, +`Time.minutes(x)`, and so on. + +Dynamic gaps are specified by implementing the `SessionWindowTimeGapExtractor` interface. + +{{< hint info >}} +Since session windows do not have a fixed start and end, +they are evaluated differently than tumbling and sliding windows. Internally, a session window operator +creates a new window for each arriving record and merges windows together if they are closer to each other +than the defined gap. +In order to be mergeable, a session window operator requires a merging [Trigger](#triggers) and a merging +[Window Function](#window-functions), such as `ReduceFunction`, `AggregateFunction`, or `ProcessWindowFunction` +{{< /hint >}} + +### Global Windows + +A *global windows* assigner assigns all elements with the same key to the same single *global window*. +This windowing scheme is only useful if you also specify a custom [trigger](#triggers). Otherwise, +no computation will be performed, as the global window does not have a natural end at +which we could process the aggregated elements. + +{{< img src="/fig/non-windowed.svg" alt="global windows" >}} + +The following code snippets show how to use a global window. + +{{< tabs "3f113886-ba6f-46eb-96fe-db56292a8285" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +input + .keyBy() + .window(GlobalWindows.create()) + .(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[T] = ... + +input + .keyBy() + .window(GlobalWindows.create()) + .() +``` +{{< /tab >}} +{{< /tabs >}} + +## Window Functions + +After defining the window assigner, we need to specify the computation that we want +to perform on each of these windows. 
This is the responsibility of the *window function*, which is used to process the +elements of each (possibly keyed) window once the system determines that a window is ready for processing +(see [triggers](#triggers) for how Flink determines when a window is ready). + +The window function can be one of `ReduceFunction`, `AggregateFunction`, or `ProcessWindowFunction`. The first +two can be executed more efficiently (see [State Size](#useful-state-size-considerations) section) because Flink can incrementally aggregate +the elements for each window as they arrive. A `ProcessWindowFunction` gets an `Iterable` for all the elements contained in a +window and additional meta information about the window to which the elements belong. + +A windowed transformation with a `ProcessWindowFunction` cannot be executed as efficiently as the other +cases because Flink has to buffer *all* elements for a window internally before invoking the function. +This can be mitigated by combining a `ProcessWindowFunction` with a `ReduceFunction`, or `AggregateFunction` to +get both incremental aggregation of window elements and the additional window metadata that the +`ProcessWindowFunction` receives. We will look at examples for each of these variants. + +### ReduceFunction + +A `ReduceFunction` specifies how two elements from the input are combined to produce +an output element of the same type. Flink uses a `ReduceFunction` to incrementally aggregate +the elements of a window. + +A `ReduceFunction` can be defined and used like this: + +{{< tabs "e49c2bc0-a4cf-4ead-acbc-d96a98e1c6ff" >}} +{{< tab "Java" >}} +```java +DataStream> input = ...; + +input + .keyBy() + .window() + .reduce(new ReduceFunction>() { + public Tuple2 reduce(Tuple2 v1, Tuple2 v2) { + return new Tuple2<>(v1.f0, v1.f1 + v2.f1); + } + }); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[(String, Long)] = ... + +input + .keyBy() + .window() + .reduce { (v1, v2) => (v1._1, v1._2 + v2._2) } +``` +{{< /tab >}} +{{< /tabs >}} + +The above example sums up the second fields of the tuples for all elements in a window. + +### AggregateFunction + +An `AggregateFunction` is a generalized version of a `ReduceFunction` that has three types: an +input type (`IN`), accumulator type (`ACC`), and an output type (`OUT`). The input type is the type +of elements in the input stream and the `AggregateFunction` has a method for adding one input +element to an accumulator. The interface also has methods for creating an initial accumulator, +for merging two accumulators into one accumulator and for extracting an output (of type `OUT`) from +an accumulator. We will see how this works in the example below. + +Same as with `ReduceFunction`, Flink will incrementally aggregate input elements of a window as they +arrive. + +An `AggregateFunction` can be defined and used like this: + +{{< tabs "7084ece9-370e-42e3-8130-e47cc9a6c600" >}} +{{< tab "Java" >}} +```java + +/** + * The accumulator is used to keep a running sum and a count. The {@code getResult} method + * computes the average. 
+ */ +private static class AverageAggregate + implements AggregateFunction, Tuple2, Double> { + @Override + public Tuple2 createAccumulator() { + return new Tuple2<>(0L, 0L); + } + + @Override + public Tuple2 add(Tuple2 value, Tuple2 accumulator) { + return new Tuple2<>(accumulator.f0 + value.f1, accumulator.f1 + 1L); + } + + @Override + public Double getResult(Tuple2 accumulator) { + return ((double) accumulator.f0) / accumulator.f1; + } + + @Override + public Tuple2 merge(Tuple2 a, Tuple2 b) { + return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1); + } +} + +DataStream> input = ...; + +input + .keyBy() + .window() + .aggregate(new AverageAggregate()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +/** + * The accumulator is used to keep a running sum and a count. The [getResult] method + * computes the average. + */ +class AverageAggregate extends AggregateFunction[(String, Long), (Long, Long), Double] { + override def createAccumulator() = (0L, 0L) + + override def add(value: (String, Long), accumulator: (Long, Long)) = + (accumulator._1 + value._2, accumulator._2 + 1L) + + override def getResult(accumulator: (Long, Long)) = accumulator._1 / accumulator._2 + + override def merge(a: (Long, Long), b: (Long, Long)) = + (a._1 + b._1, a._2 + b._2) +} + +val input: DataStream[(String, Long)] = ... + +input + .keyBy() + .window() + .aggregate(new AverageAggregate) +``` +{{< /tab >}} +{{< /tabs >}} + +The above example computes the average of the second field of the elements in the window. + +### ProcessWindowFunction + +A ProcessWindowFunction gets an Iterable containing all the elements of the window, and a Context +object with access to time and state information, which enables it to provide more flexibility than +other window functions. This comes at the cost of performance and resource consumption, because +elements cannot be incrementally aggregated but instead need to be buffered internally until the +window is considered ready for processing. + +The signature of `ProcessWindowFunction` looks as follows: + +{{< tabs "ce96f848-dcbf-4f8b-a079-afc301036da2" >}} +{{< tab "Java" >}} +```java +public abstract class ProcessWindowFunction implements Function { + + /** + * Evaluates the window and outputs none or several elements. + * + * @param key The key for which this window is evaluated. + * @param context The context in which the window is being evaluated. + * @param elements The elements in the window being evaluated. + * @param out A collector for emitting elements. + * + * @throws Exception The function may throw exceptions to fail the program and trigger recovery. + */ + public abstract void process( + KEY key, + Context context, + Iterable elements, + Collector out) throws Exception; + + /** + * The context holding window metadata. + */ + public abstract class Context implements java.io.Serializable { + /** + * Returns the window that is being evaluated. + */ + public abstract W window(); + + /** Returns the current processing time. */ + public abstract long currentProcessingTime(); + + /** Returns the current event-time watermark. */ + public abstract long currentWatermark(); + + /** + * State accessor for per-key and per-window state. + * + *

    NOTE:If you use per-window state you have to ensure that you clean it up + * by implementing {@link ProcessWindowFunction#clear(Context)}. + */ + public abstract KeyedStateStore windowState(); + + /** + * State accessor for per-key global state. + */ + public abstract KeyedStateStore globalState(); + } + +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +abstract class ProcessWindowFunction[IN, OUT, KEY, W <: Window] extends Function { + + /** + * Evaluates the window and outputs none or several elements. + * + * @param key The key for which this window is evaluated. + * @param context The context in which the window is being evaluated. + * @param elements The elements in the window being evaluated. + * @param out A collector for emitting elements. + * @throws Exception The function may throw exceptions to fail the program and trigger recovery. + */ + def process( + key: KEY, + context: Context, + elements: Iterable[IN], + out: Collector[OUT]) + + /** + * The context holding window metadata + */ + abstract class Context { + /** + * Returns the window that is being evaluated. + */ + def window: W + + /** + * Returns the current processing time. + */ + def currentProcessingTime: Long + + /** + * Returns the current event-time watermark. + */ + def currentWatermark: Long + + /** + * State accessor for per-key and per-window state. + */ + def windowState: KeyedStateStore + + /** + * State accessor for per-key global state. + */ + def globalState: KeyedStateStore + } + +} +``` +{{< /tab >}} +{{< /tabs >}} + +The `key` parameter is the key that is extracted +via the `KeySelector` that was specified for the `keyBy()` invocation. In case of tuple-index +keys or string-field references this key type is always `Tuple` and you have to manually cast +it to a tuple of the correct size to extract the key fields. + +A `ProcessWindowFunction` can be defined and used like this: + +{{< tabs "23f086d2-fc10-4dc6-9edc-0e69a2deefdb" >}} +{{< tab "Java" >}} +```java +DataStream> input = ...; + +input + .keyBy(t -> t.f0) + .window(TumblingEventTimeWindows.of(Time.minutes(5))) + .process(new MyProcessWindowFunction()); + +/* ... */ + +public class MyProcessWindowFunction + extends ProcessWindowFunction, String, String, TimeWindow> { + + @Override + public void process(String key, Context context, Iterable> input, Collector out) { + long count = 0; + for (Tuple2 in: input) { + count++; + } + out.collect("Window: " + context.window() + "count: " + count); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[(String, Long)] = ... + +input + .keyBy(_._1) + .window(TumblingEventTimeWindows.of(Time.minutes(5))) + .process(new MyProcessWindowFunction()) + +/* ... */ + +class MyProcessWindowFunction extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] { + + def process(key: String, context: Context, input: Iterable[(String, Long)], out: Collector[String]) = { + var count = 0L + for (in <- input) { + count = count + 1 + } + out.collect(s"Window ${context.window} count: $count") + } +} +``` +{{< /tab >}} +{{< /tabs >}} + +The example shows a `ProcessWindowFunction` that counts the elements in a window. In addition, the window function adds information about the window to the output. + +{{< hint info >}} +Note that using `ProcessWindowFunction` for simple aggregates such as count is quite inefficient. 
The next section shows how a `ReduceFunction` or `AggregateFunction` can be combined with a `ProcessWindowFunction` to get both incremental aggregation and the added information of a `ProcessWindowFunction`. +{{< /hint >}} + +### ProcessWindowFunction with Incremental Aggregation + +A `ProcessWindowFunction` can be combined with either a `ReduceFunction`, or an `AggregateFunction` to +incrementally aggregate elements as they arrive in the window. +When the window is closed, the `ProcessWindowFunction` will be provided with the aggregated result. +This allows it to incrementally compute windows while having access to the +additional window meta information of the `ProcessWindowFunction`. + +You can also use the legacy `WindowFunction` instead of `ProcessWindowFunction` for incremental window aggregation. + +#### Incremental Window Aggregation with ReduceFunction + +The following example shows how an incremental `ReduceFunction` can be combined with +a `ProcessWindowFunction` to return the smallest event in a window along +with the start time of the window. + +{{< tabs "de5305c3-7e43-4a83-a63e-6eb93be8eb9b" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +input + .keyBy() + .window() + .reduce(new MyReduceFunction(), new MyProcessWindowFunction()); + +// Function definitions + +private static class MyReduceFunction implements ReduceFunction { + + public SensorReading reduce(SensorReading r1, SensorReading r2) { + return r1.value() > r2.value() ? r2 : r1; + } +} + +private static class MyProcessWindowFunction + extends ProcessWindowFunction, String, TimeWindow> { + + public void process(String key, + Context context, + Iterable minReadings, + Collector> out) { + SensorReading min = minReadings.iterator().next(); + out.collect(new Tuple2(context.window().getStart(), min)); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val input: DataStream[SensorReading] = ... + +input + .keyBy() + .window() + .reduce( + (r1: SensorReading, r2: SensorReading) => { if (r1.value > r2.value) r2 else r1 }, + ( key: String, + context: ProcessWindowFunction[_, _, _, TimeWindow]#Context, + minReadings: Iterable[SensorReading], + out: Collector[(Long, SensorReading)] ) => + { + val min = minReadings.iterator.next() + out.collect((context.window.getStart, min)) + } + ) + +``` +{{< /tab >}} +{{< /tabs >}} + +#### Incremental Window Aggregation with AggregateFunction + +The following example shows how an incremental `AggregateFunction` can be combined with +a `ProcessWindowFunction` to compute the average and also emit the key and window along with +the average. + +{{< tabs "404eb3d8-042c-4aef-8175-0ebc8c34cf01" >}} +{{< tab "Java" >}} +```java +DataStream> input = ...; + +input + .keyBy() + .window() + .aggregate(new AverageAggregate(), new MyProcessWindowFunction()); + +// Function definitions + +/** + * The accumulator is used to keep a running sum and a count. The {@code getResult} method + * computes the average. 
+ */ +private static class AverageAggregate + implements AggregateFunction, Tuple2, Double> { + @Override + public Tuple2 createAccumulator() { + return new Tuple2<>(0L, 0L); + } + + @Override + public Tuple2 add(Tuple2 value, Tuple2 accumulator) { + return new Tuple2<>(accumulator.f0 + value.f1, accumulator.f1 + 1L); + } + + @Override + public Double getResult(Tuple2 accumulator) { + return ((double) accumulator.f0) / accumulator.f1; + } + + @Override + public Tuple2 merge(Tuple2 a, Tuple2 b) { + return new Tuple2<>(a.f0 + b.f0, a.f1 + b.f1); + } +} + +private static class MyProcessWindowFunction + extends ProcessWindowFunction, String, TimeWindow> { + + public void process(String key, + Context context, + Iterable averages, + Collector> out) { + Double average = averages.iterator().next(); + out.collect(new Tuple2<>(key, average)); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +val input: DataStream[(String, Long)] = ... + +input + .keyBy() + .window() + .aggregate(new AverageAggregate(), new MyProcessWindowFunction()) + +// Function definitions + +/** + * The accumulator is used to keep a running sum and a count. The [getResult] method + * computes the average. + */ +class AverageAggregate extends AggregateFunction[(String, Long), (Long, Long), Double] { + override def createAccumulator() = (0L, 0L) + + override def add(value: (String, Long), accumulator: (Long, Long)) = + (accumulator._1 + value._2, accumulator._2 + 1L) + + override def getResult(accumulator: (Long, Long)) = accumulator._1 / accumulator._2 + + override def merge(a: (Long, Long), b: (Long, Long)) = + (a._1 + b._1, a._2 + b._2) +} + +class MyProcessWindowFunction extends ProcessWindowFunction[Double, (String, Double), String, TimeWindow] { + + def process(key: String, context: Context, averages: Iterable[Double], out: Collector[(String, Double)]) = { + val average = averages.iterator.next() + out.collect((key, average)) + } +} + +``` +{{< /tab >}} +{{< /tabs >}} + +### Using per-window state in ProcessWindowFunction + +In addition to accessing keyed state (as any rich function can) a `ProcessWindowFunction` can +also use keyed state that is scoped to the window that the function is currently processing. In this +context it is important to understand what the window that *per-window* state is referring to is. +There are different "windows" involved: + + - The window that was defined when specifying the windowed operation: This might be *tumbling + windows of 1 hour* or *sliding windows of 2 hours that slide by 1 hour*. + - An actual instance of a defined window for a given key: This might be *time window from 12:00 + to 13:00 for user-id xyz*. This is based on the window definition and there will be many windows + based on the number of keys that the job is currently processing and based on what time slots + the events fall into. + +Per-window state is tied to the latter of those two. Meaning that if we process events for 1000 +different keys and events for all of them currently fall into the *[12:00, 13:00)* time window +then there will be 1000 window instances that each have their own keyed per-window state. 
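+
+The two state accessors introduced in the next paragraphs, `globalState()` and `windowState()`, are used as in the
+following minimal, hypothetical sketch (the class name, state name, and element type are made up): it counts how often
+a particular window instance has fired and cleans that per-window state up in `clear()`.
+
+```java
+import org.apache.flink.api.common.state.ValueState;
+import org.apache.flink.api.common.state.ValueStateDescriptor;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
+import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
+import org.apache.flink.util.Collector;
+
+// Hypothetical sketch: per-window state that counts how many times this
+// particular window instance has fired (e.g. due to late firings).
+public class FiringCounter
+        extends ProcessWindowFunction<Long, String, String, TimeWindow> {
+
+    private final ValueStateDescriptor<Integer> firingsDesc =
+            new ValueStateDescriptor<>("firings", Types.INT);
+
+    @Override
+    public void process(
+            String key,
+            Context context,
+            Iterable<Long> elements,
+            Collector<String> out) throws Exception {
+        // windowState() is scoped to this key *and* this window instance;
+        // globalState() would be scoped to the key only.
+        ValueState<Integer> firings = context.windowState().getState(firingsDesc);
+        int count = firings.value() == null ? 1 : firings.value() + 1;
+        firings.update(count);
+        out.collect("key=" + key + " window=" + context.window() + " firing #" + count);
+    }
+
+    @Override
+    public void clear(Context context) throws Exception {
+        // Clean up the per-window state when the window is purged.
+        context.windowState().getState(firingsDesc).clear();
+    }
+}
+```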
+ +There are two methods on the `Context` object that a `process()` invocation receives that allow +access to the two types of state: + + - `globalState()`, which allows access to keyed state that is not scoped to a window + - `windowState()`, which allows access to keyed state that is also scoped to the window + +This feature is helpful if you anticipate multiple firing for the same window, as can happen when +you have late firings for data that arrives late or when you have a custom trigger that does +speculative early firings. In such a case you would store information about previous firings or +the number of firings in per-window state. + +When using windowed state it is important to also clean up that state when a window is cleared. This +should happen in the `clear()` method. + +### WindowFunction (Legacy) + +In some places where a `ProcessWindowFunction` can be used you can also use a `WindowFunction`. This +is an older version of `ProcessWindowFunction` that provides less contextual information and does +not have some advances features, such as per-window keyed state. This interface will be deprecated +at some point. + +The signature of a `WindowFunction` looks as follows: + +{{< tabs "cfdb7c7a-7a5d-4ac3-b92d-b0f5aa32240f" >}} +{{< tab "Java" >}} +```java +public interface WindowFunction extends Function, Serializable { + + /** + * Evaluates the window and outputs none or several elements. + * + * @param key The key for which this window is evaluated. + * @param window The window that is being evaluated. + * @param input The elements in the window being evaluated. + * @param out A collector for emitting elements. + * + * @throws Exception The function may throw exceptions to fail the program and trigger recovery. + */ + void apply(KEY key, W window, Iterable input, Collector out) throws Exception; +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +trait WindowFunction[IN, OUT, KEY, W <: Window] extends Function with Serializable { + + /** + * Evaluates the window and outputs none or several elements. + * + * @param key The key for which this window is evaluated. + * @param window The window that is being evaluated. + * @param input The elements in the window being evaluated. + * @param out A collector for emitting elements. + * @throws Exception The function may throw exceptions to fail the program and trigger recovery. + */ + def apply(key: KEY, window: W, input: Iterable[IN], out: Collector[OUT]) +} +``` +{{< /tab >}} +{{< /tabs >}} + +It can be used like this: + +{{< tabs "4992209c-3237-42c2-82fd-894a30d2546a" >}} +{{< tab "Java" >}} +```java +DataStream> input = ...; + +input + .keyBy() + .window() + .apply(new MyWindowFunction()); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val input: DataStream[(String, Long)] = ... + +input + .keyBy() + .window() + .apply(new MyWindowFunction()) +``` +{{< /tab >}} +{{< /tabs >}} + +## Triggers + +A `Trigger` determines when a window (as formed by the *window assigner*) is ready to be +processed by the *window function*. Each `WindowAssigner` comes with a default `Trigger`. +If the default trigger does not fit your needs, you can specify a custom trigger using `trigger(...)`. + +The trigger interface has five methods that allow a `Trigger` to react to different events: + +* The `onElement()` method is called for each element that is added to a window. +* The `onEventTime()` method is called when a registered event-time timer fires. +* The `onProcessingTime()` method is called when a registered processing-time timer fires. 
+* The `onMerge()` method is relevant for stateful triggers and merges the states of two triggers when their corresponding windows merge, *e.g.* when using session windows. +* Finally the `clear()` method performs any action needed upon removal of the corresponding window. + +Two things to notice about the above methods are: + +1) The first three decide how to act on their invocation event by returning a `TriggerResult`. The action can be one of the following: + +* `CONTINUE`: do nothing, +* `FIRE`: trigger the computation, +* `PURGE`: clear the elements in the window, and +* `FIRE_AND_PURGE`: trigger the computation and clear the elements in the window afterwards. + +2) Any of these methods can be used to register processing- or event-time timers for future actions. + +### Fire and Purge + +Once a trigger determines that a window is ready for processing, it fires, *i.e.*, it returns `FIRE` or `FIRE_AND_PURGE`. This is the signal for the window operator +to emit the result of the current window. Given a window with a `ProcessWindowFunction` +all elements are passed to the `ProcessWindowFunction` (possibly after passing them to an evictor). +Windows with `ReduceFunction`, or `AggregateFunction` simply emit their eagerly aggregated result. + +When a trigger fires, it can either `FIRE` or `FIRE_AND_PURGE`. While `FIRE` keeps the contents of the window, `FIRE_AND_PURGE` removes its content. +By default, the pre-implemented triggers simply `FIRE` without purging the window state. + +{{< hint warning >}} +Purging will simply remove the contents of the window and will leave any potential meta-information about the window and any trigger state intact. +{{< /hint >}} + +### Default Triggers of WindowAssigners + +The default `Trigger` of a `WindowAssigner` is appropriate for many use cases. For example, all the event-time window assigners have an `EventTimeTrigger` as +default trigger. This trigger simply fires once the watermark passes the end of a window. + +The default trigger of the `GlobalWindow` is the `NeverTrigger` which does never fire. Consequently, you always have to define a custom trigger when using a `GlobalWindow`. + +{{< hint warning >}} +By specifying a trigger using `trigger()` you +are overwriting the default trigger of a `WindowAssigner`. For example, if you specify a +`CountTrigger` for `TumblingEventTimeWindows` you will no longer get window firings based on the +progress of time but only by count. Right now, you have to write your own custom trigger if +you want to react based on both time and count. +{{< /hint >}} + +### Built-in and Custom Triggers + +Flink comes with a few built-in triggers. + +* The (already mentioned) `EventTimeTrigger` fires based on the progress of event-time as measured by watermarks. +* The `ProcessingTimeTrigger` fires based on processing time. +* The `CountTrigger` fires once the number of elements in a window exceeds the given limit. +* The `PurgingTrigger` takes as argument another trigger and transforms it into a purging one. + +If you need to implement a custom trigger, you should check out the abstract +{{< gh_link file="/flink-streaming-java/src/main/java/org/apache/flink/streaming/api/windowing/triggers/Trigger.java" name="Trigger" >}} class. +Please note that the API is still evolving and might change in future versions of Flink. + +## Evictors + +Flink’s windowing model allows specifying an optional `Evictor` in addition to the `WindowAssigner` and the `Trigger`. 
+This can be done using the `evictor(...)` method (shown in the beginning of this document). The evictor has the ability +to remove elements from a window *after* the trigger fires and *before and/or after* the window function is applied. +To do so, the `Evictor` interface has two methods: + + /** + * Optionally evicts elements. Called before windowing function. + * + * @param elements The elements currently in the pane. + * @param size The current number of elements in the pane. + * @param window The {@link Window} + * @param evictorContext The context for the Evictor + */ + void evictBefore(Iterable> elements, int size, W window, EvictorContext evictorContext); + + /** + * Optionally evicts elements. Called after windowing function. + * + * @param elements The elements currently in the pane. + * @param size The current number of elements in the pane. + * @param window The {@link Window} + * @param evictorContext The context for the Evictor + */ + void evictAfter(Iterable> elements, int size, W window, EvictorContext evictorContext); + +The `evictBefore()` contains the eviction logic to be applied before the window function, while the `evictAfter()` +contains the one to be applied after the window function. Elements evicted before the application of the window +function will not be processed by it. + +Flink comes with three pre-implemented evictors. These are: + +* `CountEvictor`: keeps up to a user-specified number of elements from the window and discards the remaining ones from +the beginning of the window buffer. +* `DeltaEvictor`: takes a `DeltaFunction` and a `threshold`, computes the delta between the last element in the +window buffer and each of the remaining ones, and removes the ones with a delta greater or equal to the threshold. +* `TimeEvictor`: takes as argument an `interval` in milliseconds and for a given window, it finds the maximum +timestamp `max_ts` among its elements and removes all the elements with timestamps smaller than `max_ts - interval`. + +By default, all the pre-implemented evictors apply their logic before the window function. + +{{< hint danger >}} +Specifying an evictor prevents any pre-aggregation, as all the +elements of a window have to be passed to the evictor before applying the computation. +This means windows with evictors will create significantly more state. +{{< /hint >}} + +Flink provides no guarantees about the order of the elements within +a window. This implies that although an evictor may remove elements from the beginning of the window, these are not +necessarily the ones that arrive first or last. + + +## Allowed Lateness + +When working with *event-time* windowing, it can happen that elements arrive late, *i.e.* the watermark that Flink uses to +keep track of the progress of event-time is already past the end timestamp of a window to which an element belongs. See +[event time]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}) and especially [late elements]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}#late-elements) for a more thorough +discussion of how Flink deals with event time. + +By default, late elements are dropped when the watermark is past the end of the window. However, +Flink allows to specify a maximum *allowed lateness* for window operators. Allowed lateness +specifies by how much time elements can be late before they are dropped, and its default value is 0. 
+Elements that arrive after the watermark has passed the end of the window but before it passes the end of +the window plus the allowed lateness, are still added to the window. Depending on the trigger used, +a late but not dropped element may cause the window to fire again. This is the case for the `EventTimeTrigger`. + +In order to make this work, Flink keeps the state of windows until their allowed lateness expires. Once this happens, Flink removes the window and deletes its state, as +also described in the [Window Lifecycle](#window-lifecycle) section. + +By default, the allowed lateness is set to `0`. That is, elements that arrive behind the watermark will be dropped. + +You can specify an allowed lateness like this: + +{{< tabs "7adb4f13-71c9-46df-96ef-9454d1dfa4ea" >}} +{{< tab "Java" >}} +```java +DataStream input = ...; + +input + .keyBy() + .window() + .allowedLateness(

    +```text +== Abstract Syntax Tree == +LogicalUnion(all=[true]) + LogicalFilter(condition=[LIKE($1, _UTF-16LE'F%')]) + FlinkLogicalDataStreamScan(id=[1], fields=[count, word]) + FlinkLogicalDataStreamScan(id=[2], fields=[count, word]) + +== Optimized Logical Plan == +DataStreamUnion(all=[true], union all=[count, word]) + DataStreamCalc(select=[count, word], where=[LIKE(word, _UTF-16LE'F%')]) + DataStreamScan(id=[1], fields=[count, word]) + DataStreamScan(id=[2], fields=[count, word]) + +== Physical Execution Plan == +Stage 1 : Data Source + content : collect elements with CollectionInputFormat + +Stage 2 : Data Source + content : collect elements with CollectionInputFormat + + Stage 3 : Operator + content : from: (count, word) + ship_strategy : REBALANCE + + Stage 4 : Operator + content : where: (LIKE(word, _UTF-16LE'F%')), select: (count, word) + ship_strategy : FORWARD + + Stage 5 : Operator + content : from: (count, word) + ship_strategy : REBALANCE +``` +
    + +以下代码展示了一个示例以及使用 `StatementSet.explain()` 的多 sink 计划的相应输出: + +{{< tabs "77dd0499-7cec-4283-8353-0aadf5a3ab44" >}} +{{< tab "Java" >}} +```java + +EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build(); +TableEnvironment tEnv = TableEnvironment.create(settings); + +final Schema schema = new Schema() + .field("count", DataTypes.INT()) + .field("word", DataTypes.STRING()); + +tEnv.connect(new FileSystem().path("/source/path1")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySource1"); +tEnv.connect(new FileSystem().path("/source/path2")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySource2"); +tEnv.connect(new FileSystem().path("/sink/path1")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySink1"); +tEnv.connect(new FileSystem().path("/sink/path2")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySink2"); + +StatementSet stmtSet = tEnv.createStatementSet(); + +Table table1 = tEnv.from("MySource1").where($("word").like("F%")); +stmtSet.addInsert("MySink1", table1); + +Table table2 = table1.unionAll(tEnv.from("MySource2")); +stmtSet.addInsert("MySink2", table2); + +String explanation = stmtSet.explain(); +System.out.println(explanation); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance.useBlinkPlanner.inStreamingMode.build +val tEnv = TableEnvironment.create(settings) + +val schema = new Schema() + .field("count", DataTypes.INT()) + .field("word", DataTypes.STRING()) + +tEnv.connect(new FileSystem().path("/source/path1")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySource1") +tEnv.connect(new FileSystem().path("/source/path2")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySource2") +tEnv.connect(new FileSystem().path("/sink/path1")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySink1") +tEnv.connect(new FileSystem().path("/sink/path2")) + .withFormat(new Csv().deriveSchema()) + .withSchema(schema) + .createTemporaryTable("MySink2") + +val stmtSet = tEnv.createStatementSet() + +val table1 = tEnv.from("MySource1").where($"word".like("F%")) +stmtSet.addInsert("MySink1", table1) + +val table2 = table1.unionAll(tEnv.from("MySource2")) +stmtSet.addInsert("MySink2", table2) + +val explanation = stmtSet.explain() +println(explanation) + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance().use_blink_planner().in_streaming_mode().build() +t_env = TableEnvironment.create(environment_settings=settings) + +schema = Schema() + .field("count", DataTypes.INT()) + .field("word", DataTypes.STRING()) + +t_env.connect(FileSystem().path("/source/path1"))) + .with_format(Csv().deriveSchema()) + .with_schema(schema) + .create_temporary_table("MySource1") +t_env.connect(FileSystem().path("/source/path2"))) + .with_format(Csv().deriveSchema()) + .with_schema(schema) + .create_temporary_table("MySource2") +t_env.connect(FileSystem().path("/sink/path1"))) + .with_format(Csv().deriveSchema()) + .with_schema(schema) + .create_temporary_table("MySink1") +t_env.connect(FileSystem().path("/sink/path2"))) + .with_format(Csv().deriveSchema()) + .with_schema(schema) + .create_temporary_table("MySink2") + +stmt_set = t_env.create_statement_set() + +table1 = 
t_env.from_path("MySource1").where(col('word').like('F%')) +stmt_set.add_insert("MySink1", table1) + +table2 = table1.union_all(t_env.from_path("MySource2")) +stmt_set.add_insert("MySink2", table2) + +explanation = stmt_set.explain() +print(explanation) + +``` +{{< /tab >}} +{{< /tabs >}} + +多 sink 计划的结果是: +
    +```text + +== Abstract Syntax Tree == +LogicalLegacySink(name=[MySink1], fields=[count, word]) ++- LogicalFilter(condition=[LIKE($1, _UTF-16LE'F%')]) + +- LogicalTableScan(table=[[default_catalog, default_database, MySource1, source: [CsvTableSource(read fields: count, word)]]]) + +LogicalLegacySink(name=[MySink2], fields=[count, word]) ++- LogicalUnion(all=[true]) + :- LogicalFilter(condition=[LIKE($1, _UTF-16LE'F%')]) + : +- LogicalTableScan(table=[[default_catalog, default_database, MySource1, source: [CsvTableSource(read fields: count, word)]]]) + +- LogicalTableScan(table=[[default_catalog, default_database, MySource2, source: [CsvTableSource(read fields: count, word)]]]) + +== Optimized Logical Plan == +Calc(select=[count, word], where=[LIKE(word, _UTF-16LE'F%')], reuse_id=[1]) ++- TableSourceScan(table=[[default_catalog, default_database, MySource1, source: [CsvTableSource(read fields: count, word)]]], fields=[count, word]) + +LegacySink(name=[MySink1], fields=[count, word]) ++- Reused(reference_id=[1]) + +LegacySink(name=[MySink2], fields=[count, word]) ++- Union(all=[true], union=[count, word]) + :- Reused(reference_id=[1]) + +- TableSourceScan(table=[[default_catalog, default_database, MySource2, source: [CsvTableSource(read fields: count, word)]]], fields=[count, word]) + +== Physical Execution Plan == +Stage 1 : Data Source + content : collect elements with CollectionInputFormat + + Stage 2 : Operator + content : CsvTableSource(read fields: count, word) + ship_strategy : REBALANCE + + Stage 3 : Operator + content : SourceConversion(table:Buffer(default_catalog, default_database, MySource1, source: [CsvTableSource(read fields: count, word)]), fields:(count, word)) + ship_strategy : FORWARD + + Stage 4 : Operator + content : Calc(where: (word LIKE _UTF-16LE'F%'), select: (count, word)) + ship_strategy : FORWARD + + Stage 5 : Operator + content : SinkConversionToRow + ship_strategy : FORWARD + + Stage 6 : Operator + content : Map + ship_strategy : FORWARD + +Stage 8 : Data Source + content : collect elements with CollectionInputFormat + + Stage 9 : Operator + content : CsvTableSource(read fields: count, word) + ship_strategy : REBALANCE + + Stage 10 : Operator + content : SourceConversion(table:Buffer(default_catalog, default_database, MySource2, source: [CsvTableSource(read fields: count, word)]), fields:(count, word)) + ship_strategy : FORWARD + + Stage 12 : Operator + content : SinkConversionToRow + ship_strategy : FORWARD + + Stage 13 : Operator + content : Map + ship_strategy : FORWARD + + Stage 7 : Data Sink + content : Sink: CsvTableSink(count, word) + ship_strategy : FORWARD + + Stage 14 : Data Sink + content : Sink: CsvTableSink(count, word) + ship_strategy : FORWARD + +``` +
    + +{{< top >}} + + diff --git a/docs/content.zh/docs/dev/table/concepts/_index.md b/docs/content.zh/docs/dev/table/concepts/_index.md new file mode 100644 index 0000000000000..0d50f5b5d2423 --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/_index.md @@ -0,0 +1,23 @@ +--- +title: 流式概念 +bookCollapseSection: true +weight: 4 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/table/concepts/dynamic_tables.md b/docs/content.zh/docs/dev/table/concepts/dynamic_tables.md new file mode 100644 index 0000000000000..428e876ade0ff --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/dynamic_tables.md @@ -0,0 +1,184 @@ +--- +title: "动态表 (Dynamic Table)" +weight: 2 +type: docs +aliases: + - /zh/dev/table/streaming/dynamic_tables.html +--- + + +# 动态表 (Dynamic Table) + +SQL 和关系代数在设计时并未考虑流数据。因此,在关系代数(和 SQL)之间几乎没有概念上的差异。 + +本文会讨论这种差异,并介绍 Flink 如何在无界数据集上实现与数据库引擎在有界数据上的处理具有相同的语义。 + + + +DataStream 上的关系查询 +---------------------------------- + +下表比较了传统的关系代数和流处理与输入数据、执行和输出结果的关系。 + + + + + + + + + + + + + + + + + + +
+| 关系代数 / SQL | 流处理 |
+| -------------- | ------ |
+| 关系(或表)是有界(多)元组集合。 | 流是一个无限元组序列。 |
+| 对批数据(例如关系数据库中的表)执行的查询可以访问完整的输入数据。 | 流式查询在启动时不能访问所有数据,必须“等待”数据流入。 |
+| 批处理查询在产生固定大小的结果后终止。 | 流查询不断地根据接收到的记录更新其结果,并且始终不会结束。 |
    + +尽管存在这些差异,但是使用关系查询和 SQL 处理流并不是不可能的。高级关系数据库系统提供了一个称为 *物化视图(Materialized Views)* 的特性。物化视图被定义为一条 SQL 查询,就像常规的虚拟视图一样。与虚拟视图相反,物化视图缓存查询的结果,因此在访问视图时不需要对查询进行计算。缓存的一个常见难题是防止缓存为过期的结果提供服务。当其定义查询的基表被修改时,物化视图将过期。 *即时视图维护(Eager View Maintenance)* 是一种一旦更新了物化视图的基表就立即更新视图的技术。 + +如果我们考虑以下问题,那么即时视图维护和流上的SQL查询之间的联系就会变得显而易见: + +- 数据库表是 `INSERT`、`UPDATE` 和 `DELETE` DML 语句的 *stream* 的结果,通常称为 *changelog stream* 。 +- 物化视图被定义为一条 SQL 查询。为了更新视图,查询不断地处理视图的基本关系的changelog 流。 +- 物化视图是流式 SQL 查询的结果。 + +了解了这些要点之后,我们将在下一节中介绍 *动态表(Dynamic tables)* 的概念。 + +动态表 & 连续查询(Continuous Query) +--------------------------------------- + +*动态表* 是 Flink 的支持流数据的 Table API 和 SQL 的核心概念。与表示批处理数据的静态表不同,动态表是随时间变化的。可以像查询静态批处理表一样查询它们。查询动态表将生成一个 *连续查询* 。一个连续查询永远不会终止,结果会生成一个动态表。查询不断更新其(动态)结果表,以反映其(动态)输入表上的更改。本质上,动态表上的连续查询非常类似于定义物化视图的查询。 + +需要注意的是,连续查询的结果在语义上总是等价于以批处理模式在输入表快照上执行的相同查询的结果。 + +下图显示了流、动态表和连续查询之间的关系: + +{{< img alt="Dynamic tables" src="/fig/table-streaming/stream-query-stream.png" width="80%">}} + +1. 将流转换为动态表。 +2. 在动态表上计算一个连续查询,生成一个新的动态表。 +3. 生成的动态表被转换回流。 + +**注意:** 动态表首先是一个逻辑概念。在查询执行期间不一定(完全)物化动态表。 + +在下面,我们将解释动态表和连续查询的概念,并使用具有以下模式的单击事件流: + +```plain +[ + user: VARCHAR, // 用户名 + cTime: TIMESTAMP, // 访问 URL 的时间 + url: VARCHAR // 用户访问的 URL +] +``` + +在流上定义表 +---------------------------- + +为了使用关系查询处理流,必须将其转换成 `Table`。从概念上讲,流的每条记录都被解释为对结果表的 `INSERT` 操作。本质上我们正在从一个 `INSERT`-only 的 changelog 流构建表。 + +下图显示了单击事件流(左侧)如何转换为表(右侧)。当插入更多的单击流记录时,结果表将不断增长。 + +{{< img alt="Append mode" src="/fig/table-streaming/append-mode.png" width="60%">}} + +**注意:** 在流上定义的表在内部没有物化。 + +### 连续查询 +---------------------- + +在动态表上计算一个连续查询,并生成一个新的动态表。与批处理查询不同,连续查询从不终止,并根据其输入表上的更新更新其结果表。在任何时候,连续查询的结果在语义上与以批处理模式在输入表快照上执行的相同查询的结果相同。 + +在接下来的代码中,我们将展示 `clicks` 表上的两个示例查询,这个表是在点击事件流上定义的。 + +第一个查询是一个简单的 `GROUP-BY COUNT` 聚合查询。它基于 `user` 字段对 `clicks` 表进行分组,并统计访问的 URL 的数量。下面的图显示了当 `clicks` 表被附加的行更新时,查询是如何被评估的。 + +{{< img alt="Continuous Non-Windowed Query" src="/fig/table-streaming/query-groupBy-cnt.png" width="90%">}} + +当查询开始,`clicks` 表(左侧)是空的。当第一行数据被插入到 `clicks` 表时,查询开始计算结果表。第一行数据 `[Mary,./home]` 插入后,结果表(右侧,上部)由一行 `[Mary, 1]` 组成。当第二行 `[Bob, ./cart]` 插入到 `clicks` 表时,查询会更新结果表并插入了一行新数据 `[Bob, 1]`。第三行 `[Mary, ./prod?id=1]` 将产生已计算的结果行的更新,`[Mary, 1]` 更新成 `[Mary, 2]`。最后,当第四行数据加入 `clicks` 表时,查询将第三行 `[Liz, 1]` 插入到结果表中。 + +第二条查询与第一条类似,但是除了用户属性之外,还将 `clicks` 分组至[每小时滚动窗口]({{< ref "docs/dev/table/sql/overview" >}}#group-windows)中,然后计算 url 数量(基于时间的计算,例如基于特定[时间属性](time_attributes.html)的窗口,后面会讨论)。同样,该图显示了不同时间点的输入和输出,以可视化动态表的变化特性。 + +{{< img alt="Continuous Group-Window Query" src="/fig/table-streaming/query-groupBy-window-cnt.png" width="100%">}} + +与前面一样,左边显示了输入表 `clicks`。查询每小时持续计算结果并更新结果表。clicks表包含四行带有时间戳(`cTime`)的数据,时间戳在 `12:00:00` 和 `12:59:59` 之间。查询从这个输入计算出两个结果行(每个 `user` 一个),并将它们附加到结果表中。对于 `13:00:00` 和 `13:59:59` 之间的下一个窗口,`clicks` 表包含三行,这将导致另外两行被追加到结果表。随着时间的推移,更多的行被添加到 `click` 中,结果表将被更新。 + + + +### 更新和追加查询 + +虽然这两个示例查询看起来非常相似(都计算分组计数聚合),但它们在一个重要方面不同: +- 第一个查询更新先前输出的结果,即定义结果表的 changelog 流包含 `INSERT` 和 `UPDATE` 操作。 +- 第二个查询只附加到结果表,即结果表的 changelog 流只包含 `INSERT` 操作。 + +一个查询是产生一个只追加的表还是一个更新的表有一些含义: +- 产生更新更改的查询通常必须维护更多的状态(请参阅以下部分)。 +- 将 append-only 的表转换为流与将已更新的表转换为流是不同的(参阅[表到流的转换](#table-to-stream-conversion)章节)。 + +### 查询限制 + +许多(但不是全部)语义上有效的查询可以作为流上的连续查询进行评估。有些查询代价太高而无法计算,这可能是由于它们需要维护的状态大小,也可能是由于计算更新代价太高。 + +- **状态大小:** 连续查询在无界流上计算,通常应该运行数周或数月。因此,连续查询处理的数据总量可能非常大。必须更新先前输出的结果的查询需要维护所有输出的行,以便能够更新它们。例如,第一个查询示例需要存储每个用户的 URL 计数,以便能够增加该计数并在输入表接收新行时发送新结果。如果只跟踪注册用户,则要维护的计数数量可能不会太高。但是,如果未注册的用户分配了一个惟一的用户名,那么要维护的计数数量将随着时间增长,并可能最终导致查询失败。 + +```sql +SELECT user, COUNT(url) +FROM clicks +GROUP BY 
user; +``` + +- **计算更新:** 有些查询需要重新计算和更新大量已输出的结果行,即使只添加或更新一条输入记录。显然,这样的查询不适合作为连续查询执行。下面的查询就是一个例子,它根据最后一次单击的时间为每个用户计算一个 `RANK`。一旦 `click` 表接收到一个新行,用户的 `lastAction` 就会更新,并必须计算一个新的排名。然而,由于两行不能具有相同的排名,所以所有较低排名的行也需要更新。 + +```sql +SELECT user, RANK() OVER (ORDER BY lastAction) +FROM ( + SELECT user, MAX(cTime) AS lastAction FROM clicks GROUP BY user +); +``` + +[查询配置](query_configuration.html)章节讨论了控制连续查询执行的参数。一些参数可以用来在维持状态的大小和获得结果的准确性之间做取舍。 + + + +表到流的转换 +-------------------------- + +动态表可以像普通数据库表一样通过 `INSERT`、`UPDATE` 和 `DELETE` 来不断修改。它可能是一个只有一行、不断更新的表,也可能是一个 insert-only 的表,没有 `UPDATE` 和 `DELETE` 修改,或者介于两者之间的其他表。 + +在将动态表转换为流或将其写入外部系统时,需要对这些更改进行编码。Flink的 Table API 和 SQL 支持三种方式来编码一个动态表的变化: + +* **Append-only 流:** 仅通过 `INSERT` 操作修改的动态表可以通过输出插入的行转换为流。 + +* **Retract 流:** retract 流包含两种类型的 message: *add messages* 和 *retract messages* 。通过将`INSERT` 操作编码为 add message、将 `DELETE` 操作编码为 retract message、将 `UPDATE` 操作编码为更新(先前)行的 retract message 和更新(新)行的 add message,将动态表转换为 retract 流。下图显示了将动态表转换为 retract 流的过程。 + +{{< img alt="Dynamic tables" src="/fig/table-streaming/undo-redo-mode.png" width="85%" >}} + + +* **Upsert 流:** upsert 流包含两种类型的 message: *upsert messages* 和*delete messages*。转换为 upsert 流的动态表需要(可能是组合的)唯一键。通过将 `INSERT` 和 `UPDATE` 操作编码为 upsert message,将 `DELETE` 操作编码为 delete message ,将具有唯一键的动态表转换为流。消费流的算子需要知道唯一键的属性,以便正确地应用 message。与 retract 流的主要区别在于 `UPDATE` 操作是用单个 message 编码的,因此效率更高。下图显示了将动态表转换为 upsert 流的过程。 + +{{< img alt="Dynamic tables" src="/fig/table-streaming/redo-mode.png" width="85%" >}} + + +在[通用概念]({{< ref "docs/dev/table/common" >}}#convert-a-table-into-a-datastream)中讨论了将动态表转换为 `DataStream` 的 API。请注意,在将动态表转换为 `DataStream` 时,只支持 append 流和 retract 流。在 [TableSources 和 TableSinks](../sourceSinks.html#define-a-tablesink) 章节讨论向外部系统输出动态表的 `TableSink` 接口。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/concepts/legacy.md b/docs/content.zh/docs/dev/table/concepts/legacy.md new file mode 100644 index 0000000000000..93e778db9d714 --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/legacy.md @@ -0,0 +1,136 @@ +--- +title: "Legacy Features" +weight: 1002 +type: docs +aliases: + - /zh/dev/table/streaming/legacy.html +--- + + +# Legacy Features + +As Flink SQL has matured there are some features that have been replaced with more modern and better functioning substitutes. +These legacy features remain documented here for those users that have not yet or are unable to, upgrade to the more modern variant. + +# Temporal Table Function + +The temporal table function is the legacy way of defining something akin to a [versioned table]({{< ref "docs/dev/table/concepts/versioned_tables" >}}) +that can be used in a temporal table join. +Please define temporal joins using [versioned tables]({{< ref "docs/dev/table/concepts/versioned_tables" >}}) in new queries. + +Unlike a versioned table, temporal table functions can only be defined on top of append-only streams +— it does not support changelog inputs. +Additionally, a temporal table function cannot be defined in pure SQL DDL. + +#### Defining a Temporal Table Function + +Temporal table functions can be defined on top of append-only streams using the [Table API]({{< ref "docs/dev/table/tableApi" >}}). +The table is registered with one or more key columns, and a time attribute used for versioning. + +Suppose we have an append-only table of currency rates that we would like to +register as a temporal table function. 
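+
+For illustration only, such an append-only source table could be declared with a DDL along the
+lines of the `RatesHistory` example in the versioned tables documentation. The connector and format
+options below are assumptions rather than part of the original example; it is only the temporal
+table function itself that cannot be created in DDL, the underlying table can.
+
+```sql
+-- sketch of an append-only rates table; the connector options are illustrative assumptions
+CREATE TABLE currency_rates (
+    update_time TIMESTAMP(3),
+    currency    STRING,
+    rate        DECIMAL(38, 10),
+    WATERMARK FOR update_time AS update_time
+) WITH (
+    'connector' = 'kafka',
+    'topic' = 'rates',
+    'properties.bootstrap.servers' = 'localhost:9092',
+    'scan.startup.mode' = 'earliest-offset',
+    'format' = 'json'
+);
+```
+
+Its contents might look as follows: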
+
+```sql
+SELECT * FROM currency_rates;
+
+update_time   currency  rate
+============= ========= ====
+09:00:00      Yen       102
+09:00:00      Euro      114
+09:00:00      USD       1
+11:15:00      Euro      119
+11:49:00      Pounds    108
+```
+
+Using the Table API, we can register this stream using `currency` for the key and `update_time` as
+the versioning time attribute.
+
+{{< tabs "066b6695-5bc3-4d7a-9033-ff6b1d14b3a1" >}}
+{{< tab "Java" >}}
+```java
+TemporalTableFunction rates = tEnv
+    .from("currency_rates")
+    .createTemporalTableFunction("update_time", "currency");
+
+tEnv.registerFunction("rates", rates);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val rates = tEnv
+    .from("currency_rates")
+    .createTemporalTableFunction("update_time", "currency")
+
+tEnv.registerFunction("rates", rates)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+#### Temporal Table Function Join
+
+Once defined, a temporal table function is used as a standard [table function]({{< ref "docs/dev/table/functions/udfs" >}}#table-functions).
+Append-only tables (left input/probe side) can join with a temporal table (right input/build side),
+i.e., a table that changes over time and tracks its changes, to retrieve the value for a key as it was at a particular point in time.
+
+Consider an append-only table `orders` that tracks customers' orders in different currencies.
+
+```sql
+SELECT * FROM orders;
+
+order_time amount currency
+========== ====== =========
+10:15      2      Euro
+10:30      1      USD
+10:32      50     Yen
+10:52      3      Euro
+11:04      5      USD
+```
+
+Given these tables, we would like to convert orders to a common currency: USD.
+
+{{< tabs "7ec4efc6-41ae-42c1-a261-4a94dd3b44e0" >}}
+{{< tab "SQL" >}}
+```sql
+SELECT
+  SUM(amount * rate) AS amount
+FROM
+  orders,
+  LATERAL TABLE (rates(order_time))
+WHERE
+  rates.currency = orders.currency
+```
+{{< /tab >}}
+{{< tab "Java" >}}
+```java
+Table result = orders
+    .joinLateral($("rates(order_time)"), $("orders.currency = rates.currency"))
+    .select($("(amount * rate).sum as amount"));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val result = orders
+    .joinLateral($"rates(order_time)", $"orders.currency = rates.currency")
+    .select($"(amount * rate).sum as amount")
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/dev/table/concepts/overview.md b/docs/content.zh/docs/dev/table/concepts/overview.md
new file mode 100644
index 0000000000000..7f27c67174296
--- /dev/null
+++ b/docs/content.zh/docs/dev/table/concepts/overview.md
@@ -0,0 +1,46 @@
+---
+title: "流式概念"
+weight: 1
+type: docs
+aliases:
+  - /zh/dev/table/streaming/
+is_beta: false
+---
+
+
+# 流式概念
+
+Flink 的 [Table API]({{< ref "docs/dev/table/tableApi" >}}) 和 [SQL]({{< ref "docs/dev/table/sql/overview" >}}) 是流批统一的 API。
+这意味着 Table API & SQL 无论在有限的批式输入还是无限的流式输入下,都具有相同的语义。
+因为传统的关系代数以及 SQL 最开始都是为了批式处理而设计的,
+关系型查询在流式场景下不如在批式场景下容易懂。
+
+下面这些页面包含了概念、实际的限制,以及流式数据处理中的一些特定的配置。
+
+接下来?
+----------------- + +* [动态表]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}): 描述了动态表的概念。 +* [时间属性]({{< ref "docs/dev/table/concepts/time_attributes" >}}): 解释了时间属性以及它是如何在 Table API & SQL 中使用的。 +* [流上的 Join]({{< ref "docs/dev/table/sql/queries/joins" >}}): 支持的几种流上的 Join。 +* [时态(temporal)表]({{< ref "docs/dev/table/concepts/versioned_tables" >}}): 描述了时态表的概念。 +* [查询配置]({{< ref "docs/dev/table/config" >}}): Table API & SQL 特定的配置。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/concepts/time_attributes.md b/docs/content.zh/docs/dev/table/concepts/time_attributes.md new file mode 100644 index 0000000000000..aabd41f1ffc30 --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/time_attributes.md @@ -0,0 +1,439 @@ +--- +title: "时间属性" +weight: 3 +type: docs +aliases: + - /zh/dev/table/streaming/time_attributes.html +--- + + +# 时间属性 + +Flink 可以基于几种不同的 *时间* 概念来处理数据。 + +- *处理时间* 指的是执行具体操作时的机器时间(大家熟知的绝对时间, 例如 Java的 `System.currentTimeMillis()`) ) +- *事件时间* 指的是数据本身携带的时间。这个时间是在事件产生时的时间。 +- *摄入时间* 指的是数据进入 Flink 的时间;在系统内部,会把它当做事件时间来处理。 + +对于时间相关的更多信息,可以参考 [事件时间和Watermark]({{< ref "docs/concepts/time" >}})。 + +本页面说明了如何在 Flink Table API & SQL 里面定义时间以及相关的操作。 + +时间属性介绍 +------------------------------- + +像窗口(在 [Table API]({{< ref "docs/dev/table/tableApi" >}}#group-windows) 和 [SQL]({{< ref "docs/dev/table/sql/queries" >}}#group-windows) )这种基于时间的操作,需要有时间信息。因此,Table API 中的表就需要提供*逻辑时间属性*来表示时间,以及支持时间相关的操作。 + +每种类型的表都可以有时间属性,可以在用CREATE TABLE DDL创建表的时候指定、也可以在 `DataStream` 中指定、也可以在定义 `TableSource` 时指定。一旦时间属性定义好,它就可以像普通列一样使用,也可以在时间相关的操作中使用。 + +只要时间属性没有被修改,而是简单地从一个表传递到另一个表,它就仍然是一个有效的时间属性。时间属性可以像普通的时间戳的列一样被使用和计算。一旦时间属性被用在了计算中,它就会被物化,进而变成一个普通的时间戳。普通的时间戳是无法跟 Flink 的时间以及watermark等一起使用的,所以普通的时间戳就无法用在时间相关的操作中。 + +Table API 程序需要在 streaming environment 中指定时间属性: + +{{< tabs "bc88319c-8c8f-4f9d-8972-a2d6efe478d7" >}} +{{< tab "Java" >}} +```java +final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime); // default + +// 或者: +// env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime); +// env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment + +env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime) // default + +// 或者: +// env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime) +// env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +env = StreamExecutionEnvironment.get_execution_environment() + +env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) # default + +# 或者: +# env.set_stream_time_characteristic(TimeCharacteristic.IngestionTime) +# env.set_stream_time_characteristic(TimeCharacteristic.EventTime) +``` +{{< /tab >}} +{{< /tabs >}} + + + +处理时间 +--------------- + +处理时间是基于机器的本地时间来处理数据,它是最简单的一种时间概念,但是它不能提供确定性。它既不需要从数据里获取时间,也不需要生成 watermark。 + +共有三种方法可以定义处理时间。 + +### 在创建表的 DDL 中定义 + +处理时间属性可以在创建表的 DDL 中用计算列的方式定义,用 `PROCTIME()` 就可以定义处理时间,函数 `PROCTIME()` 的返回类型是 TIMESTAMP_LTZ 。关于计算列,更多信息可以参考:[CREATE TABLE DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table) + +```sql + +CREATE TABLE user_actions ( + user_name STRING, + data STRING, + user_action_time AS PROCTIME() -- 声明一个额外的列作为处理时间属性 +) WITH ( + ... 
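+    -- (示意)这里省略了具体的 connector 配置;可以参考本文档时区一节示例中的 'connector' = 'socket'、'format' = 'csv' 等写法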
+); + +SELECT TUMBLE_START(user_action_time, INTERVAL '10' MINUTE), COUNT(DISTINCT user_name) +FROM user_actions +GROUP BY TUMBLE(user_action_time, INTERVAL '10' MINUTE); + +``` + + +### 在 DataStream 到 Table 转换时定义 + +处理时间属性可以在 schema 定义的时候用 `.proctime` 后缀来定义。时间属性一定不能定义在一个已有字段上,所以它只能定义在 schema 定义的最后。 + +{{< tabs "ec1506ee-adf2-4668-a6f0-7944a5b24bc2" >}} +{{< tab "Java" >}} +```java +DataStream> stream = ...; + +// 声明一个额外的字段作为时间属性字段 +Table table = tEnv.fromDataStream(stream, $("user_name"), $("data"), $("user_action_time").proctime()); + +WindowedTable windowedTable = table.window( + Tumble.over(lit(10).minutes()) + .on($("user_action_time")) + .as("userActionWindow")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val stream: DataStream[(String, String)] = ... + +// 声明一个额外的字段作为时间属性字段 +val table = tEnv.fromDataStream(stream, $"UserActionTimestamp", $"user_name", $"data", $"user_action_time".proctime) + +val windowedTable = table.window(Tumble over 10.minutes on $"user_action_time" as "userActionWindow") +``` +{{< /tab >}} +{{< /tabs >}} + +### 使用 TableSource 定义 + +处理时间属性可以在实现了 `DefinedProctimeAttribute` 的 `TableSource` 中定义。逻辑的时间属性会放在 `TableSource` 已有物理字段的最后 + +{{< tabs "4b87d0d7-487e-4b11-a01a-4812f6b71e2d" >}} +{{< tab "Java" >}} +```java +// 定义一个由处理时间属性的 table source +public class UserActionSource implements StreamTableSource, DefinedProctimeAttribute { + + @Override + public TypeInformation getReturnType() { + String[] names = new String[] {"user_name" , "data"}; + TypeInformation[] types = new TypeInformation[] {Types.STRING(), Types.STRING()}; + return Types.ROW(names, types); + } + + @Override + public DataStream getDataStream(StreamExecutionEnvironment execEnv) { + // create stream + DataStream stream = ...; + return stream; + } + + @Override + public String getProctimeAttribute() { + // 这个名字的列会被追加到最后,作为第三列 + return "user_action_time"; + } +} + +// register table source +tEnv.registerTableSource("user_actions", new UserActionSource()); + +WindowedTable windowedTable = tEnv + .from("user_actions") + .window(Tumble + .over(lit(10).minutes()) + .on($("user_action_time")) + .as("userActionWindow")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// 定义一个由处理时间属性的 table source +class UserActionSource extends StreamTableSource[Row] with DefinedProctimeAttribute { + + override def getReturnType = { + val names = Array[String]("user_name" , "data") + val types = Array[TypeInformation[_]](Types.STRING, Types.STRING) + Types.ROW(names, types) + } + + override def getDataStream(execEnv: StreamExecutionEnvironment): DataStream[Row] = { + // create stream + val stream = ... 
+ stream + } + + override def getProctimeAttribute = { + // 这个名字的列会被追加到最后,作为第三列 + "user_action_time" + } +} + +// register table source +tEnv.registerTableSource("user_actions", new UserActionSource) + +val windowedTable = tEnv + .from("user_actions") + .window(Tumble over 10.minutes on $"user_action_time" as "userActionWindow") +``` +{{< /tab >}} +{{< /tabs >}} + +事件时间 +---------- + +事件时间允许程序按照数据中包含的时间来处理,这样可以在有乱序或者晚到的数据的情况下产生一致的处理结果。它可以保证从外部存储读取数据后产生可以复现(replayable)的结果。 + +除此之外,事件时间可以让程序在流式和批式作业中使用同样的语法。在流式程序中的事件时间属性,在批式程序中就是一个正常的时间字段。 + +为了能够处理乱序的事件,并且区分正常到达和晚到的事件,Flink 需要从事件中获取事件时间并且产生 watermark([watermarks]({{< ref "docs/concepts/time" >}}))。 + +事件时间属性也有类似于处理时间的三种定义方式:在DDL中定义、在 DataStream 到 Table 转换时定义、用 TableSource 定义。 + +### 在 DDL 中定义 + +事件时间属性可以用 WATERMARK 语句在 CREATE TABLE DDL 中进行定义。WATERMARK 语句在一个已有字段上定义一个 watermark 生成表达式,同时标记这个已有字段为时间属性字段。更多信息可以参考:[CREATE TABLE DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table) + +Flink supports defining event time attribute on TIMESTAMP column and TIMESTAMP_LTZ column. +If the data source contains timestamp literal, it's recommended to defining event time attribute on TIMESTAMP column: + +Flink 支持和在 TIMESTAMP 列和 TIMESTAMP_LTZ 列上定义事件时间。如果源数据中的时间戳数据表示为年-月-日-时-分-秒,则通常为不带时区信息的字符串值,例如 `2020-04-15 20:13:40.564`,建议将事件时间属性定义在 `TIMESTAMP` 列上: +```sql + +CREATE TABLE user_actions ( + user_name STRING, + data STRING, + user_action_time TIMESTAMP(3), + -- 声明 user_action_time 是事件时间属性,并且用 延迟 5 秒的策略来生成 watermark + WATERMARK FOR user_action_time AS user_action_time - INTERVAL '5' SECOND +) WITH ( + ... +); + +SELECT TUMBLE_START(user_action_time, INTERVAL '10' MINUTE), COUNT(DISTINCT user_name) +FROM user_actions +GROUP BY TUMBLE(user_action_time, INTERVAL '10' MINUTE); + +``` + +源数据中的时间戳数据表示为一个纪元 (epoch) 时间,通常是一个 long 值,例如 `1618989564564`,建议将事件时间属性定义在 `TIMESTAMP_LTZ` 列上: + ```sql + +CREATE TABLE user_actions ( + user_name STRING, + data STRING, + ts BIGINT, + time_ltz AS TO_TIMESTAMP_LTZ(time_ltz, 3), + -- declare time_ltz as event time attribute and use 5 seconds delayed watermark strategy + WATERMARK FOR time_ltz AS time_ltz - INTERVAL '5' SECOND +) WITH ( + ... 
+); + +SELECT TUMBLE_START(time_ltz, INTERVAL '10' MINUTE), COUNT(DISTINCT user_name) +FROM user_actions +GROUP BY TUMBLE(time_ltz, INTERVAL '10' MINUTE); + +``` + +### 在 DataStream 到 Table 转换时定义 + +事件时间属性可以用 `.rowtime` 后缀在定义 `DataStream` schema 的时候来定义。[时间戳和 watermark]({{< ref "docs/concepts/time" >}}) 在这之前一定是在 `DataStream` 上已经定义好了。 +在从 DataStream 转换到 Table 时,由于 `DataStream` 没有时区概念,因此 Flink 总是将 `rowtime` 属性解析成 `TIMESTAMP WITHOUT TIME ZONE` 类型,并且将所有事件时间的值都视为 UTC 时区的值。 + +在从 `DataStream` 到 `Table` 转换时定义事件时间属性有两种方式。取决于用 `.rowtime` 后缀修饰的字段名字是否是已有字段,事件时间字段可以是: + +- 在 schema 的结尾追加一个新的字段 +- 替换一个已经存在的字段。 + +不管在哪种情况下,事件时间字段都表示 `DataStream` 中定义的事件的时间戳。 + +{{< tabs "4ddfd970-e823-48c2-ae38-4e110ada0f9a" >}} +{{< tab "Java" >}} +```java + +// Option 1: + +// 基于 stream 中的事件产生时间戳和 watermark +DataStream> stream = inputStream.assignTimestampsAndWatermarks(...); + +// 声明一个额外的逻辑字段作为事件时间属性 +Table table = tEnv.fromDataStream(stream, $("user_name"), $("data"), $("user_action_time").rowtime()); + + +// Option 2: + +// 从第一个字段获取事件时间,并且产生 watermark +DataStream> stream = inputStream.assignTimestampsAndWatermarks(...); + +// 第一个字段已经用作事件时间抽取了,不用再用一个新字段来表示事件时间了 +Table table = tEnv.fromDataStream(stream, $("user_action_time").rowtime(), $("user_name"), $("data")); + +// Usage: + +WindowedTable windowedTable = table.window(Tumble + .over(lit(10).minutes()) + .on($("user_action_time")) + .as("userActionWindow")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +// Option 1: + +// 基于 stream 中的事件产生时间戳和 watermark +val stream: DataStream[(String, String)] = inputStream.assignTimestampsAndWatermarks(...) + +// 声明一个额外的逻辑字段作为事件时间属性 +val table = tEnv.fromDataStream(stream, $"user_name", $"data", $"user_action_time".rowtime) + + +// Option 2: + +// 从第一个字段获取事件时间,并且产生 watermark +val stream: DataStream[(Long, String, String)] = inputStream.assignTimestampsAndWatermarks(...) + +// 第一个字段已经用作事件时间抽取了,不用再用一个新字段来表示事件时间了 +val table = tEnv.fromDataStream(stream, $"user_action_time".rowtime, $"user_name", $"data") + +// Usage: + +val windowedTable = table.window(Tumble over 10.minutes on $"user_action_time" as "userActionWindow") +``` +{{< /tab >}} +{{< /tabs >}} + +### 使用 TableSource 定义 + +事件时间属性可以在实现了 `DefinedRowTimeAttributes` 的 `TableSource` 中定义。`getRowtimeAttributeDescriptors()` 方法返回 `RowtimeAttributeDescriptor` 的列表,包含了描述事件时间属性的字段名字、如何计算事件时间、以及 watermark 生成策略等信息。 + +同时需要确保 `getDataStream` 返回的 `DataStream` 已经定义好了时间属性。 +只有在定义了 `StreamRecordTimestamp` 时间戳分配器的时候,才认为 `DataStream` 是有时间戳信息的。 +只有定义了 `PreserveWatermarks` watermark 生成策略的 `DataStream` 的 watermark 才会被保留。反之,则只有时间字段的值是生效的。 + +{{< tabs "bfc26d42-c602-402e-98fc-fe6eb787d283" >}} +{{< tab "Java" >}} +```java +// 定义一个有事件时间属性的 table source +public class UserActionSource implements StreamTableSource, DefinedRowtimeAttributes { + + @Override + public TypeInformation getReturnType() { + String[] names = new String[] {"user_name", "data", "user_action_time"}; + TypeInformation[] types = + new TypeInformation[] {Types.STRING(), Types.STRING(), Types.LONG()}; + return Types.ROW(names, types); + } + + @Override + public DataStream getDataStream(StreamExecutionEnvironment execEnv) { + // 构造 DataStream + // ... 
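+        // (示意)inputStream 的构造在此被省略,例如可以通过 execEnv.addSource(...) 等方式从外部系统读取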
+ // 基于 "user_action_time" 定义 watermark + DataStream stream = inputStream.assignTimestampsAndWatermarks(...); + return stream; + } + + @Override + public List getRowtimeAttributeDescriptors() { + // 标记 "user_action_time" 字段是事件时间字段 + // 给 "user_action_time" 构造一个时间属性描述符 + RowtimeAttributeDescriptor rowtimeAttrDescr = new RowtimeAttributeDescriptor( + "user_action_time", + new ExistingField("user_action_time"), + new AscendingTimestamps()); + List listRowtimeAttrDescr = Collections.singletonList(rowtimeAttrDescr); + return listRowtimeAttrDescr; + } +} + +// register the table source +tEnv.registerTableSource("user_actions", new UserActionSource()); + +WindowedTable windowedTable = tEnv + .from("user_actions") + .window(Tumble.over(lit(10).minutes()).on($("user_action_time")).as("userActionWindow")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// 定义一个有事件时间属性的 table source +class UserActionSource extends StreamTableSource[Row] with DefinedRowtimeAttributes { + + override def getReturnType = { + val names = Array[String]("user_name" , "data", "user_action_time") + val types = Array[TypeInformation[_]](Types.STRING, Types.STRING, Types.LONG) + Types.ROW(names, types) + } + + override def getDataStream(execEnv: StreamExecutionEnvironment): DataStream[Row] = { + // 构造 DataStream + // ... + // 基于 "user_action_time" 定义 watermark + val stream = inputStream.assignTimestampsAndWatermarks(...) + stream + } + + override def getRowtimeAttributeDescriptors: util.List[RowtimeAttributeDescriptor] = { + // 标记 "user_action_time" 字段是事件时间字段 + // 给 "user_action_time" 构造一个时间属性描述符 + val rowtimeAttrDescr = new RowtimeAttributeDescriptor( + "user_action_time", + new ExistingField("user_action_time"), + new AscendingTimestamps) + val listRowtimeAttrDescr = Collections.singletonList(rowtimeAttrDescr) + listRowtimeAttrDescr + } +} + +// register the table source +tEnv.registerTableSource("user_actions", new UserActionSource) + +val windowedTable = tEnv + .from("user_actions") + .window(Tumble over 10.minutes on $"user_action_time" as "userActionWindow") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/concepts/timezone.md b/docs/content.zh/docs/dev/table/concepts/timezone.md new file mode 100644 index 0000000000000..870dffae13592 --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/timezone.md @@ -0,0 +1,573 @@ +--- +title: "时区" +weight: 4 +type: docs +--- + + +## 概述 + +Flink 为日期和时间提供了丰富的数据类型, 包括 `DATE`, `TIME`, `TIMESTAMP`, `TIMESTAMP_LTZ`, `INTERVAL YEAR TO MONTH`, `INTERVAL DAY TO SECOND` (更多详情请参考 [Date and Time]({{< ref "docs/dev/table/types" >}}#date-and-time))。 +Flink 支持在 session (会话)级别设置时区(更多详情请参考 [table.local-time-zone]({{< ref "docs/dev/table/config">}}#table-local-time-zone))。 +Flink 对多种时间类型和时区的支持使得跨时区的数据处理变得非常容易。 + +## TIMESTAMP vs TIMESTAMP_LTZ + +### TIMESTAMP 类型 + - `TIMESTAMP(p)` 是 `TIMESTAMP(p) WITHOUT TIME ZONE` 的简写, 精度 `p` 支持的范围是0-9, 默认是6。 + - `TIMESTAMP` 用于描述年, 月, 日, 小时, 分钟, 秒 和 小数秒对应的时间戳。 + - `TIMESTAMP` 可以通过一个字符串来指定,例如: + ```sql +Flink SQL> SELECT TIMESTAMP '1970-01-01 00:00:04.001'; ++-------------------------+ +| 1970-01-01 00:00:04.001 | ++-------------------------+ +``` + +### TIMESTAMP_LTZ 类型 + - `TIMESTAMP_LTZ(p)` 是 `TIMESTAMP(p) WITH LOCAL TIME ZONE` 的简写, 精度 `p` 支持的范围是0-9, 默认是6。 + - `TIMESTAMP_LTZ` 用于描述时间线上的绝对时间点, 使用 long 保存从 epoch 至今的毫秒数, 使用int保存毫秒中的纳秒数。 epoch 时间是从 java 的标准 epoch 时间 `1970-01-01T00:00:00Z` 开始计算。 在计算和可视化时, 每个 `TIMESTAMP_LTZ` 类型的数据都是使用的 session (会话)中配置的时区。 + - `TIMESTAMP_LTZ` 没有字符串表达形式因此无法通过字符串来指定, 可以通过一个 long 类型的 epoch 
时间来转化(例如: 通过 Java 来产生一个 long 类型的 epoch 时间 `System.currentTimeMillis()`) + + ```sql +Flink SQL> CREATE VIEW T1 AS SELECT TO_TIMESTAMP_LTZ(4001, 3); +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM T1; ++---------------------------+ +| TO_TIMESTAMP_LTZ(4001, 3) | ++---------------------------+ +| 1970-01-01 00:00:04.001 | ++---------------------------+ + +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT * FROM T1; ++---------------------------+ +| TO_TIMESTAMP_LTZ(4001, 3) | ++---------------------------+ +| 1970-01-01 08:00:04.001 | ++---------------------------+ +``` + +- `TIMESTAMP_LTZ` 可以用于跨时区的计算,因为它是一个基于 epoch 的绝对时间点(比如上例中的 `4001` 毫秒)代表的就是不同时区的同一个绝对时间点。 +补充一个背景知识:在同一个时间点, 全世界所有的机器上执行 `System.currentTimeMillis()` 都会返回同样的值。 (比如上例中的 `4001` milliseconds), 这就是绝对时间的定义。 + +## 时区的作用 +本地时区定义了当前 session(会话)所在的时区, 你可以在 Sql client 或者应用程序中配置。 + +{{< tabs "SQL snippets" >}} +{{< tab "SQL Client" >}} +```sql +-- 设置为 UTC 时区 +Flink SQL> SET table.local-time-zone=UTC; + +-- 设置为上海时区 +Flink SQL> SET table.local-time-zone=Asia/Shanghai; + +-- 设置为Los_Angeles时区 +Flink SQL> SET table.local-time-zone=America/Los_Angeles; +``` +{{< /tab >}} +{{< tab "Java" >}} +```java + EnvironmentSettings envSetting = EnvironmentSettings.newInstance().build(); + TableEnvironment tEnv = TableEnvironment.create(envSetting); + + // 设置为 UTC 时区 + tEnv.getConfig().setLocalTimeZone(ZoneId.of("UTC")); + +// 设置为上海时区 + tEnv.getConfig().setLocalTimeZone(ZoneId.of("Asia/Shanghai")); + +// 设置为 Los_Angeles 时区 + tEnv.getConfig().setLocalTimeZone(ZoneId.of("America/Los_Angeles")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val envSetting = EnvironmentSettings.newInstance.build +val tEnv = TableEnvironment.create(envSetting) + +// 设置为 UTC 时区 +tEnv.getConfig.setLocalTimeZone(ZoneId.of("UTC")) + +// 设置为上海时区 +tEnv.getConfig.setLocalTimeZone(ZoneId.of("Asia/Shanghai")) + +// 设置为 Los_Angeles 时区 +tEnv.getConfig.setLocalTimeZone(ZoneId.of("America/Los_Angeles")) +``` +{{< /tab >}} +{{< /tabs >}} + +session(会话)的时区设置在 Flink SQL 中非常有用, 它的主要用法如下: + +### 确定时间函数的返回值 +session (会话)中配置的时区会对以下函数生效。 +* LOCALTIME +* LOCALTIMESTAMP +* CURRENT_DATE +* CURRENT_TIME +* CURRENT_TIMESTAMP +* CURRENT_ROW_TIMESTAMP() +* NOW() +* PROCTIME() + + +```sql +Flink SQL> SET sql-client.execution.result-mode=tableau; +Flink SQL> CREATE VIEW MyView1 AS SELECT LOCALTIME, LOCALTIMESTAMP, CURRENT_DATE, CURRENT_TIME, CURRENT_TIMESTAMP, CURRENT_ROW_TIMESTAMP(), NOW(), PROCTIME(); +Flink SQL> DESC MyView1; +``` + +``` ++------------------------+-----------------------------+-------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++------------------------+-----------------------------+-------+-----+--------+-----------+ +| LOCALTIME | TIME(0) | false | | | | +| LOCALTIMESTAMP | TIMESTAMP(3) | false | | | | +| CURRENT_DATE | DATE | false | | | | +| CURRENT_TIME | TIME(0) | false | | | | +| CURRENT_TIMESTAMP | TIMESTAMP_LTZ(3) | false | | | | +|CURRENT_ROW_TIMESTAMP() | TIMESTAMP_LTZ(3) | false | | | | +| NOW() | TIMESTAMP_LTZ(3) | false | | | | +| PROCTIME() | TIMESTAMP_LTZ(3) *PROCTIME* | false | | | | ++------------------------+-----------------------------+-------+-----+--------+-----------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM MyView1; +``` + +``` ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +| LOCALTIME | LOCALTIMESTAMP | 
CURRENT_DATE | CURRENT_TIME | CURRENT_TIMESTAMP | CURRENT_ROW_TIMESTAMP() | NOW() | PROCTIME() | ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +| 15:18:36 | 2021-04-15 15:18:36.384 | 2021-04-15 | 15:18:36 | 2021-04-15 15:18:36.384 | 2021-04-15 15:18:36.384 | 2021-04-15 15:18:36.384 | 2021-04-15 15:18:36.384 | ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT * FROM MyView1; +``` + +``` ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +| LOCALTIME | LOCALTIMESTAMP | CURRENT_DATE | CURRENT_TIME | CURRENT_TIMESTAMP | CURRENT_ROW_TIMESTAMP() | NOW() | PROCTIME() | ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +| 23:18:36 | 2021-04-15 23:18:36.384 | 2021-04-15 | 23:18:36 | 2021-04-15 23:18:36.384 | 2021-04-15 23:18:36.384 | 2021-04-15 23:18:36.384 | 2021-04-15 23:18:36.384 | ++-----------+-------------------------+--------------+--------------+-------------------------+-------------------------+-------------------------+-------------------------+ +``` + +### `TIMESTAMP_LTZ` 字符串表示 +当一个 `TIMESTAMP_LTZ` 值转为 string 格式时, session 中配置的时区会生效。 例如打印这个值,将类型强制转化为 `STRING` 类型, 将类型强制转换为 `TIMESTAMP` ,将 `TIMESTAMP` 的值转化为 `TIMESTAMP_LTZ` 类型: +```sql +Flink SQL> CREATE VIEW MyView2 AS SELECT TO_TIMESTAMP_LTZ(4001, 3) AS ltz, TIMESTAMP '1970-01-01 00:00:01.001' AS ntz; +Flink SQL> DESC MyView2; +``` + +``` ++------+------------------+-------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++------+------------------+-------+-----+--------+-----------+ +| ltz | TIMESTAMP_LTZ(3) | true | | | | +| ntz | TIMESTAMP(3) | false | | | | ++------+------------------+-------+-----+--------+-----------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM MyView2; +``` + +``` ++-------------------------+-------------------------+ +| ltz | ntz | ++-------------------------+-------------------------+ +| 1970-01-01 00:00:04.001 | 1970-01-01 00:00:01.001 | ++-------------------------+-------------------------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT * FROM MyView2; +``` + +``` ++-------------------------+-------------------------+ +| ltz | ntz | ++-------------------------+-------------------------+ +| 1970-01-01 08:00:04.001 | 1970-01-01 00:00:01.001 | ++-------------------------+-------------------------+ +``` + +```sql +Flink SQL> CREATE VIEW MyView3 AS SELECT ltz, CAST(ltz AS TIMESTAMP(3)), CAST(ltz AS STRING), ntz, CAST(ntz AS TIMESTAMP_LTZ(3)) FROM MyView2; +``` + +``` +Flink SQL> DESC MyView3; ++-------------------------------+------------------+-------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++-------------------------------+------------------+-------+-----+--------+-----------+ +| ltz | TIMESTAMP_LTZ(3) | true | | | | +| CAST(ltz AS TIMESTAMP(3)) | TIMESTAMP(3) | true | | | | +| CAST(ltz AS STRING) | STRING | true | | | | +| ntz | TIMESTAMP(3) | false | | | | +| CAST(ntz AS 
TIMESTAMP_LTZ(3)) | TIMESTAMP_LTZ(3) | false | | | | ++-------------------------------+------------------+-------+-----+--------+-----------+ +``` + +```sql +Flink SQL> SELECT * FROM MyView3; +``` + +``` ++-------------------------+---------------------------+-------------------------+-------------------------+-------------------------------+ +| ltz | CAST(ltz AS TIMESTAMP(3)) | CAST(ltz AS STRING) | ntz | CAST(ntz AS TIMESTAMP_LTZ(3)) | ++-------------------------+---------------------------+-------------------------+-------------------------+-------------------------------+ +| 1970-01-01 08:00:04.001 | 1970-01-01 08:00:04.001 | 1970-01-01 08:00:04.001 | 1970-01-01 00:00:01.001 | 1970-01-01 00:00:01.001 | ++-------------------------+---------------------------+-------------------------+-------------------------+-------------------------------+ +``` + +## 时间属性和时区 +更多时间属性相关的详细介绍, 请参考 [Time Attribute]({{< ref "docs/dev/table/concepts/time_attributes">}}#时间属性) 。 + +### 处理时间和时区 +Flink SQL 使用函数 `PROCTIME()` 来定义处理时间属性, 该函数返回的类型是 `TIMESTAMP_LTZ` 。 + +{{< hint info >}} +在 Flink1.13 之前, `PROCTIME()` 函数返回的类型是 `TIMESTAMP` , 返回值是UTC时区下的 `TIMESTAMP` 。 +例如: 当上海的时间为 `2021-03-01 12:00:00` 时, `PROCTIME()` 显示的时间却是错误的 `2021-03-01 04:00:00` 。 +这个问题在 Flink 1.13 中修复了, 因此用户不用再去处理时区的问题了。 +{{< /hint >}} + +`PROCTIME()` 返回的是本地时区的时间, 使用 `TIMESTAMP_LTZ` 类型也可以支持夏令时时间。 + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT PROCTIME(); +``` +``` ++-------------------------+ +| PROCTIME() | ++-------------------------+ +| 2021-04-15 14:48:31.387 | ++-------------------------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT PROCTIME(); +``` +``` ++-------------------------+ +| PROCTIME() | ++-------------------------+ +| 2021-04-15 22:48:31.387 | ++-------------------------+ +``` + +```sql +Flink SQL> CREATE TABLE MyTable1 ( + item STRING, + price DOUBLE, + proctime as PROCTIME() + ) WITH ( + 'connector' = 'socket', + 'hostname' = '127.0.0.1', + 'port' = '9999', + 'format' = 'csv' + ); + +Flink SQL> CREATE VIEW MyView3 AS + SELECT + TUMBLE_START(proctime, INTERVAL '10' MINUTES) AS window_start, + TUMBLE_END(proctime, INTERVAL '10' MINUTES) AS window_end, + TUMBLE_PROCTIME(proctime, INTERVAL '10' MINUTES) as window_proctime, + item, + MAX(price) as max_price + FROM MyTable1 + GROUP BY TUMBLE(proctime, INTERVAL '10' MINUTES), item; + +Flink SQL> DESC MyView3; +``` + +``` ++-----------------+-----------------------------+-------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++-----------------+-----------------------------+-------+-----+--------+-----------+ +| window_start | TIMESTAMP(3) | false | | | | +| window_end | TIMESTAMP(3) | false | | | | +| window_proctime | TIMESTAMP_LTZ(3) *PROCTIME* | false | | | | +| item | STRING | true | | | | +| max_price | DOUBLE | true | | | | ++-----------------+-----------------------------+-------+-----+--------+-----------+ +``` + +在终端执行以下命令写入数据到 `MyTable1` : + +``` +> nc -lk 9999 +A,1.1 +B,1.2 +A,1.8 +B,2.5 +C,3.8 +``` + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM MyView3; +``` + +``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_procime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:10:00.005 | A | 1.8 | +| 2021-04-15 14:00:00.000 | 
2021-04-15 14:10:00.000 | 2021-04-15 14:10:00.007 | B | 2.5 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:10:00.007 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT * FROM MyView3; +``` + +相比在 UTC 时区下的计算结果, 在 Asia/Shanghai 时区下计算的窗口开始时间, 窗口结束时间和窗口处理时间是不同的。 +``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_procime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:10:00.005 | A | 1.8 | +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:10:00.007 | B | 2.5 | +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:10:00.007 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +{{< hint info >}} +处理时间窗口是不确定的, 每次运行都会返回不同的窗口和聚合结果。 以上的示例只用于说明时区如何影响处理时间窗口。 +{{< /hint >}} + +### 事件时间和时区 +Flink 支持在 `TIMESTAMP` 列和 `TIMESTAMP_LTZ` 列上定义时间属性。 + +#### TIMESTAMP 上的事件时间属性 +如果 source 中的时间用于表示年-月-日-小时-分钟-秒, 通常是一个不带时区的字符串, 例如: `2020-04-15 20:13:40.564`。 推荐在 `TIMESTAMP` 列上定义事件时间属性。 +```sql +Flink SQL> CREATE TABLE MyTable2 ( + item STRING, + price DOUBLE, + ts TIMESTAMP(3), -- TIMESTAMP data type + WATERMARK FOR ts AS ts - INTERVAL '10' SECOND + ) WITH ( + 'connector' = 'socket', + 'hostname' = '127.0.0.1', + 'port' = '9999', + 'format' = 'csv' + ); + +Flink SQL> CREATE VIEW MyView4 AS + SELECT + TUMBLE_START(ts, INTERVAL '10' MINUTES) AS window_start, + TUMBLE_END(ts, INTERVAL '10' MINUTES) AS window_end, + TUMBLE_ROWTIME(ts, INTERVAL '10' MINUTES) as window_rowtime, + item, + MAX(price) as max_price + FROM MyTable2 + GROUP BY TUMBLE(ts, INTERVAL '10' MINUTES), item; + +Flink SQL> DESC MyView4; +``` + +``` ++----------------+------------------------+------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++----------------+------------------------+------+-----+--------+-----------+ +| window_start | TIMESTAMP(3) | true | | | | +| window_end | TIMESTAMP(3) | true | | | | +| window_rowtime | TIMESTAMP(3) *ROWTIME* | true | | | | +| item | STRING | true | | | | +| max_price | DOUBLE | true | | | | ++----------------+------------------------+------+-----+--------+-----------+ +``` + +在终端执行以下命令用于写入数据到 `MyTable2` : + +``` +> nc -lk 9999 +A,1.1,2021-04-15 14:01:00 +B,1.2,2021-04-15 14:02:00 +A,1.8,2021-04-15 14:03:00 +B,2.5,2021-04-15 14:04:00 +C,3.8,2021-04-15 14:05:00 +C,3.8,2021-04-15 14:11:00 +``` + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM MyView4; +``` + +``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_rowtime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | A | 1.8 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | B | 2.5 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; 
+Flink SQL> SELECT * FROM MyView4; +``` + +相比在 UTC 时区下的计算结果, 在 Asia/Shanghai 时区下计算的窗口开始时间, 窗口结束时间和窗口的 rowtime 是相同的。 +``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_rowtime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | A | 1.8 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | B | 2.5 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +#### TIMESTAMP_LTZ 上的事件时间属性 +如果源数据中的时间为一个 epoch 时间, 通常是一个 long 值, 例如: `1618989564564` ,推荐将事件时间属性定义在 `TIMESTAMP_LTZ` 列上。 +```sql +Flink SQL> CREATE TABLE MyTable3 ( + item STRING, + price DOUBLE, + ts BIGINT, -- long time value in epoch milliseconds + ts_ltz AS TO_TIMESTAMP_LTZ(ts, 3), + WATERMARK FOR ts_ltz AS ts_ltz - INTERVAL '10' SECOND + ) WITH ( + 'connector' = 'socket', + 'hostname' = '127.0.0.1', + 'port' = '9999', + 'format' = 'csv' + ); + +Flink SQL> CREATE VIEW MyView5 AS + SELECT + TUMBLE_START(ts_ltz, INTERVAL '10' MINUTES) AS window_start, + TUMBLE_END(ts_ltz, INTERVAL '10' MINUTES) AS window_end, + TUMBLE_ROWTIME(ts_ltz, INTERVAL '10' MINUTES) as window_rowtime, + item, + MAX(price) as max_price + FROM MyTable3 + GROUP BY TUMBLE(ts_ltz, INTERVAL '10' MINUTES), item; + +Flink SQL> DESC MyView5; +``` + +``` ++----------------+----------------------------+-------+-----+--------+-----------+ +| name | type | null | key | extras | watermark | ++----------------+----------------------------+-------+-----+--------+-----------+ +| window_start | TIMESTAMP(3) | false | | | | +| window_end | TIMESTAMP(3) | false | | | | +| window_rowtime | TIMESTAMP_LTZ(3) *ROWTIME* | true | | | | +| item | STRING | true | | | | +| max_price | DOUBLE | true | | | | ++----------------+----------------------------+-------+-----+--------+-----------+ +``` + +`MyTable3` 的输入数据为: +``` +A,1.1,1618495260000 # The corresponding utc timestamp is 2021-04-15 14:01:00 +B,1.2,1618495320000 # The corresponding utc timestamp is 2021-04-15 14:02:00 +A,1.8,1618495380000 # The corresponding utc timestamp is 2021-04-15 14:03:00 +B,2.5,1618495440000 # The corresponding utc timestamp is 2021-04-15 14:04:00 +C,3.8,1618495500000 # The corresponding utc timestamp is 2021-04-15 14:05:00 +C,3.8,1618495860000 # The corresponding utc timestamp is 2021-04-15 14:11:00 +``` + +```sql +Flink SQL> SET table.local-time-zone=UTC; +Flink SQL> SELECT * FROM MyView5; +``` + +``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_rowtime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | A | 1.8 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | B | 2.5 | +| 2021-04-15 14:00:00.000 | 2021-04-15 14:10:00.000 | 2021-04-15 14:09:59.999 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +```sql +Flink SQL> SET table.local-time-zone=Asia/Shanghai; +Flink SQL> SELECT * FROM MyView5; +``` + +相比在 UTC 时区下的计算结果, 在 Asia/Shanghai 时区下计算的窗口开始时间, 窗口结束时间和窗口的 rowtime 是不同的。 
+``` ++-------------------------+-------------------------+-------------------------+------+-----------+ +| window_start | window_end | window_rowtime | item | max_price | ++-------------------------+-------------------------+-------------------------+------+-----------+ +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:09:59.999 | A | 1.8 | +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:09:59.999 | B | 2.5 | +| 2021-04-15 22:00:00.000 | 2021-04-15 22:10:00.000 | 2021-04-15 22:09:59.999 | C | 3.8 | ++-------------------------+-------------------------+-------------------------+------+-----------+ +``` + +## 夏令时支持 +Flink SQL支持在 `TIMESTAMP_LTZ`列上定义时间属性, 基于这一特征,Flink SQL 在窗口中使用 `TIMESTAMP` 和 `TIMESTAMP_LTZ` 类型优雅地支持了夏令时。 + +Flink 使用时间戳的字符格式来分割窗口并通过每条记录对应的 epoch 时间来分配窗口。 这意味着 Flink 窗口开始时间和窗口结束时间使用的是 `TIMESTAMP` 类型(例如: `TUMBLE_START` 和 `TUMBLE_END`), 窗口的时间属性使用的是 `TIMESTAMP_LTZ` 类型(例如: `TUMBLE_PROCTIME`, `TUMBLE_ROWTIME`)。 +给定一个 tumble window示例, 在 Los_Angele 时区下夏令时从 `2021-03-14 02:00:00` 开始: +``` +long epoch1 = 1615708800000L; // 2021-03-14 00:00:00 +long epoch2 = 1615712400000L; // 2021-03-14 01:00:00 +long epoch3 = 1615716000000L; // 2021-03-14 03:00:00, 手表往前拨一小时,跳过 (2021-03-14 02:00:00) +long epoch4 = 1615719600000L; // 2021-03-14 04:00:00 +``` +在 Los_angele 时区下, tumble window [2021-03-14 00:00:00, 2021-03-14 00:04:00] 将会收集3个小时的数据, 在其他非夏令时的时区下将会收集4个小时的数据,用户只需要在 `TIMESTAMP_LTZ` 列上声明时间属性即可。 + +Flink 的所有窗口(如 Hop window, Session window, Cumulative window)都会遵循这种方式, Flink SQL 中的所有操作都很好地支持了 `TIMESTAMP_LTZ` 类型,因此Flink可以非常优雅的支持夏令时。   + + +## Batch 模式和 Streaming 模式的区别 +以下函数: +* LOCALTIME +* LOCALTIMESTAMP +* CURRENT_DATE +* CURRENT_TIME +* CURRENT_TIMESTAMP +* NOW() + +Flink 会根据执行模式来进行不同计算,在 Streaming 模式下这些函数是每条记录都会计算一次,但在 Batch 模式下,只会在 query 开始时计算一次,所有记录都使用相同的结果。 + +以下时间函数无论是在 Streaming 模式还是 Batch 模式下,都会为每条记录计算一次结果: + +* CURRENT_ROW_TIMESTAMP() +* PROCTIME() + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/concepts/versioned_tables.md b/docs/content.zh/docs/dev/table/concepts/versioned_tables.md new file mode 100644 index 0000000000000..b7cbe0b2ecd64 --- /dev/null +++ b/docs/content.zh/docs/dev/table/concepts/versioned_tables.md @@ -0,0 +1,366 @@ +--- +title: "时态表(Temporal Tables)" +weight: 4 +type: docs +aliases: + - /zh/dev/table/streaming/versioned_tables.html + - /zh/dev/table/streaming/temporal_tables.html +--- + + +# 时态表(Temporal Tables) + +时态表(Temporal Table)是一张随时间变化的表 -- 在 Flink 中称为[动态表]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}),时态表中的每条记录都关联了一个或多个时间段,所有的 Flink 表都是时态的(动态的)。 + +时态表包含表的一个或多个有版本的表快照,时态表可以是一张跟踪所有变更记录的表(例如数据库表的 changelog,包含多个表快照),也可以是物化所有变更之后的表(例如数据库表,只有最新表快照)。 + +**版本**: 时态表可以划分成一系列带版本的表快照集合,表快照中的版本代表了快照中所有记录的有效区间,有效区间的开始时间和结束时间可以通过用户指定,根据时态表是否可以追踪自身的历史版本与否,时态表可以分为 `版本表` 和 `普通表`。 + +**版本表**: 如果时态表中的记录可以追踪和并访问它的历史版本,这种表我们称之为版本表,来自数据库的 changelog 可以定义成版本表。 + +**普通表**: 如果时态表中的记录仅仅可以追踪并和它的最新版本,这种表我们称之为普通表,来自数据库 或 HBase 的表可以定义成普通表。 + + + +设计初衷 +---------- + +### 关联一张版本表 + +以订单流关联产品表这个场景举例,`orders` 表包含了来自 Kafka 的实时订单流,`product_changelog` 表来自数据库表 `products` 的 changelog , 产品的价格在数据库表 `products` 中是随时间实时变化的。 + +```sql +SELECT * FROM product_changelog; + +(changelog kind) update_time product_id product_name price +================= =========== ========== ============ ===== ++(INSERT) 00:01:00 p_001 scooter 11.11 ++(INSERT) 00:02:00 p_002 basketball 23.11 +-(UPDATE_BEFORE) 12:00:00 p_001 scooter 11.11 ++(UPDATE_AFTER) 12:00:00 p_001 scooter 12.99 +-(UPDATE_BEFORE) 12:00:00 p_002 basketball 23.11 ++(UPDATE_AFTER) 12:00:00 p_002 
basketball 19.99 +-(DELETE) 18:00:00 p_001 scooter 12.99 +``` + +表 `product_changelog` 表示数据库表 `products`不断增长的 changelog, 比如,产品 `scooter` 在时间点 `00:01:00`的初始价格是 `11.11`, 在 `12:00:00` 的时候涨价到了 `12.99`, +在 `18:00:00` 的时候这条产品价格记录被删除。 + +如果我们想输出 `product_changelog` 表在 `10:00:00` 对应的版本,表的内容如下所示: +```sql +update_time product_id product_name price +=========== ========== ============ ===== +00:01:00 p_001 scooter 11.11 +00:02:00 p_002 basketball 23.11 +``` + +如果我们想输出 `product_changelog` 表在 `13:00:00` 对应的版本,表的内容如下所示: +```sql +update_time product_id product_name price +=========== ========== ============ ===== +12:00:00 p_001 scooter 12.99 +12:00:00 p_002 basketball 19.99 +``` + +上述例子中,`products` 表的版本是通过 `update_time` 和 `product_id` 进行追踪的,`product_id` 对应 `product_changelog` 表的主键,`update_time` 对应事件时间。 + +在 Flink 中, 这由[*版本表*](#声明版本表)表示。 + +### 关联一张普通表 + +另一方面,某些用户案列需要连接变化的维表,该表是外部数据库表。 + +假设 `LatestRates` 是一个物化的最新汇率表 (比如:一张 HBase 表),`LatestRates` 总是表示 HBase 表 `Rates` 的最新内容。 + +我们在 `10:15:00` 时查询到的内容如下所示: +```sql +10:15:00 > SELECT * FROM LatestRates; + +currency rate +========= ==== +US Dollar 102 +Euro 114 +Yen 1 +``` + +我们在 `11:00:00` 时查询到的内容如下所示: +```sql +11:00:00 > SELECT * FROM LatestRates; + +currency rate +========= ==== +US Dollar 102 +Euro 116 +Yen 1 +``` + +在 Flink 中, 这由[*普通表*](#声明普通表)表示。 + +时态表 +----- +注意 仅 Blink planner 支持此功能。 + +Flink 使用主键约束和事件时间来定义一张版本表和版本视图。 + +### 声明版本表 +在 Flink 中,定义了主键约束和事件时间属性的表就是版本表。 +```sql +-- 定义一张版本表 +CREATE TABLE product_changelog ( + product_id STRING, + product_name STRING, + product_price DECIMAL(10, 4), + update_time TIMESTAMP(3) METADATA FROM 'value.source.timestamp' VIRTUAL, + PRIMARY KEY(product_id) NOT ENFORCED, -- (1) 定义主键约束 + WATERMARK FOR update_time AS update_time -- (2) 通过 watermark 定义事件时间 +) WITH ( + 'connector' = 'kafka', + 'topic' = 'products', + 'scan.startup.mode' = 'earliest-offset', + 'properties.bootstrap.servers' = 'localhost:9092', + 'value.format' = 'debezium-json' +); +``` + +行 `(1)` 为表 `product_changelog` 定义了主键, 行 `(2)` 把 `update_time` 定义为表 `product_changelog` 的事件时间,因此 `product_changelog` 是一张版本表。 + +**注意**: `METADATA FROM 'value.source.timestamp' VIRTUAL` 语法的意思是从每条 changelog 中抽取 changelog 对应的数据库表中操作的执行时间,强烈推荐使用数据库表中操作的 +执行时间作为事件时间 ,否则通过时间抽取的版本可能和数据库中的版本不匹配。 + +### 声明版本视图 + +Flink 也支持定义版本视图只要一个视图包含主键和事件时间便是一个版本视图。 + +假设我们有表 `RatesHistory` 如下所示: +```sql +-- 定义一张 append-only 表 +CREATE TABLE RatesHistory ( + currency_time TIMESTAMP(3), + currency STRING, + rate DECIMAL(38, 10), + WATERMARK FOR currency_time AS currency_time -- 定义事件时间 +) WITH ( + 'connector' = 'kafka', + 'topic' = 'rates', + 'scan.startup.mode' = 'earliest-offset', + 'properties.bootstrap.servers' = 'localhost:9092', + 'format' = 'json' -- 普通的 append-only 流 +) +``` + +表 `RatesHistory` 代表一个兑换日元货币汇率表(日元汇率为1),该表是不断增长的 append-only 表。 +例如,`欧元` 兑换 `日元` 从 `09:00:00` 到 `10:45:00` 的汇率为 `114`。从 `10:45:00` 到 `11:15:00` 的汇率为 `116`。 + +```sql +SELECT * FROM RatesHistory; + +currency_time currency rate +============= ========= ==== +09:00:00 US Dollar 102 +09:00:00 Euro 114 +09:00:00 Yen 1 +10:45:00 Euro 116 +11:15:00 Euro 119 +11:49:00 Pounds 108 +``` + +为了在 `RatesHistory` 上定义版本表,Flink 支持通过[去重查询]({{< ref "docs/dev/table/sql/queries" >}}#去重)定义版本视图, +去重查询可以产出一个有序的 changelog 流,去重查询能够推断主键并保留原始数据流的事件时间属性。 + +```sql +CREATE VIEW versioned_rates AS +SELECT currency, rate, currency_time -- (1) `currency_time` 保留了事件时间 + FROM ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY currency -- (2) `currency` 是去重 query 的 unique key,可以作为主键 + ORDER BY currency_time DESC) AS rowNum + FROM RatesHistory ) +WHERE 
rowNum = 1; + +-- 视图 `versioned_rates` 将会产出如下的 changelog: + +(changelog kind) currency_time currency rate +================ ============= ========= ==== ++(INSERT) 09:00:00 US Dollar 102 ++(INSERT) 09:00:00 Euro 114 ++(INSERT) 09:00:00 Yen 1 ++(UPDATE_AFTER) 10:45:00 Euro 116 ++(UPDATE_AFTER) 11:15:00 Euro 119 ++(INSERT) 11:49:00 Pounds 108 +{% endhighlight sql %} + +行 `(1)` 保留了事件时间作为视图 `versioned_rates` 的事件时间,行 `(2)` 使得视图 `versioned_rates` 有了主键, 因此视图 `versioned_rates` 是一个版本视图。 + +视图中的去重 query 会被 Flink 优化并高效地产出 changelog stream, 产出的 changelog 保留了主键约束和事件时间。 + +如果我们想输出 `versioned_rates` 表在 `11:00:00` 对应的版本,表的内容如下所示: +```sql +currency_time currency rate +============= ========== ==== +09:00:00 US Dollar 102 +09:00:00 Yen 1 +10:45:00 Euro 116 +``` + +如果我们想输出 `versioned_rates` 表在 `12:00:00` 对应的版本,表的内容如下所示: +```sql +currency_time currency rate +============= ========== ==== +09:00:00 US Dollar 102 +09:00:00 Yen 1 +10:45:00 Euro 119 +11:49:00 Pounds 108 +``` + +### 声明普通表 + +普通表的声明和 Flink 建表 DDL 一致,参考 [create table]({{< ref "docs/dev/table/sql/create" >}}#create-table) 页面获取更多如何建表的信息。 + +```sql +-- 用 DDL 定义一张 HBase 表,然后我们可以在 SQL 中将其当作一张时态表使用 +-- 'currency' 列是 HBase 表中的 rowKey + CREATE TABLE LatestRates ( + currency STRING, + fam1 ROW + ) WITH ( + 'connector' = 'hbase-1.4', + 'table-name' = 'rates', + 'zookeeper.quorum' = 'localhost:2181' + ); +``` + +注意 理论上讲任意都能用作时态表并在基于处理时间的时态表 Join 中使用,但当前支持作为时态表的普通表必须实现接口 `LookupableTableSource`。接口 `LookupableTableSource` 的实例只能作为时态表用于基于处理时间的时态 Join 。 + +通过 `LookupableTableSource` 定义的表意味着该表具备了在运行时通过一个或多个 key 去查询外部存储系统的能力,当前支持在 基于处理时间的时态表 join 中使用的表包括 +[JDBC]({{< ref "docs/connectors/table/jdbc" >}}), [HBase]({{< ref "docs/connectors/table/hbase" >}}) 和 [Hive]({{< ref "docs/connectors/table/hive/hive_read_write" >}}#temporal-table-join)。 + +另请参阅 [LookupableTableSource]({{< ref "docs/dev/table/sourcesSinks" >}}#lookup-table-source)页面了解更多信息。 + +在基于处理时间的时态表 Join 中支持任意表作为时态表会在不远的将来支持。 + +时态表函数 +------------------------ +时态表函数是一种过时的方式去定义时态表并关联时态表的数据,现在我们可以用时态表 DDL 去定义时态表,用[时态表 Join]({{< ref "docs/dev/table/sql/queries/joins" >}}#时态表-join) 语法去关联时态表。 + +时态表函数和时态表 DDL 最大的区别在于,时态表 DDL 可以在纯 SQL 环境中使用但是时态表函数不支持,用时态表 DDL 声明的时态表支持 changelog 流和 append-only 流但时态表函数仅支持 append-only 流。 + +为了访问时态表中的数据,必须传递一个[时间属性]({{< ref "docs/dev/table/concepts/time_attributes" >}}),该属性确定将要返回的表的版本。 +Flink 使用[表函数]({{< ref "docs/dev/table/functions/udfs" >}}#表值函数)的 SQL 语法提供一种表达它的方法。 + +定义后,*时态表函数*将使用单个时间参数 timeAttribute 并返回一个行集合。 +该集合包含相对于给定时间属性的所有现有主键的行的最新版本。 + +假设我们基于 `RatesHistory` 表定义了一个时态表函数,我们可以通过以下方式查询该函数 `Rates(timeAttribute)`: + +```sql +SELECT * FROM Rates('10:15:00'); + +rowtime currency rate +======= ========= ==== +09:00:00 US Dollar 102 +09:00:00 Euro 114 +09:00:00 Yen 1 + +SELECT * FROM Rates('11:00:00'); + +rowtime currency rate +======== ========= ==== +09:00:00 US Dollar 102 +10:45:00 Euro 116 +09:00:00 Yen 1 +``` + +对 `Rates(timeAttribute)` 的每个查询都将返回给定 `timeAttribute` 的 `Rates` 状态。 + +**注意**:当前 Flink 不支持使用常量时间属性参数直接查询时态表函数。目前,时态表函数只能在 join 中使用。上面的示例用于为函数 `Rates(timeAttribute)` 返回内容提供直观信息。 + +另请参阅有关[用于持续查询的 join ]({{< ref "docs/dev/table/sql/queries/joins" >}})页面,以获取有关如何与时态表 join 的更多信息。 + +### 定义时态表函数 + +以下代码段说明了如何从 append-only 表中创建时态表函数。 + +{{< tabs "53d51b01-eee7-49b7-965d-98ab237fb3a1" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.functions.TemporalTableFunction; +(...) 
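+// (示意)上面 (...) 处省略的 import 大致包括下文用到的类:
+// StreamExecutionEnvironment、StreamTableEnvironment、DataStream、Table、Tuple2、List/ArrayList,
+// 以及静态导入的 org.apache.flink.table.api.Expressions.$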
+ +// 获取 stream 和 table 环境 +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + +// 提供一个汇率历史记录表静态数据集 +List> ratesHistoryData = new ArrayList<>(); +ratesHistoryData.add(Tuple2.of("US Dollar", 102L)); +ratesHistoryData.add(Tuple2.of("Euro", 114L)); +ratesHistoryData.add(Tuple2.of("Yen", 1L)); +ratesHistoryData.add(Tuple2.of("Euro", 116L)); +ratesHistoryData.add(Tuple2.of("Euro", 119L)); + +// 用上面的数据集创建并注册一个示例表 +// 在实际设置中,应使用自己的表替换它 +DataStream> ratesHistoryStream = env.fromCollection(ratesHistoryData); +Table ratesHistory = tEnv.fromDataStream(ratesHistoryStream, $("r_currency"), $("r_rate"), $("r_proctime").proctime()); + +tEnv.createTemporaryView("RatesHistory", ratesHistory); + +// 创建和注册时态表函数 +// 指定 "r_proctime" 为时间属性,指定 "r_currency" 为主键 +TemporalTableFunction rates = ratesHistory.createTemporalTableFunction("r_proctime", "r_currency"); // <==== (1) +tEnv.registerFunction("Rates", rates); // <==== (2) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// 获取 stream 和 table 环境 +val env = StreamExecutionEnvironment.getExecutionEnvironment +val tEnv = StreamTableEnvironment.create(env) + +// 提供一个汇率历史记录表静态数据集 +val ratesHistoryData = new mutable.MutableList[(String, Long)] +ratesHistoryData.+=(("US Dollar", 102L)) +ratesHistoryData.+=(("Euro", 114L)) +ratesHistoryData.+=(("Yen", 1L)) +ratesHistoryData.+=(("Euro", 116L)) +ratesHistoryData.+=(("Euro", 119L)) + +// 用上面的数据集创建并注册一个示例表 +// 在实际设置中,应使用自己的表替换它 +val ratesHistory = env + .fromCollection(ratesHistoryData) + .toTable(tEnv, 'r_currency, 'r_rate, 'r_proctime.proctime) + +tEnv.createTemporaryView("RatesHistory", ratesHistory) + +// 创建和注册时态表函数 +// 指定 "r_proctime" 为时间属性,指定 "r_currency" 为主键 +val rates = ratesHistory.createTemporalTableFunction($"r_proctime", $"r_currency") // <==== (1) +tEnv.registerFunction("Rates", rates) // <==== (2) +``` +{{< /tab >}} +{{< /tabs >}} + +行`(1)`创建了一个 `rates` [时态表函数](#时态表函数), +这使我们可以在[ Table API ]({{< ref "docs/dev/table/tableApi" >}}#joins)中使用 `rates` 函数。 + +行`(2)`在表环境中注册名称为 `Rates` 的函数,这使我们可以在[ SQL ]({{< ref "docs/dev/table/sql/queries" >}}#joins)中使用 `Rates` 函数。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/config.md b/docs/content.zh/docs/dev/table/config.md new file mode 100644 index 0000000000000..2344a52e8a179 --- /dev/null +++ b/docs/content.zh/docs/dev/table/config.md @@ -0,0 +1,118 @@ +--- +title: "配置" +weight: 111 +type: docs +aliases: + - /zh/dev/table/config.html +--- + + +# 配置 + +Table 和 SQL API 的默认配置能够确保结果准确,同时也提供可接受的性能。 + +根据 Table 程序的需求,可能需要调整特定的参数用于优化。例如,无界流程序可能需要保证所需的状态是有限的(请参阅 [流式概念](./streaming/query_configuration.html)). + + + +### 概览 + +在每个 TableEnvironment 中,`TableConfig` 提供用于当前会话的配置项。 + +对于常见或者重要的配置项,`TableConfig` 提供带有详细注释的 `getters` 和 `setters` 方法。 + +对于更加高级的配置,用户可以直接访问底层的 key-value 配置项。以下章节列举了所有可用于调整 Flink Table 和 SQL API 程序的配置项。 + +注意 因为配置项会在执行操作的不同时间点被读取,所以推荐在实例化 TableEnvironment 后尽早地设置配置项。 + +{{< tabs "ec2c3d9c-2ecd-4017-9c77-fb32cd6966cf" >}} +{{< tab "Java" >}} +```java +// instantiate table environment +TableEnvironment tEnv = ... 
+ +// access flink configuration +Configuration configuration = tEnv.getConfig().getConfiguration(); +// set low-level key-value options +configuration.setString("table.exec.mini-batch.enabled", "true"); +configuration.setString("table.exec.mini-batch.allow-latency", "5 s"); +configuration.setString("table.exec.mini-batch.size", "5000"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// instantiate table environment +val tEnv: TableEnvironment = ... + +// access flink configuration +val configuration = tEnv.getConfig().getConfiguration() +// set low-level key-value options +configuration.setString("table.exec.mini-batch.enabled", "true") +configuration.setString("table.exec.mini-batch.allow-latency", "5 s") +configuration.setString("table.exec.mini-batch.size", "5000") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +# instantiate table environment +t_env = ... + +# access flink configuration +configuration = t_env.get_config().get_configuration(); +# set low-level key-value options +configuration.set_string("table.exec.mini-batch.enabled", "true"); +configuration.set_string("table.exec.mini-batch.allow-latency", "5 s"); +configuration.set_string("table.exec.mini-batch.size", "5000"); +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +``` +Flink SQL> SET table.exec.mini-batch.enabled = true; +Flink SQL> SET table.exec.mini-batch.allow-latency = 5s; +Flink SQL> SET table.exec.mini-batch.size = 5000; +``` +{{< /tab >}} +{{< /tabs >}} + +注意 目前,key-value 配置项仅被 Blink planner 支持。 + +### 执行配置 + +以下选项可用于优化查询执行的性能。 + +{{< generated/execution_config_configuration >}} + +### 优化器配置 + +以下配置可以用于调整查询优化器的行为以获得更好的执行计划。 + +{{< generated/optimizer_config_configuration >}} + +### Planner 配置 + +以下配置可以用于调整 planner 的行为。 + +{{< generated/table_config_configuration >}} + +### SQL Client 配置 + +以下配置可以用于调整 sql client 的行为。 + +{{< generated/sql_client_configuration >}} diff --git a/docs/content.zh/docs/dev/table/data_stream_api.md b/docs/content.zh/docs/dev/table/data_stream_api.md new file mode 100644 index 0000000000000..ff154eb286851 --- /dev/null +++ b/docs/content.zh/docs/dev/table/data_stream_api.md @@ -0,0 +1,2195 @@ +--- +title: "DataStream API Integration" +weight: 3 +type: docs +--- + + +# DataStream API Integration + +{{< hint info >}} +This page only discusses the integration with DataStream API in JVM languages such as Java or Scala. +For Python, see the [Python API]({{< ref "docs/dev/python/overview" >}}) area. +{{< /hint >}} + +Both Table API and DataStream API are equally important when it comes to defining a data +processing pipeline. + +The DataStream API offers the primitives of stream processing (namely time, state, and dataflow +management) in a relatively low-level imperative programming API. The Table API abstracts away many +internals and provides a structured and declarative API. + +Both APIs can work with bounded *and* unbounded streams. + +Bounded streams need to be managed when processing historical data. Unbounded streams occur +in real-time processing scenarios that might be initialized with historical data first. + +For efficient execution, both APIs offer processing bounded streams in an optimized batch execution +mode. However, since batch is just a special case of streaming, it is also possible to run pipelines +of bounded streams in regular streaming execution mode. + +{{< hint warning >}} +Both DataStream API and Table API provide their own way of enabling the batch execution mode at the +moment. In the near future, this will be further unified. 
+{{< /hint >}} + +Pipelines in one API can be defined end-to-end without dependencies on the other API. However, it +might be useful to mix both APIs for various reasons: + +- Use the table ecosystem for accessing catalogs or connecting to external systems easily, before +implementing the main pipeline in DataStream API. +- Access some of the SQL functions for stateless data normalization and cleansing, before +implementing the main pipeline in DataStream API. +- Switch to DataStream API every now and then if a more low-level operation (e.g. custom timer +handling) is not present in Table API. + +Flink provides special bridging functionalities to make the integration with DataStream API as smooth +as possible. + +{{< hint info >}} +Switching between DataStream and Table API adds some conversion overhead. For example, internal data +structures of the table runtime (i.e. `RowData`) that partially work on binary data need to be converted +to more user-friendly data structures (i.e. `Row`). Usually, this overhead can be neglected but is +mentioned here for completeness. +{{< /hint >}} + +{{< top >}} + +Converting between DataStream and Table +--------------------------------------- + +Flink provides a specialized `StreamTableEnvironment` in Java and Scala for integrating with the +DataStream API. Those environments extend the regular `TableEnvironment` with additional methods +and take the `StreamExecutionEnvironment` used in the DataStream API as a parameter. + +{{< hint warning >}} +Currently, the `StreamTableEnvironment` does not support enabling the batch execution mode yet. Nevertheless, +bounded streams can be processed there using the streaming execution mode but with lower efficiency. + +Note, however, that the general `TableEnvironment` can work in both streaming execution or optimized batch +execution mode. +{{< /hint >}} + +The following code shows an example of how to go back and forth between the two APIs. Column names +and types of the `Table` are automatically derived from the `TypeInformation` of the `DataStream`. +Since the DataStream API does not support changelog processing natively, the code assumes +append-only/insert-only semantics during the stream-to-table and table-to-stream conversion. 
+ +{{< tabs "6ec84aa4-d91d-4c47-9fa2-b1aae1e3cdb5" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; + +// create environments of both APIs +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env); + +// create a DataStream +DataStream dataStream = env.fromElements("Alice", "Bob", "John"); + +// interpret the insert-only DataStream as a Table +Table inputTable = tableEnv.fromDataStream(dataStream); + +// register the Table object as a view and query it +tableEnv.createTemporaryView("InputTable", inputTable); +Table resultTable = tableEnv.sqlQuery("SELECT UPPER(f0) FROM InputTable"); + +// interpret the insert-only Table as a DataStream again +DataStream resultStream = tableEnv.toDataStream(resultTable); + +// add a printing sink and execute in DataStream API +resultStream.print(); +env.execute(); + +// prints: +// +I[Alice] +// +I[Bob] +// +I[John] +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ +import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment +import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment + +// create environments of both APIs +val env = StreamExecutionEnvironment.getExecutionEnvironment +val tableEnv = StreamTableEnvironment.create(env) + +// create a DataStream +val dataStream = env.fromElements("Alice", "Bob", "John") + +// interpret the insert-only DataStream as a Table +val inputTable = tableEnv.fromDataStream(dataStream) + +// register the Table object as a view and query it +tableEnv.createTemporaryView("InputTable", inputTable) +val resultTable = tableEnv.sqlQuery("SELECT UPPER(f0) FROM InputTable") + +// interpret the insert-only Table as a DataStream again +val resultStream = tableEnv.toDataStream(resultTable) + +// add a printing sink and execute in DataStream API +resultStream.print() +env.execute() + + +// prints: +// +I[Alice] +// +I[Bob] +// +I[John] +``` +{{< /tab >}} +{{< /tabs >}} + +The complete semantics of `fromDataStream` and `toDataStream` can be found in the [dedicated section below](#handling-of-insert-only-streams). +In particular, the section discusses how to influence the schema derivation with more complex +and nested types. It also covers working with event-time and watermarks. + +Depending on the kind of query, in many cases the resulting dynamic table is a pipeline that does not +only produce insert-only changes when coverting the `Table` to a `DataStream` but also produces retractions +and other kinds of updates. During table-to-stream conversion, this could lead to an exception similar to + +``` +Table sink 'Unregistered_DataStream_Sink_1' doesn't support consuming update changes [...]. +``` + +in which case one needs to revise the query again or switch to `toChangelogStream`. + +The following example shows how updating tables can be converted. Every result row represents +an entry in a changelog with a change flag that can be queried by calling `row.getKind()` on it. In +the example, the second score for `Alice` creates an _update before_ (`-U`) and _update after_ (`+U`) +change. 
+ +{{< tabs "f45d1374-61a0-40c0-9280-702ed87d2ed0" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import org.apache.flink.types.Row; + +// create environments of both APIs +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env); + +// create a DataStream +DataStream dataStream = env.fromElements( + Row.of("Alice", 12), + Row.of("Bob", 10), + Row.of("Alice", 100)); + +// interpret the insert-only DataStream as a Table +Table inputTable = tableEnv.fromDataStream(dataStream).as("name", "score"); + +// register the Table object as a view and query it +// the query contains an aggregation that produces updates +tableEnv.createTemporaryView("InputTable", inputTable); +Table resultTable = tableEnv.sqlQuery( + "SELECT name, SUM(score) FROM InputTable GROUP BY name"); + +// interpret the updating Table as a changelog DataStream +DataStream resultStream = tableEnv.toChangelogStream(resultTable); + +// add a printing sink and execute in DataStream API +resultStream.print(); +env.execute(); + +// prints: +// +I[Alice, 12] +// +I[Bob, 10] +// -U[Alice, 12] +// +U[Alice, 112] +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala.typeutils.Types +import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment +import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment +import org.apache.flink.types.Row + +// create environments of both APIs +val env = StreamExecutionEnvironment.getExecutionEnvironment +val tableEnv = StreamTableEnvironment.create(env) + +// create a DataStream +val dataStream = env.fromElements( + Row.of("Alice", Int.box(12)), + Row.of("Bob", Int.box(10)), + Row.of("Alice", Int.box(100)) +)(Types.ROW(Types.STRING, Types.INT)) + +// interpret the insert-only DataStream as a Table +val inputTable = tableEnv.fromDataStream(dataStream).as("name", "score") + +// register the Table object as a view and query it +// the query contains an aggregation that produces updates +tableEnv.createTemporaryView("InputTable", inputTable) +val resultTable = tableEnv.sqlQuery("SELECT name, SUM(score) FROM InputTable GROUP BY name") + +// interpret the updating Table as a changelog DataStream +val resultStream = tableEnv.toChangelogStream(resultTable) + +// add a printing sink and execute in DataStream API +resultStream.print() +env.execute() + +// prints: +// +I[Alice, 12] +// +I[Bob, 10] +// -U[Alice, 12] +// +U[Alice, 112] +``` +{{< /tab >}} +{{< /tabs >}} + +The complete semantics of `fromChangelogStream` and `toChangelogStream` can be found in the [dedicated section below](#handling-of-insert-only-streams). +In particular, the section discusses how to influence the schema derivation with more complex and nested +types. It covers working with event-time and watermarks. It discusses how to declare a primary key and +changelog mode for the input and output streams. + +### Dependencies and Imports + +Projects that combine Table API with DataStream API need to add one of the following bridging modules. +They include transitive dependencies to `flink-table-api-java` or `flink-table-api-scala` and the +corresponding language-specific DataStream API module. 
+ +{{< tabs "0d2da52a-ee43-4d06-afde-b165517c0617" >}} +{{< tab "Java" >}} +```xml + + org.apache.flink + flink-table-api-java-bridge{{< scala_version >}} + {{< version >}} + provided + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```xml + + org.apache.flink + flink-table-api-scala-bridge{{< scala_version >}} + {{< version >}} + provided + +``` +{{< /tab >}} +{{< /tabs >}} + +The following imports are required to declare common pipelines using either the Java or Scala version +of both DataStream API and Table API. + +{{< tabs "19a47e2d-168b-4f73-a966-abfcc8a6baca" >}} +{{< tab "Java" >}} +```java +// imports for Java DataStream API +import org.apache.flink.streaming.api.*; +import org.apache.flink.streaming.api.environment.*; + +// imports for Table API with bridging to Java DataStream API +import org.apache.flink.table.api.*; +import org.apache.flink.table.api.bridge.java.*; +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// imports for Scala DataStream API +import org.apache.flink.api.scala._ +import org.apache.flink.streaming.api.scala._ + +// imports for Table API with bridging to Scala DataStream API +import org.apache.flink.table.api._ +import org.apache.flink.table.api.bridge.scala._ +``` +{{< /tab >}} +{{< /tabs >}} + +### Configuration + +The `TableEnvironment` will adopt all configuration options from the passed `StreamExecutionEnvironment`. +However, it cannot be guaranteed that further changes to the configuration of `StreamExecutionEnvironment` +are propagated to the `StreamTableEnvironment` after its instantiation. Also, the reverse propagation +of options from Table API to DataStream API is not supported. + +We recommend setting all configuration options in DataStream API early before switching to Table API. + +{{< tabs "47a32814-abea-11eb-8529-0242ac130003" >}} +{{< tab "Java" >}} +```java +import java.time.ZoneId; +import org.apache.flink.streaming.api.CheckpointingMode; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; + +// create Java DataStream API + +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + +// set various configuration early + +env.setMaxParallelism(256); + +env.getConfig().addDefaultKryoSerializer(MyCustomType.class, CustomKryoSerializer.class); + +env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE); + +// then switch to Java Table API + +StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env); + +// set configuration early + +tableEnv.getConfig().setLocalTimeZone(ZoneId.of("Europe/Berlin")); + +// start defining your pipelines in both APIs... 
+``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.time.ZoneId +import org.apache.flink.api.scala._ +import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment +import org.apache.flink.streaming.api.CheckpointingMode +import org.apache.flink.table.api.bridge.scala._ + +// create Scala DataStream API + +val env = StreamExecutionEnvironment.getExecutionEnvironment + +// set various configuration early + +env.setMaxParallelism(256) + +env.getConfig.addDefaultKryoSerializer(classOf[MyCustomType], classOf[CustomKryoSerializer]) + +env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) + +// then switch to Scala Table API + +val tableEnv = StreamTableEnvironment.create(env) + +// set configuration early + +tableEnv.getConfig.setLocalTimeZone(ZoneId.of("Europe/Berlin")) + +// start defining your pipelines in both APIs... +``` +{{< /tab >}} +{{< /tabs >}} + +### Execution Behavior + +Both APIs provide methods to execute pipelines. In other words: if requested, they compile a job +graph that will be submitted to the cluster and triggered for execution. Results will be streamed to +the declared sinks. + +Usually, both APIs mark such behavior with the term `execute` in method names. However, the execution +behavior is slightly different between Table API and DataStream API. + +**DataStream API** + +The DataStream API's `StreamExecutionEnvironment` acts as a _builder pattern_ to construct +a complex pipeline. The pipeline possibly splits into multiple branches that might or might not end with +a sink. + +At least one sink must be defined. Otherwise, the following exception is thrown: +``` +java.lang.IllegalStateException: No operators defined in streaming topology. Cannot execute. +``` + +`StreamExecutionEnvironment.execute()` submits the entire constructed pipeline and clears the builder +afterward. In other words: no sources and sinks are declared anymore, and a new pipeline can be +added to the builder. Thus, every DataStream program usually ends with a call to `StreamExecutionEnvironment.execute()`. +Alternatively, `DataStream.executeAndCollect()` implicitly defines a sink for streaming the results to +the local client and only executes the current branch. + +**Table API** + +In the Table API, branching pipelines is only supported within a `StatementSet` where each branch must +declare a final sink. Both `TableEnvironment` and also `StreamTableEnvironment` do not offer a dedicated +general `execute()` method. Instead, they offer methods for submitting a single source-to-sink +pipeline or a statement set: + +```java +// execute with explicit sink +tableEnv.from("InputTable").executeInsert("OutputTable") + +tableEnv.executeSql("INSERT INTO OutputTable SELECT * FROM InputTable") + +tableEnv.createStatementSet() + .addInsert("OutputTable", tableEnv.from("InputTable")) + .addInsert("OutputTable2", tableEnv.from("InputTable")) + .execute() + +tableEnv.createStatementSet() + .addInsertSql("INSERT INTO OutputTable SELECT * FROM InputTable") + .addInsertSql("INSERT INTO OutputTable2 SELECT * FROM InputTable") + .execute() + +// execute with implicit local sink + +tableEnv.from("InputTable").execute().print() + +tableEnv.executeSql("SELECT * FROM InputTable").print() +``` + +To combine both execution behaviors, every call to `StreamTableEnvironment.toDataStream` +or `StreamTableEnvironment.toChangelogStream` will materialize (i.e. compile) the Table API sub-pipeline +and insert it into the DataStream API pipeline builder. 
This means that `StreamExecutionEnvironment.execute()` +or `DataStream.executeAndCollect` must be called afterwards. An execution in Table API will not trigger +these "external parts". + +```java +// (1) + +// adds a branch with a printing sink to the StreamExecutionEnvironment +tableEnv.toDataStream(table).print() + +// (2) + +// executes a Table API end-to-end pipeline as a Flink job and prints locally, +// thus (1) has still not been executed +table.execute().print() + +// executes the DataStream API pipeline with the sink defined in (1) as a +// Flink job, (2) was already running before +env.execute() +``` + +{{< top >}} + +Handling of (Insert-Only) Streams +--------------------------------- + +A `StreamTableEnvironment` offers the following methods to convert from and to DataStream API: + +- `fromDataStream(DataStream)`: Interprets a stream of insert-only changes and arbitrary type as +a table. Event-time and watermarks are not propagated by default. + +- `fromDataStream(DataStream, Schema)`: Interprets a stream of insert-only changes and arbitrary +type as a table. The optional schema allows to enrich column data types and add time attributes, +watermarks strategies, other computed columns, or primary keys. + +- `createTemporaryView(String, DataStream)`: Registers the stream under a name to access it in SQL. +It is a shortcut for `createTemporaryView(String, fromDataStream(DataStream))`. + +- `createTemporaryView(String, DataStream, Schema)`: Registers the stream under a name to access it in SQL. +It is a shortcut for `createTemporaryView(String, fromDataStream(DataStream, Schema))`. + +- `toDataStream(Table)`: Converts a table into a stream of insert-only changes. The default +stream record type is `org.apache.flink.types.Row`. A single rowtime attribute column is written +back into the DataStream API's record. Watermarks are propagated as well. + +- `toDataStream(Table, AbstractDataType)`: Converts a table into a stream of insert-only changes. +This method accepts a data type to express the desired stream record type. The planner might insert +implicit casts and reorders columns to map columns to fields of the (possibly nested) data type. + +- `toDataStream(Table, Class)`: A shortcut for `toDataStream(Table, DataTypes.of(Class))` +to quickly create the desired data type reflectively. + +From a Table API's perspective, converting from and to DataStream API is similar to reading from or +writing to a virtual table connector that has been defined using a [`CREATE TABLE` DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table) +in SQL. + +The schema part in the virtual `CREATE TABLE name (schema) WITH (options)` statement can be automatically +derived from the DataStream's type information, enriched, or entirely defined manually using +`org.apache.flink.table.api.Schema`. + +The virtual DataStream table connector exposes the following metadata for every row: + + + + + + + + + + + + + + + + + + +
| Key       | Data Type                   | Description                | R/W |
|-----------|-----------------------------|----------------------------|-----|
| `rowtime` | `TIMESTAMP_LTZ(3) NOT NULL` | Stream record's timestamp. | R/W |
    + +The virtual DataStream table source implements [`SupportsSourceWatermark`]({{< ref "docs/dev/table/sourcesSinks" >}}#source-abilities) +and thus allows calling the `SOURCE_WATERMARK()` built-in function as a watermark strategy to adopt +watermarks from the DataStream API. + +### Examples for `fromDataStream` + +The following code shows how to use `fromDataStream` for different scenarios. + +{{< tabs "079cdf25-21ef-4393-ad69-623510027a1b" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; +import java.time.Instant; + +// some example POJO +public static class User { + public String name; + + public Integer score; + + public Instant event_time; + + // default constructor for DataStream API + public User() {} + + // fully assigning constructor for Table API + public User(String name, Integer score, Instant event_time) { + this.name = name; + this.score = score; + this.event_time = event_time; + } +} + +// create a DataStream +DataStream dataStream = + env.fromElements( + new User("Alice", 4, Instant.ofEpochMilli(1000)), + new User("Bob", 6, Instant.ofEpochMilli(1001)), + new User("Alice", 10, Instant.ofEpochMilli(1002))); + + +// === EXAMPLE 1 === + +// derive all physical columns automatically + +Table table = tableEnv.fromDataStream(dataStream); +table.printSchema(); +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9) +// ) + + +// === EXAMPLE 2 === + +// derive all physical columns automatically +// but add computed columns (in this case for creating a proctime attribute column) + +Table table = tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByExpression("proc_time", "PROCTIME()") + .build()); +table.printSchema(); +// prints: +// ( +// `name` STRING, +// `score` INT NOT NULL, +// `event_time` TIMESTAMP_LTZ(9), +// `proc_time` TIMESTAMP_LTZ(3) NOT NULL *PROCTIME* AS PROCTIME() +//) + + +// === EXAMPLE 3 === + +// derive all physical columns automatically +// but add computed columns (in this case for creating a rowtime attribute column) +// and a custom watermark strategy + +Table table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByExpression("rowtime", "CAST(event_time AS TIMESTAMP_LTZ(3))") + .watermark("rowtime", "rowtime - INTERVAL '10' SECOND") + .build()); +table.printSchema(); +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9), +// `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* AS CAST(event_time AS TIMESTAMP_LTZ(3)), +// WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS rowtime - INTERVAL '10' SECOND +// ) + + +// === EXAMPLE 4 === + +// derive all physical columns automatically +// but access the stream record's timestamp for creating a rowtime attribute column +// also rely on the watermarks generated in the DataStream API + +// we assume that a watermark strategy has been defined for `dataStream` before +// (not part of this example) +Table table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)") + .watermark("rowtime", "SOURCE_WATERMARK()") + .build()); +table.printSchema(); +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9), +// `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA, +// WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK() +// ) + + +// === EXAMPLE 5 === + +// define physical columns manually +// in this example, 
+// - we can reduce the default precision of timestamps from 9 to 3 +// - we also project the columns and put `event_time` to the beginning + +Table table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .column("event_time", "TIMESTAMP_LTZ(3)") + .column("name", "STRING") + .column("score", "INT") + .watermark("event_time", "SOURCE_WATERMARK()") + .build()); +table.printSchema(); +// prints: +// ( +// `event_time` TIMESTAMP_LTZ(3) *ROWTIME*, +// `name` VARCHAR(200), +// `score` INT +// ) +// note: the watermark strategy is not shown due to the inserted column reordering projection +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ +import java.time.Instant; + +// some example case class +case class User(name: String, score: java.lang.Integer, event_time: java.time.Instant) + +// create a DataStream +val dataStream = env.fromElements( + User("Alice", 4, Instant.ofEpochMilli(1000)), + User("Bob", 6, Instant.ofEpochMilli(1001)), + User("Alice", 10, Instant.ofEpochMilli(1002))) + + +// === EXAMPLE 1 === + +// derive all physical columns automatically + +val table = tableEnv.fromDataStream(dataStream) +table.printSchema() +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9) +// ) + + +// === EXAMPLE 2 === + +// derive all physical columns automatically +// but add computed columns (in this case for creating a proctime attribute column) + +val table = tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByExpression("proc_time", "PROCTIME()") + .build()) +table.printSchema() +// prints: +// ( +// `name` STRING, +// `score` INT NOT NULL, +// `event_time` TIMESTAMP_LTZ(9), +// `proc_time` TIMESTAMP_LTZ(3) NOT NULL *PROCTIME* AS PROCTIME() +//) + + +// === EXAMPLE 3 === + +// derive all physical columns automatically +// but add computed columns (in this case for creating a rowtime attribute column) +// and a custom watermark strategy + +val table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByExpression("rowtime", "CAST(event_time AS TIMESTAMP_LTZ(3))") + .watermark("rowtime", "rowtime - INTERVAL '10' SECOND") + .build()) +table.printSchema() +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9), +// `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* AS CAST(event_time AS TIMESTAMP_LTZ(3)), +// WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS rowtime - INTERVAL '10' SECOND +// ) + + +// === EXAMPLE 4 === + +// derive all physical columns automatically +// but access the stream record's timestamp for creating a rowtime attribute column +// also rely on the watermarks generated in the DataStream API + +// we assume that a watermark strategy has been defined for `dataStream` before +// (not part of this example) +val table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)") + .watermark("rowtime", "SOURCE_WATERMARK()") + .build()) +table.printSchema() +// prints: +// ( +// `name` STRING, +// `score` INT, +// `event_time` TIMESTAMP_LTZ(9), +// `rowtime` TIMESTAMP_LTZ(3) *ROWTIME* METADATA, +// WATERMARK FOR `rowtime`: TIMESTAMP_LTZ(3) AS SOURCE_WATERMARK() +// ) + + +// === EXAMPLE 5 === + +// define physical columns manually +// in this example, +// - we can reduce the default precision of timestamps from 9 to 3 +// - we also project the columns and put `event_time` to the beginning + +val table = + tableEnv.fromDataStream( + dataStream, + Schema.newBuilder() + .column("event_time", 
"TIMESTAMP_LTZ(3)") + .column("name", "STRING") + .column("score", "INT") + .watermark("event_time", "SOURCE_WATERMARK()") + .build()) +table.printSchema() +// prints: +// ( +// `event_time` TIMESTAMP_LTZ(3) *ROWTIME*, +// `name` VARCHAR(200), +// `score` INT +// ) +// note: the watermark strategy is not shown due to the inserted column reordering projection +``` +{{< /tab >}} +{{< /tabs >}} + +Example 1 illustrates a simple use case when no time-based operations are needed. + +Example 4 is the most common use case when time-based operations such as windows or interval +joins should be part of the pipeline. Example 2 is the most common use case when these time-based +operations should work in processing time. + +Example 5 entirely relies on the declaration of the user. This can be useful to replace generic types +from the DataStream API (which would be `RAW` in the Table API) with proper data types. + +Since `DataType` is richer than `TypeInformation`, we can easily enable immutable POJOs and other complex +data structures. The following example in Java shows what is possible. Check also the +[Data Types & Serialization]({{< ref "docs/dev/serialization/types_serialization" >}}) page of +the DataStream API for more information about the supported types there. + +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; + +// the DataStream API does not support immutable POJOs yet, +// the class will result in a generic type that is a RAW type in Table API by default +public static class User { + + public final String name; + + public final Integer score; + + public User(String name, Integer score) { + this.name = name; + this.score = score; + } +} + +// create a DataStream +DataStream dataStream = env.fromElements( + new User("Alice", 4), + new User("Bob", 6), + new User("Alice", 10)); + +// since fields of a RAW type cannot be accessed, every stream record is treated as an atomic type +// leading to a table with a single column `f0` + +Table table = tableEnv.fromDataStream(dataStream); +table.printSchema(); +// prints: +// ( +// `f0` RAW('User', '...') +// ) + +// instead, declare a more useful data type for columns using the Table API's type system +// in a custom schema and rename the columns in a following `as` projection + +Table table = tableEnv + .fromDataStream( + dataStream, + Schema.newBuilder() + .column("f0", DataTypes.of(User.class)) + .build()) + .as("user"); +table.printSchema(); +// prints: +// ( +// `user` *User<`name` STRING,`score` INT>* +// ) + +// data types can be extracted reflectively as above or explicitly defined + +Table table3 = tableEnv + .fromDataStream( + dataStream, + Schema.newBuilder() + .column( + "f0", + DataTypes.STRUCTURED( + User.class, + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("score", DataTypes.INT()))) + .build()) + .as("user"); +table.printSchema(); +// prints: +// ( +// `user` *User<`name` STRING,`score` INT>* +// ) +``` + +### Examples for `createTemporaryView` + +A `DataStream` can be registered directly as a view (possibly enriched with a schema). + +{{< hint info >}} +Views created from a `DataStream` can only be registered as temporary views. Due to their _inline_/_anonymous_ +nature, it is not possible to register them in a permanent catalog. +{{< /hint >}} + +The following code shows how to use `createTemporaryView` for different scenarios. 
+ +{{< tabs "03d19c44-b994-4991-8c66-00189a2ec5d5" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.streaming.api.datastream.DataStream; + +// create some DataStream +DataStream> dataStream = env.fromElements( + Tuple2.of(12L, "Alice"), + Tuple2.of(0L, "Bob")); + + +// === EXAMPLE 1 === + +// register the DataStream as view "MyView" in the current session +// all columns are derived automatically + +tableEnv.createTemporaryView("MyView", dataStream); + +tableEnv.from("MyView").printSchema(); + +// prints: +// ( +// `f0` BIGINT NOT NULL, +// `f1` STRING +// ) + + +// === EXAMPLE 2 === + +// register the DataStream as view "MyView" in the current session, +// provide a schema to adjust the columns similar to `fromDataStream` + +// in this example, the derived NOT NULL information has been removed + +tableEnv.createTemporaryView( + "MyView", + dataStream, + Schema.newBuilder() + .column("f0", "BIGINT") + .column("f1", "STRING") + .build()); + +tableEnv.from("MyView").printSchema(); + +// prints: +// ( +// `f0` BIGINT, +// `f1` STRING +// ) + + +// === EXAMPLE 3 === + +// use the Table API before creating the view if it is only about renaming columns + +tableEnv.createTemporaryView( + "MyView", + tableEnv.fromDataStream(dataStream).as("id", "name")); + +tableEnv.from("MyView").printSchema(); + +// prints: +// ( +// `id` BIGINT NOT NULL, +// `name` STRING +// ) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// create some DataStream +val dataStream: DataStream[(Long, String)] = env.fromElements( + (12L, "Alice"), + (0L, "Bob")) + + +// === EXAMPLE 1 === + +// register the DataStream as view "MyView" in the current session +// all columns are derived automatically + +tableEnv.createTemporaryView("MyView", dataStream) + +tableEnv.from("MyView").printSchema() + +// prints: +// ( +// `_1` BIGINT NOT NULL, +// `_2` STRING +// ) + + +// === EXAMPLE 2 === + +// register the DataStream as view "MyView" in the current session, +// provide a schema to adjust the columns similar to `fromDataStream` + +// in this example, the derived NOT NULL information has been removed + +tableEnv.createTemporaryView( + "MyView", + dataStream, + Schema.newBuilder() + .column("_1", "BIGINT") + .column("_2", "STRING") + .build()) + +tableEnv.from("MyView").printSchema() + +// prints: +// ( +// `_1` BIGINT, +// `_2` STRING +// ) + + +// === EXAMPLE 3 === + +// use the Table API before creating the view if it is only about renaming columns + +tableEnv.createTemporaryView( + "MyView", + tableEnv.fromDataStream(dataStream).as("id", "name")) + +tableEnv.from("MyView").printSchema() + +// prints: +// ( +// `id` BIGINT NOT NULL, +// `name` STRING +// ) +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +### Examples for `toDataStream` + +The following code shows how to use `toDataStream` for different scenarios. 
+ +{{< tabs "213ed312-f854-477a-a8be-2830f04d7154" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Table; +import org.apache.flink.types.Row; +import java.time.Instant; + +// POJO with mutable fields +// since no fully assigning constructor is defined, the field order +// is alphabetical [event_time, name, score] +public static class User { + + public String name; + + public Integer score; + + public Instant event_time; +} + +tableEnv.executeSql( + "CREATE TABLE GeneratedTable " + + "(" + + " name STRING," + + " score INT," + + " event_time TIMESTAMP_LTZ(3)," + + " WATERMARK FOR event_time AS event_time - INTERVAL '10' SECOND" + + ")" + + "WITH ('connector'='datagen')"); + +Table table = tableEnv.from("GeneratedTable"); + + +// === EXAMPLE 1 === + +// use the default conversion to instances of Row + +// since `event_time` is a single rowtime attribute, it is inserted into the DataStream +// metadata and watermarks are propagated + +DataStream dataStream = tableEnv.toDataStream(table); + + +// === EXAMPLE 2 === + +// a data type is extracted from class `User`, +// the planner reorders fields and inserts implicit casts where possible to convert internal +// data structures to the desired structured type + +// since `event_time` is a single rowtime attribute, it is inserted into the DataStream +// metadata and watermarks are propagated + +DataStream dataStream = tableEnv.toDataStream(table, User.class); + +// data types can be extracted reflectively as above or explicitly defined + +DataStream dataStream = + tableEnv.toDataStream( + table, + DataTypes.STRUCTURED( + User.class, + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("score", DataTypes.INT()), + DataTypes.FIELD("event_time", DataTypes.TIMESTAMP_LTZ(3)))); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.streaming.api.scala.DataStream +import org.apache.flink.table.api.DataTypes + +case class User(name: String, score: java.lang.Integer, event_time: java.time.Instant) + +tableEnv.executeSql( + "CREATE TABLE GeneratedTable " + + "(" + + " name STRING," + + " score INT," + + " event_time TIMESTAMP_LTZ(3)," + + " WATERMARK FOR event_time AS event_time - INTERVAL '10' SECOND" + + ")" + + "WITH ('connector'='datagen')") + +val table = tableEnv.from("GeneratedTable") + + +// === EXAMPLE 1 === + +// use the default conversion to instances of Row + +// since `event_time` is a single rowtime attribute, it is inserted into the DataStream +// metadata and watermarks are propagated + +val dataStream: DataStream[Row] = tableEnv.toDataStream(table) + + +// === EXAMPLE 2 === + +// a data type is extracted from class `User`, +// the planner reorders fields and inserts implicit casts where possible to convert internal +// data structures to the desired structured type + +// since `event_time` is a single rowtime attribute, it is inserted into the DataStream +// metadata and watermarks are propagated + +val dataStream: DataStream[User] = tableEnv.toDataStream(table, User.class) + +// data types can be extracted reflectively as above or explicitly defined + +val dataStream: DataStream[User] = + tableEnv.toDataStream( + table, + DataTypes.STRUCTURED( + User.class, + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("score", DataTypes.INT()), + DataTypes.FIELD("event_time", DataTypes.TIMESTAMP_LTZ(3)))) +``` +{{< /tab >}} +{{< /tabs >}} + +Note that only non-updating tables are 
supported by `toDataStream`. Usually, time-based operations +such as windows, interval joins, or the `MATCH_RECOGNIZE` clause are a good fit for insert-only +pipelines next to simple operations like projections and filters. Pipelines with operations that +produce updates can use `toChangelogStream`. + +{{< top >}} + +Handling of Changelog Streams +----------------------------- + +Internally, Flink's table runtime is a changelog processor. The concepts page describes how +[dynamic tables and streams relate]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}) +to each other. + +A `StreamTableEnvironment` offers the following methods to expose these _change data capture_ (CDC) +functionalities: + +- `fromChangelogStream(DataStream)`: Interprets a stream of changelog entries as a table. The stream +record type must be `org.apache.flink.types.Row` since its `RowKind` flag is evaluated during runtime. +Event-time and watermarks are not propagated by default. This method expects a changelog containing +all kinds of changes (enumerated in `org.apache.flink.types.RowKind`) as the default `ChangelogMode`. + +- `fromChangelogStream(DataStream, Schema)`: Allows to define a schema for the `DataStream` similar +to `fromDataStream(DataStream, Schema)`. Otherwise the semantics are equal to `fromChangelogStream(DataStream)`. + +- `fromChangelogStream(DataStream, Schema, ChangelogMode)`: Gives full control about how to interpret a +stream as a changelog. The passed `ChangelogMode` helps the planner to distinguish between _insert-only_, +_upsert_, or _retract_ behavior. + +- `toChangelogStream(Table)`: Reverse operation of `fromChangelogStream(DataStream)`. It produces a +stream with instances of `org.apache.flink.types.Row` and sets the `RowKind` flag for every record +at runtime. All kinds of updating tables are supported by this method. If the input table contains a +single rowtime column, it will be propagated into a stream record's timestamp. Watermarks will be +propagated as well. + +- `toChangelogStream(Table, Schema)`: Reverse operation of `fromChangelogStream(DataStream, Schema)`. +The method can enrich the produced column data types. The planner might insert implicit casts if necessary. +It is possible to write out the rowtime as a metadata column. + +- `toChangelogStream(Table, Schema, ChangelogMode)`: Gives full control about how to convert a table +to a changelog stream. The passed `ChangelogMode` helps the planner to distinguish between _insert-only_, +_upsert_, or _retract_ behavior. + +From a Table API's perspective, converting from and to DataStream API is similar to reading from or +writing to a virtual table connector that has been defined using a [`CREATE TABLE` DDL]({{< ref "docs/dev/table/sql/create" >}}#create-table) +in SQL. + +Because `fromChangelogStream` behaves similar to `fromDataStream`, we recommend reading +the [previous section](#handling-of-insert-only-streams) before continuing here. + +This virtual connector also supports reading and writing the `rowtime` metadata of the stream record. + +The virtual table source implements [`SupportsSourceWatermark`]({{< ref "docs/dev/table/sourcesSinks" >}}#source-abilities). + +### Examples for `fromChangelogStream` + +The following code shows how to use `fromChangelogStream` for different scenarios. 
+ +{{< tabs "11927973-ce73-4e95-b912-0759e8013f24" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; + +// === EXAMPLE 1 === + +// interpret the stream as a retract stream + +// create a changelog DataStream +DataStream dataStream = + env.fromElements( + Row.ofKind(RowKind.INSERT, "Alice", 12), + Row.ofKind(RowKind.INSERT, "Bob", 5), + Row.ofKind(RowKind.UPDATE_BEFORE, "Alice", 12), + Row.ofKind(RowKind.UPDATE_AFTER, "Alice", 100)); + +// interpret the DataStream as a Table +Table table = tableEnv.fromChangelogStream(dataStream); + +// register the table under a name and perform an aggregation +tableEnv.createTemporaryView("InputTable", table); +tableEnv + .executeSql("SELECT f0 AS name, SUM(f1) AS score FROM InputTable GROUP BY f0") + .print(); + +// prints: +// +----+--------------------------------+-------------+ +// | op | name | score | +// +----+--------------------------------+-------------+ +// | +I | Bob | 5 | +// | +I | Alice | 12 | +// | -D | Alice | 12 | +// | +I | Alice | 100 | +// +----+--------------------------------+-------------+ + + +// === EXAMPLE 2 === + +// interpret the stream as an upsert stream (without a need for UPDATE_BEFORE) + +// create a changelog DataStream +DataStream dataStream = + env.fromElements( + Row.ofKind(RowKind.INSERT, "Alice", 12), + Row.ofKind(RowKind.INSERT, "Bob", 5), + Row.ofKind(RowKind.UPDATE_AFTER, "Alice", 100)); + +// interpret the DataStream as a Table +Table table = + tableEnv.fromChangelogStream( + dataStream, + Schema.newBuilder().primaryKey("f0").build(), + ChangelogMode.upsert()); + +// register the table under a name and perform an aggregation +tableEnv.createTemporaryView("InputTable", table); +tableEnv + .executeSql("SELECT f0 AS name, SUM(f1) AS score FROM InputTable GROUP BY f0") + .print(); + +// prints: +// +----+--------------------------------+-------------+ +// | op | name | score | +// +----+--------------------------------+-------------+ +// | +I | Bob | 5 | +// | +I | Alice | 12 | +// | -D | Alice | 12 | +// | +I | Alice | 100 | +// +----+--------------------------------+-------------+ +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala.typeutils.Types +import org.apache.flink.table.api.Schema +import org.apache.flink.table.connector.ChangelogMode +import org.apache.flink.types.{Row, RowKind} + +// === EXAMPLE 1 === + +// interpret the stream as a retract stream + +// create a changelog DataStream +val dataStream = env.fromElements( + Row.ofKind(RowKind.INSERT, "Alice", Int.box(12)), + Row.ofKind(RowKind.INSERT, "Bob", Int.box(5)), + Row.ofKind(RowKind.UPDATE_BEFORE, "Alice", Int.box(12)), + Row.ofKind(RowKind.UPDATE_AFTER, "Alice", Int.box(100)) +)(Types.ROW(Types.STRING, Types.INT)) + + +// interpret the DataStream as a Table +val table = tableEnv.fromChangelogStream(dataStream) + +// register the table under a name and perform an aggregation +tableEnv.createTemporaryView("InputTable", table) +tableEnv + .executeSql("SELECT f0 AS name, SUM(f1) AS score FROM InputTable GROUP BY f0") + .print() + +// prints: +// +----+--------------------------------+-------------+ +// | op | name | score | +// +----+--------------------------------+-------------+ +// | +I | Bob | 5 | +// | +I | Alice | 12 | +// | -D | Alice | 12 | +// | +I | 
Alice | 100 | +// +----+--------------------------------+-------------+ + + +// === EXAMPLE 2 === + +// interpret the stream as an upsert stream (without a need for UPDATE_BEFORE) + +// create a changelog DataStream +val dataStream = env.fromElements( + Row.ofKind(RowKind.INSERT, "Alice", Int.box(12)), + Row.ofKind(RowKind.INSERT, "Bob", Int.box(5)), + Row.ofKind(RowKind.UPDATE_AFTER, "Alice", Int.box(100)) +)(Types.ROW(Types.STRING, Types.INT)) + +// interpret the DataStream as a Table +val table = + tableEnv.fromChangelogStream( + dataStream, + Schema.newBuilder().primaryKey("f0").build(), + ChangelogMode.upsert()) + +// register the table under a name and perform an aggregation +tableEnv.createTemporaryView("InputTable", table) +tableEnv + .executeSql("SELECT f0 AS name, SUM(f1) AS score FROM InputTable GROUP BY f0") + .print() + +// prints: +// +----+--------------------------------+-------------+ +// | op | name | score | +// +----+--------------------------------+-------------+ +// | +I | Bob | 5 | +// | +I | Alice | 12 | +// | -D | Alice | 12 | +// | +I | Alice | 100 | +// +----+--------------------------------+-------------+ +``` +{{< /tab >}} +{{< /tabs >}} + +The default `ChangelogMode` shown in example 1 should be sufficient for most use cases as it accepts +all kinds of changes. + +However, example 2 shows how to limit the kinds of incoming changes for efficiency by reducing the +number of update messages by 50%. + +### Examples for `toChangelogStream` + +The following code shows how to use `toChangelogStream` for different scenarios. + +{{< tabs "fc4cd538-4345-49ee-b86e-b308f002e069" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.functions.ProcessFunction; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.api.Schema; +import org.apache.flink.table.api.Table; +import org.apache.flink.table.data.StringData; +import org.apache.flink.types.Row; +import org.apache.flink.util.Collector; +import static org.apache.flink.table.api.Expressions.*; + +// create Table with event-time +tableEnv.executeSql( + "CREATE TABLE GeneratedTable " + + "(" + + " name STRING," + + " score INT," + + " event_time TIMESTAMP_LTZ(3)," + + " WATERMARK FOR event_time AS event_time - INTERVAL '10' SECOND" + + ")" + + "WITH ('connector'='datagen')"); + +Table table = tableEnv.from("GeneratedTable"); + + +// === EXAMPLE 1 === + +// convert to DataStream in the simplest and most general way possible (no event-time) + +Table simpleTable = tableEnv + .fromValues(row("Alice", 12), row("Alice", 2), row("Bob", 12)) + .as("name", "score") + .groupBy($("name")) + .select($("name"), $("score").sum()); + +tableEnv + .toChangelogStream(simpleTable) + .executeAndCollect() + .forEachRemaining(System.out::println); + +// prints: +// +I[Bob, 12] +// +I[Alice, 12] +// -U[Alice, 12] +// +U[Alice, 14] + + +// === EXAMPLE 2 === + +// convert to DataStream in the simplest and most general way possible (with event-time) + +DataStream dataStream = tableEnv.toChangelogStream(table); + +// since `event_time` is a single time attribute in the schema, it is set as the +// stream record's timestamp by default; however, at the same time, it remains part of the Row + +dataStream.process( + new ProcessFunction() { + @Override + public void processElement(Row row, Context ctx, Collector out) { + + // prints: [name, score, event_time] + System.out.println(row.getFieldNames(true)); + + // timestamp exists twice + assert 
ctx.timestamp() == row.getFieldAs("event_time").toEpochMilli(); + } + }); +env.execute(); + + +// === EXAMPLE 3 === + +// convert to DataStream but write out the time attribute as a metadata column which means +// it is not part of the physical schema anymore + +DataStream dataStream = tableEnv.toChangelogStream( + table, + Schema.newBuilder() + .column("name", "STRING") + .column("score", "INT") + .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)") + .build()); + +// the stream record's timestamp is defined by the metadata; it is not part of the Row + +dataStream.process( + new ProcessFunction() { + @Override + public void processElement(Row row, Context ctx, Collector out) { + + // prints: [name, score] + System.out.println(row.getFieldNames(true)); + + // timestamp exists once + System.out.println(ctx.timestamp()); + } + }); +env.execute(); + + +// === EXAMPLE 4 === + +// for advanced users, it is also possible to use more internal data structures for efficiency + +// note that this is only mentioned here for completeness because using internal data structures +// adds complexity and additional type handling + +// however, converting a TIMESTAMP_LTZ column to `Long` or STRING to `byte[]` might be convenient, +// also structured types can be represented as `Row` if needed + +DataStream dataStream = tableEnv.toChangelogStream( + table, + Schema.newBuilder() + .column( + "name", + DataTypes.STRING().bridgedTo(StringData.class)) + .column( + "score", + DataTypes.INT()) + .column( + "event_time", + DataTypes.TIMESTAMP_LTZ(3).bridgedTo(Long.class)) + .build()); + +// leads to a stream of Row(name: StringData, score: Integer, event_time: Long) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.api.scala._ +import org.apache.flink.streaming.api.functions.ProcessFunction +import org.apache.flink.streaming.api.scala.DataStream +import org.apache.flink.table.api._ +import org.apache.flink.types.Row +import org.apache.flink.util.Collector +import java.time.Instant + +// create Table with event-time +tableEnv.executeSql( + "CREATE TABLE GeneratedTable " + + "(" + + " name STRING," + + " score INT," + + " event_time TIMESTAMP_LTZ(3)," + + " WATERMARK FOR event_time AS event_time - INTERVAL '10' SECOND" + + ")" + + "WITH ('connector'='datagen')") + +val table = tableEnv.from("GeneratedTable") + + +// === EXAMPLE 1 === + +// convert to DataStream in the simplest and most general way possible (no event-time) + +val simpleTable = tableEnv + .fromValues(row("Alice", 12), row("Alice", 2), row("Bob", 12)) + .as("name", "score") + .groupBy($"name") + .select($"name", $"score".sum()) + +tableEnv + .toChangelogStream(simpleTable) + .executeAndCollect() + .foreach(println) + +// prints: +// +I[Bob, 12] +// +I[Alice, 12] +// -U[Alice, 12] +// +U[Alice, 14] + + +// === EXAMPLE 2 === + +// convert to DataStream in the simplest and most general way possible (with event-time) + +val dataStream: DataStream[Row] = tableEnv.toChangelogStream(table) + +// since `event_time` is a single time attribute in the schema, it is set as the +// stream record's timestamp by default; however, at the same time, it remains part of the Row + +dataStream.process(new ProcessFunction[Row, Unit] { + override def processElement( + row: Row, + ctx: ProcessFunction[Row, Unit]#Context, + out: Collector[Unit]): Unit = { + + // prints: [name, score, event_time] + println(row.getFieldNames(true)) + + // timestamp exists twice + assert(ctx.timestamp() == row.getFieldAs[Instant]("event_time").toEpochMilli) + } +}) 
+env.execute() + + +// === EXAMPLE 3 === + +// convert to DataStream but write out the time attribute as a metadata column which means +// it is not part of the physical schema anymore + +val dataStream: DataStream[Row] = tableEnv.toChangelogStream( + table, + Schema.newBuilder() + .column("name", "STRING") + .column("score", "INT") + .columnByMetadata("rowtime", "TIMESTAMP_LTZ(3)") + .build()) + +// the stream record's timestamp is defined by the metadata; it is not part of the Row + +dataStream.process(new ProcessFunction[Row, Unit] { + override def processElement( + row: Row, + ctx: ProcessFunction[Row, Unit]#Context, + out: Collector[Unit]): Unit = { + + // prints: [name, score] + println(row.getFieldNames(true)) + + // timestamp exists once + println(ctx.timestamp()) + } +}) +env.execute() + + +// === EXAMPLE 4 === + +// for advanced users, it is also possible to use more internal data structures for better +// efficiency + +// note that this is only mentioned here for completeness because using internal data structures +// adds complexity and additional type handling + +// however, converting a TIMESTAMP_LTZ column to `Long` or STRING to `byte[]` might be convenient, +// also structured types can be represented as `Row` if needed + +val dataStream: DataStream[Row] = tableEnv.toChangelogStream( + table, + Schema.newBuilder() + .column( + "name", + DataTypes.STRING().bridgedTo(classOf[StringData])) + .column( + "score", + DataTypes.INT()) + .column( + "event_time", + DataTypes.TIMESTAMP_LTZ(3).bridgedTo(class[Long])) + .build()) + +// leads to a stream of Row(name: StringData, score: Integer, event_time: Long) +``` +{{< /tab >}} +{{< /tabs >}} + +For more information about which conversions are supported for data types in Example 4, see the +[Table API's Data Types page]({{< ref "docs/dev/table/types" >}}). + +The behavior of `toChangelogStream(Table).executeAndCollect()` is equal to calling `Table.execute().collect()`. +However, `toChangelogStream(Table)` might be more useful for tests because it allows to access the produced +watermarks in a subsequent `ProcessFunction` in DataStream API. + +{{< top >}} + +Mapping between TypeInformation and DataType +-------------------------------------------- + +The DataStream API uses instances of `org.apache.flink.api.common.typeinfo.TypeInformation` to describe +the record type that travels in the stream. In particular, it defines how to serialize and deserialize +records from one DataStream operator to the other. It also helps in serializing state into savepoints +and checkpoints. + +The Table API uses custom data structures to represent records internally and exposes `org.apache.flink.table.types.DataType` +to users for declaring the external format into which the data structures are converted for easier +usage in sources, sinks, UDFs, or DataStream API. + +`DataType` is richer than `TypeInformation` as it also includes details about the logical SQL type. +Therefore, some details will be added implicitly during the conversion. + +Column names and types of a `Table` are automatically derived from the `TypeInformation` of the +`DataStream`. Use `DataStream.getType()` to check whether the type information has been detected +correctly via the DataStream API's reflective type extraction facilities. If the outermost record's +`TypeInformation` is a `CompositeType`, it will be flattened in the first level when deriving a table's +schema. 
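+A minimal sketch of the derivation described above, assuming the `env` and `tableEnv` variables from the
+earlier examples on this page (the exact `toString` output of the extracted `TypeInformation` is indicative only):
+
+```java
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.table.api.Table;
+
+// Tuple2 is a CompositeType, so only its first level is flattened when deriving the schema
+DataStream<Tuple2<Long, String>> stream = env.fromElements(
+    Tuple2.of(12L, "Alice"),
+    Tuple2.of(0L, "Bob"));
+
+// check what the reflective type extraction has detected
+System.out.println(stream.getType());
+// e.g. prints: Java Tuple2<Long, String>
+
+// the tuple's first level becomes the columns `f0` and `f1`
+Table table = tableEnv.fromDataStream(stream);
+table.printSchema();
+// prints:
+// (
+// `f0` BIGINT NOT NULL,
+// `f1` STRING
+// )
+```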
+ +### TypeInformation to DataType + +The following rules apply when converting `TypeInformation` to a `DataType`: + +- All subclasses of `TypeInformation` are mapped to logical types, including nullability that is aligned +with Flink's built-in serializers. + +- Subclasses of `TupleTypeInfoBase` are translated into a row (for `Row`) or structured type (for tuples, +POJOs, and case classes). + +- `BigDecimal` is converted to `DECIMAL(38, 18)` by default. + +- The order of `PojoTypeInfo` fields is determined by a constructor with all fields as its parameters. +If that is not found during the conversion, the field order will be alphabetical. + +- `GenericTypeInfo` and other `TypeInformation` that cannot be represented as one of the listed +`org.apache.flink.table.api.DataTypes` will be treated as a black-box `RAW` type. The current session +configuration is used to materialize the serializer of the raw type. Composite nested fields will not +be accessible then. + +- See {{< gh_link file="flink-table/flink-table-common/src/main/java/org/apache/flink/table/types/utils/TypeInfoDataTypeConverter.java" name="TypeInfoDataTypeConverter" >}} for the full translation logic. + +Use `DataTypes.of(TypeInformation)` to call the above logic in custom schema declaration or in UDFs. + +### DataType to TypeInformation + +The table runtime will make sure to properly serialize the output records to the first operator of the +DataStream API. + +{{< hint warning >}} +Afterward, the type information semantics of the DataStream API need to be considered. +{{< /hint >}} + +{{< top >}} + +Legacy Conversion +----------------- + +{{< hint info >}} +The following section describes outdated parts of the API that will be removed in future versions. + +In particular, these parts might not be well integrated into many recent new features and refactorings +(e.g. `RowKind` is not correctly set, type systems don't integrate smoothly). +{{< /hint >}} + + + +### 将 DataStream 或 DataSet 转换成表 + +与在 `TableEnvironment` 注册 `DataStream` 或 `DataSet` 不同,DataStream 和 DataSet 还可以直接转换成 `Table`。如果你想在 Table API 的查询中使用表,这将非常便捷。 + +{{< tabs "53265853-e08d-4f70-93f8-c0f6d1b75e48" >}} +{{< tab "Java" >}} +```java +// get StreamTableEnvironment +// registration of a DataSet in a BatchTableEnvironment is equivalent +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +DataStream> stream = ... + +// Convert the DataStream into a Table with default fields "f0", "f1" +Table table1 = tableEnv.fromDataStream(stream); + +// Convert the DataStream into a Table with fields "myLong", "myString" +Table table2 = tableEnv.fromDataStream(stream, $("myLong"), $("myString")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get TableEnvironment +// registration of a DataSet is equivalent +val tableEnv = ... // see "Create a TableEnvironment" section + +val stream: DataStream[(Long, String)] = ... 
+ +// convert the DataStream into a Table with default fields "_1", "_2" +val table1: Table = tableEnv.fromDataStream(stream) + +// convert the DataStream into a Table with fields "myLong", "myString" +val table2: Table = tableEnv.fromDataStream(stream, $"myLong", $"myString") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + + + +### 将表转换成 DataStream 或 DataSet + +`Table` 可以被转换成 `DataStream` 或 `DataSet`。通过这种方式,定制的 DataSet 或 DataStream 程序就可以在 Table API 或者 SQL 的查询结果上运行了。 + +将 `Table` 转换为 `DataStream` 或者 `DataSet` 时,你需要指定生成的 `DataStream` 或者 `DataSet` 的数据类型,即,`Table` 的每行数据要转换成的数据类型。通常最方便的选择是转换成 `Row` 。以下列表概述了不同选项的功能: + +- **Row**: 字段按位置映射,字段数量任意,支持 `null` 值,无类型安全(type-safe)检查。 +- **POJO**: 字段按名称映射(POJO 必须按`Table` 中字段名称命名),字段数量任意,支持 `null` 值,无类型安全检查。 +- **Case Class**: 字段按位置映射,不支持 `null` 值,有类型安全检查。 +- **Tuple**: 字段按位置映射,字段数量少于 22(Scala)或者 25(Java),不支持 `null` 值,无类型安全检查。 +- **Atomic Type**: `Table` 必须有一个字段,不支持 `null` 值,有类型安全检查。 + + + +#### 将表转换成 DataStream + +流式查询(streaming query)的结果表会动态更新,即,当新纪录到达查询的输入流时,查询结果会改变。因此,像这样将动态查询结果转换成 DataStream 需要对表的更新方式进行编码。 + +将 `Table` 转换为 `DataStream` 有两种模式: + +1. **Append Mode**: 仅当动态 `Table` 仅通过`INSERT`更改进行修改时,才可以使用此模式,即,它仅是追加操作,并且之前输出的结果永远不会更新。 +2. **Retract Mode**: 任何情形都可以使用此模式。它使用 boolean 值对 `INSERT` 和 `DELETE` 操作的数据进行标记。 + +{{< tabs "9533a9f4-e6e8-44d7-a29c-33713724eacc" >}} +{{< tab "Java" >}} +```java +// get StreamTableEnvironment. +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +// Table with two fields (String name, Integer age) +Table table = ... + +// convert the Table into an append DataStream of Row by specifying the class +DataStream dsRow = tableEnv.toAppendStream(table, Row.class); + +// convert the Table into an append DataStream of Tuple2 +// via a TypeInformation +TupleTypeInfo> tupleType = new TupleTypeInfo<>( + Types.STRING(), + Types.INT()); +DataStream> dsTuple = + tableEnv.toAppendStream(table, tupleType); + +// convert the Table into a retract DataStream of Row. +// A retract stream of type X is a DataStream>. +// The boolean field indicates the type of the change. +// True is INSERT, false is DELETE. +DataStream> retractStream = + tableEnv.toRetractStream(table, Row.class); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get TableEnvironment. +// registration of a DataSet is equivalent +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +// Table with two fields (String name, Integer age) +val table: Table = ... + +// convert the Table into an append DataStream of Row +val dsRow: DataStream[Row] = tableEnv.toAppendStream[Row](table) + +// convert the Table into an append DataStream of Tuple2[String, Int] +val dsTuple: DataStream[(String, Int)] dsTuple = + tableEnv.toAppendStream[(String, Int)](table) + +// convert the Table into a retract DataStream of Row. +// A retract stream of type X is a DataStream[(Boolean, X)]. +// The boolean field indicates the type of the change. +// True is INSERT, false is DELETE. 
+val retractStream: DataStream[(Boolean, Row)] = tableEnv.toRetractStream[Row](table) +``` +{{< /tab >}} +{{< /tabs >}} + +**注意:** 文档[动态表](streaming/dynamic_tables.html)给出了有关动态表及其属性的详细讨论。 + +注意 **一旦 Table 被转化为 DataStream,必须使用 StreamExecutionEnvironment 的 execute 方法执行该 DataStream 作业。** + + + +#### 将表转换成 DataSet + +将 `Table` 转换成 `DataSet` 的过程如下: + +{{< tabs "65c0fb90-3108-4dc4-9130-626712549183" >}} +{{< tab "Java" >}} +```java +// get BatchTableEnvironment +BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env); + +// Table with two fields (String name, Integer age) +Table table = ... + +// convert the Table into a DataSet of Row by specifying a class +DataSet dsRow = tableEnv.toDataSet(table, Row.class); + +// convert the Table into a DataSet of Tuple2 via a TypeInformation +TupleTypeInfo> tupleType = new TupleTypeInfo<>( + Types.STRING(), + Types.INT()); +DataSet> dsTuple = + tableEnv.toDataSet(table, tupleType); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get TableEnvironment +// registration of a DataSet is equivalent +val tableEnv = BatchTableEnvironment.create(env) + +// Table with two fields (String name, Integer age) +val table: Table = ... + +// convert the Table into a DataSet of Row +val dsRow: DataSet[Row] = tableEnv.toDataSet[Row](table) + +// convert the Table into a DataSet of Tuple2[String, Int] +val dsTuple: DataSet[(String, Int)] = tableEnv.toDataSet[(String, Int)](table) +``` +{{< /tab >}} +{{< /tabs >}} + +注意 **一旦 Table 被转化为 DataSet,必须使用 ExecutionEnvironment 的 execute 方法执行该 DataSet 作业。** + +{{< top >}} + + + +### 数据类型到 Table Schema 的映射 + +Flink 的 DataStream 和 DataSet APIs 支持多样的数据类型。例如 Tuple(Scala 内置以及Flink Java tuple)、POJO 类型、Scala case class 类型以及 Flink 的 Row 类型等允许嵌套且有多个可在表的表达式中访问的字段的复合数据类型。其他类型被视为原子类型。下面,我们讨论 Table API 如何将这些数据类型类型转换为内部 row 表示形式,并提供将 `DataStream` 转换成 `Table` 的样例。 + +数据类型到 table schema 的映射有两种方式:**基于字段位置**或**基于字段名称**。 + +**基于位置映射** + +基于位置的映射可在保持字段顺序的同时为字段提供更有意义的名称。这种映射方式可用于*具有特定的字段顺序*的复合数据类型以及原子类型。如 tuple、row 以及 case class 这些复合数据类型都有这样的字段顺序。然而,POJO 类型的字段则必须通过名称映射(参见下一章)。可以将字段投影出来,但不能使用`as`重命名。 + +定义基于位置的映射时,输入数据类型中一定不能存在指定的名称,否则 API 会假定应该基于字段名称进行映射。如果未指定任何字段名称,则使用默认的字段名称和复合数据类型的字段顺序,或者使用 `f0` 表示原子类型。 + +{{< tabs "1abe538a-9dc4-4fb4-900a-f27cb888d20d" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section; + +DataStream> stream = ... + +// convert DataStream into Table with field "myLong" only +Table table = tableEnv.fromDataStream(stream, $("myLong")); + +// convert DataStream into Table with field names "myLong" and "myInt" +Table table = tableEnv.fromDataStream(stream, $("myLong"), $("myInt")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +val stream: DataStream[(Long, Int)] = ... 
+ +// convert DataStream into Table with field "myLong" only +val table: Table = tableEnv.fromDataStream(stream, $"myLong") + +// convert DataStream into Table with field names "myLong" and "myInt" +val table: Table = tableEnv.fromDataStream(stream, $"myLong", $"myInt") +``` +{{< /tab >}} +{{< /tabs >}} + +**基于名称的映射** + +基于名称的映射适用于任何数据类型包括 POJO 类型。这是定义 table schema 映射最灵活的方式。映射中的所有字段均按名称引用,并且可以通过 `as` 重命名。字段可以被重新排序和映射。 + +若果没有指定任何字段名称,则使用默认的字段名称和复合数据类型的字段顺序,或者使用 `f0` 表示原子类型。 + +{{< tabs "e6952073-a5a0-45ff-800e-bd4126c09b26" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +DataStream> stream = ... + +// convert DataStream into Table with field "f1" only +Table table = tableEnv.fromDataStream(stream, $("f1")); + +// convert DataStream into Table with swapped fields +Table table = tableEnv.fromDataStream(stream, $("f1"), $("f0")); + +// convert DataStream into Table with swapped fields and field names "myInt" and "myLong" +Table table = tableEnv.fromDataStream(stream, $("f1").as("myInt"), $("f0").as("myLong")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +val stream: DataStream[(Long, Int)] = ... + +// convert DataStream into Table with field "_2" only +val table: Table = tableEnv.fromDataStream(stream, $"_2") + +// convert DataStream into Table with swapped fields +val table: Table = tableEnv.fromDataStream(stream, $"_2", $"_1") + +// convert DataStream into Table with swapped fields and field names "myInt" and "myLong" +val table: Table = tableEnv.fromDataStream(stream, $"_2" as "myInt", $"_1" as "myLong") +``` +{{< /tab >}} +{{< /tabs >}} + + + +#### 原子类型 + +Flink 将基础数据类型(`Integer`、`Double`、`String`)或者通用数据类型(不可再拆分的数据类型)视为原子类型。原子类型的 `DataStream` 或者 `DataSet` 会被转换成只有一条属性的 `Table`。属性的数据类型可以由原子类型推断出,还可以重新命名属性。 + +{{< tabs "03abca94-5825-4ba7-8ef0-213362c3aaff" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +DataStream stream = ... + +// convert DataStream into Table with field name "myLong" +Table table = tableEnv.fromDataStream(stream, $("myLong")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +val stream: DataStream[Long] = ... + +// convert DataStream into Table with field name "myLong" +val table: Table = tableEnv.fromDataStream(stream, $"myLong") +``` +{{< /tab >}} +{{< /tabs >}} + + + +#### Tuple类型(Scala 和 Java)和 Case Class类型(仅 Scala) + +Flink 支持 Scala 的内置 tuple 类型并给 Java 提供自己的 tuple 类型。两种 tuple 的 DataStream 和 DataSet 都能被转换成表。可以通过提供所有字段名称来重命名字段(基于位置映射)。如果没有指明任何字段名称,则会使用默认的字段名称。如果引用了原始字段名称(对于 Flink tuple 为`f0`、`f1` ... ...,对于 Scala tuple 为`_1`、`_2` ... ...),则 API 会假定映射是基于名称的而不是基于位置的。基于名称的映射可以通过 `as` 对字段和投影进行重新排序。 + +{{< tabs "130f44c6-7432-465a-ae8a-b4c436888361" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +DataStream> stream = ... 
+ +// convert DataStream into Table with renamed field names "myLong", "myString" (position-based) +Table table = tableEnv.fromDataStream(stream, $("myLong"), $("myString")); + +// convert DataStream into Table with reordered fields "f1", "f0" (name-based) +Table table = tableEnv.fromDataStream(stream, $("f1"), $("f0")); + +// convert DataStream into Table with projected field "f1" (name-based) +Table table = tableEnv.fromDataStream(stream, $("f1")); + +// convert DataStream into Table with reordered and aliased fields "myString", "myLong" (name-based) +Table table = tableEnv.fromDataStream(stream, $("f1").as("myString"), $("f0").as("myLong")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +val stream: DataStream[(Long, String)] = ... + +// convert DataStream into Table with field names "myLong", "myString" (position-based) +val table: Table = tableEnv.fromDataStream(stream, $"myLong", $"myString") + +// convert DataStream into Table with reordered fields "_2", "_1" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"_2", $"_1") + +// convert DataStream into Table with projected field "_2" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"_2") + +// convert DataStream into Table with reordered and aliased fields "myString", "myLong" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"_2" as "myString", $"_1" as "myLong") + +// define case class +case class Person(name: String, age: Int) +val streamCC: DataStream[Person] = ... + +// convert DataStream into Table with field names 'myName, 'myAge (position-based) +val table = tableEnv.fromDataStream(streamCC, $"myName", $"myAge") + +// convert DataStream into Table with reordered and aliased fields "myAge", "myName" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"age" as "myAge", $"name" as "myName") + +``` +{{< /tab >}} +{{< /tabs >}} + + + +#### POJO 类型 (Java 和 Scala) + +Flink 支持 POJO 类型作为复合类型。确定 POJO 类型的规则记录在[这里]({{< ref "docs/dev/serialization/types_serialization" >}}#pojos). + +在不指定字段名称的情况下将 POJO 类型的 `DataStream` 或 `DataSet` 转换成 `Table` 时,将使用原始 POJO 类型字段的名称。名称映射需要原始名称,并且不能按位置进行。字段可以使用别名(带有 `as` 关键字)来重命名,重新排序和投影。 + +{{< tabs "c4bd0a25-c14c-44f8-8353-add5c453c4fd" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +// Person is a POJO with fields "name" and "age" +DataStream stream = ... + +// convert DataStream into Table with renamed fields "myAge", "myName" (name-based) +Table table = tableEnv.fromDataStream(stream, $("age").as("myAge"), $("name").as("myName")); + +// convert DataStream into Table with projected field "name" (name-based) +Table table = tableEnv.fromDataStream(stream, $("name")); + +// convert DataStream into Table with projected and renamed field "myName" (name-based) +Table table = tableEnv.fromDataStream(stream, $("name").as("myName")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +// Person is a POJO with field names "name" and "age" +val stream: DataStream[Person] = ... 
+ +// convert DataStream into Table with renamed fields "myAge", "myName" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"age" as "myAge", $"name" as "myName") + +// convert DataStream into Table with projected field "name" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"name") + +// convert DataStream into Table with projected and renamed field "myName" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"name" as "myName") +``` +{{< /tab >}} +{{< /tabs >}} + + + +#### Row类型 + +`Row` 类型支持任意数量的字段以及具有 `null` 值的字段。字段名称可以通过 `RowTypeInfo` 指定,也可以在将 `Row` 的 `DataStream` 或 `DataSet` 转换为 `Table` 时指定。Row 类型的字段映射支持基于名称和基于位置两种方式。字段可以通过提供所有字段的名称的方式重命名(基于位置映射)或者分别选择进行投影/排序/重命名(基于名称映射)。 + +{{< tabs "3434160b-c826-4064-897c-18d7d7cf7103" >}} +{{< tab "Java" >}} +```java +// get a StreamTableEnvironment, works for BatchTableEnvironment equivalently +StreamTableEnvironment tableEnv = ...; // see "Create a TableEnvironment" section + +// DataStream of Row with two fields "name" and "age" specified in `RowTypeInfo` +DataStream stream = ... + +// convert DataStream into Table with renamed field names "myName", "myAge" (position-based) +Table table = tableEnv.fromDataStream(stream, $("myName"), $("myAge")); + +// convert DataStream into Table with renamed fields "myName", "myAge" (name-based) +Table table = tableEnv.fromDataStream(stream, $("name").as("myName"), $("age").as("myAge")); + +// convert DataStream into Table with projected field "name" (name-based) +Table table = tableEnv.fromDataStream(stream, $("name")); + +// convert DataStream into Table with projected and renamed field "myName" (name-based) +Table table = tableEnv.fromDataStream(stream, $("name").as("myName")); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +// get a TableEnvironment +val tableEnv: StreamTableEnvironment = ... // see "Create a TableEnvironment" section + +// DataStream of Row with two fields "name" and "age" specified in `RowTypeInfo` +val stream: DataStream[Row] = ... 
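+
+// (sketch, not part of the original example) such a stream could be obtained by
+// attaching a RowTypeInfo with explicit field names to the source; `env` and `rows`
+// below are placeholders for a StreamExecutionEnvironment and a Seq[Row]:
+//   val rowType = new RowTypeInfo(
+//     Array[TypeInformation[_]](Types.STRING, Types.INT),
+//     Array("name", "age"))
+//   val stream: DataStream[Row] = env.fromCollection(rows)(rowType)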
+ +// convert DataStream into Table with renamed field names "myName", "myAge" (position-based) +val table: Table = tableEnv.fromDataStream(stream, $"myName", $"myAge") + +// convert DataStream into Table with renamed fields "myName", "myAge" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"name" as "myName", $"age" as "myAge") + +// convert DataStream into Table with projected field "name" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"name") + +// convert DataStream into Table with projected and renamed field "myName" (name-based) +val table: Table = tableEnv.fromDataStream(stream, $"name" as "myName") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + diff --git a/docs/content.zh/docs/dev/table/functions/_index.md b/docs/content.zh/docs/dev/table/functions/_index.md new file mode 100644 index 0000000000000..7e059a9a1c731 --- /dev/null +++ b/docs/content.zh/docs/dev/table/functions/_index.md @@ -0,0 +1,23 @@ +--- +title: 函数 +bookCollapseSection: true +weight: 33 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/table/functions/overview.md b/docs/content.zh/docs/dev/table/functions/overview.md new file mode 100644 index 0000000000000..7a1561904f0a9 --- /dev/null +++ b/docs/content.zh/docs/dev/table/functions/overview.md @@ -0,0 +1,98 @@ +--- +title: "函数" +weight: 1 +type: docs +aliases: + - /zh/dev/table/functions/ +--- + + +# 函数 + +Flink 允许用户在 Table API 和 SQL 中使用函数进行数据的转换。 + + + +函数类型 +------------------ + +Flink 中的函数有两个划分标准。 + +一个划分标准是:系统(内置)函数和 Catalog 函数。系统函数没有名称空间,只能通过其名称来进行引用。 +Catalog 函数属于 Catalog 和数据库,因此它们拥有 Catalog 和数据库命名空间。 +用户可以通过全/部分限定名(`catalog.db.func` 或 `db.func`)或者函数名 +来对 Catalog 函数进行引用。 + +另一个划分标准是:临时函数和持久化函数。 +临时函数始终由用户创建,它容易改变并且仅在会话的生命周期内有效。 +持久化函数不是由系统提供,就是存储在 Catalog 中,它在会话的整个生命周期内都有效。 + +这两个划分标准给 Flink 用户提供了 4 种函数: + +1. 临时性系统函数 +2. 系统函数 +3. 临时性 Catalog 函数 +4. Catalog 函数 + +请注意,系统函数始终优先于 Catalog 函数解析,临时函数始终优先于持久化函数解析, +函数解析优先级如下所述。 + +函数引用 +--------------------- + +用户在 Flink 中可以通过精确、模糊两种引用方式引用函数。 + +### 精确函数引用 + +精确函数引用允许用户跨 Catalog,跨数据库调用 Catalog 函数。 +例如:`select mycatalog.mydb.myfunc(x) from mytable` 和 `select mydb.myfunc(x) from mytable`。 + +仅 Flink 1.10 以上版本支持。 + +### 模糊函数引用 + +在模糊函数引用中,用户只需在 SQL 查询中指定函数名,例如: `select myfunc(x) from mytable`。 + + +函数解析顺序 +------------------------- + +当函数名相同,函数类型不同时,函数解析顺序才有意义。 +例如:当有三个都名为 "myfunc" 的临时性 Catalog 函数,Catalog 函数,和系统函数时, +如果没有命名冲突,三个函数将会被解析为一个函数。 + +### 精确函数引用 + +由于系统函数没有命名空间,Flink 中的精确函数引用必须 +指向临时性 Catalog 函数或 Catalog 函数。 + +解析顺序如下: + +1. 临时性 catalog 函数 +2. Catalog 函数 + +### 模糊函数引用 + +解析顺序如下: + +1. 临时性系统函数 +2. 系统函数 +3. 临时性 Catalog 函数, 在会话的当前 Catalog 和当前数据库中 +4. Catalog 函数, 在会话的当前 Catalog 和当前数据库中 diff --git a/docs/content.zh/docs/dev/table/functions/systemFunctions.md b/docs/content.zh/docs/dev/table/functions/systemFunctions.md new file mode 100644 index 0000000000000..060282833562d --- /dev/null +++ b/docs/content.zh/docs/dev/table/functions/systemFunctions.md @@ -0,0 +1,207 @@ +--- +title: "系统(内置)函数" +weight: 32 +type: docs +aliases: + - /zh/dev/table/functions/systemFunctions.html +--- + + +# System (Built-in) Functions + +Flink Table API & SQL provides users with a set of built-in functions for data transformations. This page gives a brief overview of them. +If a function that you need is not supported yet, you can implement a [user-defined function]({{< ref "docs/dev/table/functions/udfs" >}}). +If you think that the function is general enough, please open a Jira issue for it with a detailed description. 
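+
+For example, built-in functions can be called directly in SQL or through the Table API
+expression DSL. The following is only a minimal sketch; the table `MyTable` and the column
+`myField` are placeholder names that do not appear elsewhere on this page:
+
+```java
+import org.apache.flink.table.api.*;
+import static org.apache.flink.table.api.Expressions.*;
+
+TableEnvironment env = TableEnvironment.create(EnvironmentSettings.newInstance().build());
+
+// call built-in scalar functions in SQL
+env.sqlQuery("SELECT UPPER(myField) AS upperField, CHAR_LENGTH(myField) AS fieldLen FROM MyTable");
+
+// the same functions via Table API expressions
+env.from("MyTable").select($("myField").upperCase(), $("myField").charLength());
+```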
+ +Scalar Functions +---------------- + +The scalar functions take zero, one or more values as the input and return a single value as the result. + +### Comparison Functions + +{{< sql_functions "comparison" >}} + +### Logical Functions + +{{< sql_functions "logical" >}} + +### Arithmetic Functions + +{{< sql_functions "arithmetic" >}} + +### String Functions + +{{< sql_functions "string" >}} + +### Temporal Functions + +{{< sql_functions "temporal" >}} + +### Conditional Functions + +{{< sql_functions "conditional" >}} + +### Type Conversion Functions + +{{< sql_functions "conversion" >}} + +### Collection Functions + +{{< sql_functions "collection" >}} + +### Value Construction Functions + +{{< sql_functions "valueconstruction" >}} + +### Value Access Functions + +{{< sql_functions "valueaccess" >}} + +### Grouping Functions + +{{< sql_functions "grouping" >}} + +### Hash Functions + +{{< sql_functions "hashfunctions" >}} + +### Auxiliary Functions + +{{< sql_functions "auxilary" >}} + +Aggregate Functions +------------------- + +The aggregate functions take an expression across all the rows as the input and return a single aggregated value as the result. + +{{< sql_functions "aggregate" >}} + +Time Interval and Point Unit Specifiers +--------------------------------------- + +The following table lists specifiers for time interval and time point units. + +For Table API, please use `_` for spaces (e.g., `DAY_TO_HOUR`). + +| Time Interval Unit | Time Point Unit | +| :----------------------- | :----------------------------- | +| `MILLENIUM` _(SQL-only)_ | | +| `CENTURY` _(SQL-only)_ | | +| `YEAR` | `YEAR` | +| `YEAR TO MONTH` | | +| `QUARTER` | `QUARTER` | +| `MONTH` | `MONTH` | +| `WEEK` | `WEEK` | +| `DAY` | `DAY` | +| `DAY TO HOUR` | | +| `DAY TO MINUTE` | | +| `DAY TO SECOND` | | +| `HOUR` | `HOUR` | +| `HOUR TO MINUTE` | | +| `HOUR TO SECOND` | | +| `MINUTE` | `MINUTE` | +| `MINUTE TO SECOND` | | +| `SECOND` | `SECOND` | +| | `MILLISECOND` | +| | `MICROSECOND` | +| `DOY` _(SQL-only)_ | | +| `DOW` _(SQL-only)_ | | +| | `SQL_TSI_YEAR` _(SQL-only)_ | +| | `SQL_TSI_QUARTER` _(SQL-only)_ | +| | `SQL_TSI_MONTH` _(SQL-only)_ | +| | `SQL_TSI_WEEK` _(SQL-only)_ | +| | `SQL_TSI_DAY` _(SQL-only)_ | +| | `SQL_TSI_HOUR` _(SQL-only)_ | +| | `SQL_TSI_MINUTE` _(SQL-only)_ | +| | `SQL_TSI_SECOND ` _(SQL-only)_ | + +{{< top >}} + +Column Functions +--------------------------------------- + +The column functions are used to select or deselect table columns. + +{{< hint info >}} +Column functions are only used in Table API. +{{< /hint >}} + +| SYNTAX | DESC | +| :--------------------- | :-------------------------- | +| withColumns(...) | select the specified columns | +| withoutColumns(...) | deselect the columns specified | + +The detailed syntax is as follows: + +```text +columnFunction: + withColumns(columnExprs) + withoutColumns(columnExprs) + +columnExprs: + columnExpr [, columnExpr]* + +columnExpr: + columnRef | columnIndex to columnIndex | columnName to columnName + +columnRef: + columnName(The field name that exists in the table) | columnIndex(a positive integer starting from 1) +``` + +The usage of the column function is illustrated in the following table. 
(Suppose we have a table with 5 columns: `(a: Int, b: Long, c: String, d:String, e: String)`): + +| API | Usage | Description | +|-|-|-| +| withColumns(*)| select("withColumns(*)") | select("*") = select("a, b, c, d, e") | all the columns | +| withColumns(m to n) | select("withColumns(2 to 4)") = select("b, c, d") | columns from m to n | +| withColumns(m, n, k) | select("withColumns(1, 3, e)") = select("a, c, e") | columns m, n, k | +| withColumns(m, n to k) | select("withColumns(1, 3 to 5)") = select("a, c, d ,e") | mixing of the above two representation | +| withoutColumns(m to n) | select("withoutColumns(2 to 4)") = select("a, e") | deselect columns from m to n | +| withoutColumns(m, n, k) | select("withoutColumns(1, 3, 5)") = select("b, d") | deselect columns m, n, k | +| withoutColumns(m, n to k) | select("withoutColumns(1, 3 to 5)") = select("b") | mixing of the above two representation | + +The column functions can be used in all places where column fields are expected, such as `select, groupBy, orderBy, UDFs etc.` e.g.: + +{{< tabs "402fe551-5fb9-4b17-bd64-e05cbd56b4cc" >}} +{{< tab "Java" >}} +```java +table + .groupBy("withColumns(1 to 3)") + .select("withColumns(a to b), myUDAgg(myUDF(withColumns(5 to 20)))") +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +table + .groupBy(withColumns(1 to 3)) + .select(withColumns('a to 'b), myUDAgg(myUDF(withColumns(5 to 20)))) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +table \ + .group_by("withColumns(1 to 3)") \ + .select("withColumns(a to b), myUDAgg(myUDF(withColumns(5 to 20)))") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/functions/udfs.md b/docs/content.zh/docs/dev/table/functions/udfs.md new file mode 100644 index 0000000000000..b898ef3e17630 --- /dev/null +++ b/docs/content.zh/docs/dev/table/functions/udfs.md @@ -0,0 +1,1918 @@ +--- +title: "自定义函数" +weight: 51 +type: docs +aliases: + - /zh/dev/table/functions/udfs.html +--- + + +# 自定义函数 + +自定义函数(UDF)是一种扩展开发机制,可以用来在查询语句里调用难以用其他方式表达的频繁使用或自定义的逻辑。 + +自定义函数可以用 JVM 语言(例如 Java 或 Scala)或 Python 实现,实现者可以在 UDF 中使用任意第三方库,本文聚焦于使用 JVM 语言开发自定义函数。 + + + +概述 +-------- + +当前 Flink 有如下几种函数: + +- *标量函数* 将标量值转换成一个新标量值; +- *表值函数* 将标量值转换成新的行数据; +- *聚合函数* 将多行数据里的标量值转换成一个新标量值; +- *表值聚合函数* 将多行数据里的标量值转换成新的行数据; +- *异步表值函数* 是异步查询外部数据系统的特殊函数。 + +注意 标量和表值函数已经使用了新的基于[数据类型]({{< ref "docs/dev/table/types" >}})的类型系统,聚合函数仍然使用基于 `TypeInformation` 的旧类型系统。 + +以下示例展示了如何创建一个基本的标量函数,以及如何在 Table API 和 SQL 里调用这个函数。 + +函数用于 SQL 查询前要先经过注册;而在用于 Table API 时,函数可以先注册后调用,也可以 _内联_ 后直接使用。 + +{{< tabs "1a14d788-01f7-4582-827f-5dda58de0512" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.api.*; +import org.apache.flink.table.functions.ScalarFunction; +import static org.apache.flink.table.api.Expressions.*; + +// 定义函数逻辑 +public static class SubstringFunction extends ScalarFunction { + public String eval(String s, Integer begin, Integer end) { + return s.substring(begin, end); + } +} + +TableEnvironment env = TableEnvironment.create(...); + +// 在 Table API 里不经注册直接“内联”调用函数 +env.from("MyTable").select(call(SubstringFunction.class, $("myField"), 5, 12)); + +// 注册函数 +env.createTemporarySystemFunction("SubstringFunction", SubstringFunction.class); + +// 在 Table API 里调用注册好的函数 +env.from("MyTable").select(call("SubstringFunction", $("myField"), 5, 12)); + +// 在 SQL 里调用注册好的函数 +env.sqlQuery("SELECT SubstringFunction(myField, 5, 12) FROM MyTable"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.api._ +import 
org.apache.flink.table.functions.ScalarFunction + +// define function logic +class SubstringFunction extends ScalarFunction { + def eval(s: String, begin: Integer, end: Integer): String = { + s.substring(begin, end) + } +} + +val env = TableEnvironment.create(...) + +// 在 Table API 里不经注册直接“内联”调用函数 +env.from("MyTable").select(call(classOf[SubstringFunction], $"myField", 5, 12)) + +// 注册函数 +env.createTemporarySystemFunction("SubstringFunction", classOf[SubstringFunction]) + +// 在 Table API 里调用注册好的函数 +env.from("MyTable").select(call("SubstringFunction", $"myField", 5, 12)) + +// 在 SQL 里调用注册好的函数 +env.sqlQuery("SELECT SubstringFunction(myField, 5, 12) FROM MyTable") + +``` +{{< /tab >}} +{{< /tabs >}} + +对于交互式会话,还可以在使用或注册函数之前对其进行参数化,这样可以把函数 _实例_ 而不是函数 _类_ 用作临时函数。 + +为确保函数实例可应用于集群环境,参数必须是可序列化的。 + +{{< tabs "0a4ec6d3-9f17-43bd-9b4d-506ca2cc7ec2" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.api.*; +import org.apache.flink.table.functions.ScalarFunction; +import static org.apache.flink.table.api.Expressions.*; + +// 定义可参数化的函数逻辑 +public static class SubstringFunction extends ScalarFunction { + + private boolean endInclusive; + + public SubstringFunction(boolean endInclusive) { + this.endInclusive = endInclusive; + } + + public String eval(String s, Integer begin, Integer end) { + return s.substring(begin, endInclusive ? end + 1 : end); + } +} + +TableEnvironment env = TableEnvironment.create(...); + +// 在 Table API 里不经注册直接“内联”调用函数 +env.from("MyTable").select(call(new SubstringFunction(true), $("myField"), 5, 12)); + +// 注册函数 +env.createTemporarySystemFunction("SubstringFunction", new SubstringFunction(true)); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.api._ +import org.apache.flink.table.functions.ScalarFunction + +// 定义可参数化的函数逻辑 +class SubstringFunction(val endInclusive) extends ScalarFunction { + def eval(s: String, begin: Integer, end: Integer): String = { + s.substring(endInclusive ? end + 1 : end) + } +} + +val env = TableEnvironment.create(...) + +// 在 Table API 里不经注册直接“内联”调用函数 +env.from("MyTable").select(call(new SubstringFunction(true), $"myField", 5, 12)) + +// 注册函数 +env.createTemporarySystemFunction("SubstringFunction", new SubstringFunction(true)) + +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +开发指南 +-------- + +注意在聚合函数使用新的类型系统前,本节仅适用于标量和表值函数。 + +所有的自定义函数都遵循一些基本的实现原则。 + +### 函数类 + +实现类必须继承自合适的基类之一(例如 `org.apache.flink.table.functions.ScalarFunction` )。 + +该类必须声明为 `public` ,而不是 `abstract` ,并且可以被全局访问。不允许使用非静态内部类或匿名类。 + +为了将自定义函数存储在持久化的 catalog 中,该类必须具有默认构造器,且在运行时可实例化。 + +### 求值方法 + +基类提供了一组可以被重写的方法,例如 `open()`、 `close()` 或 `isDeterministic()` 。 + +但是,除了上述方法之外,作用于每条传入记录的主要逻辑还必须通过专门的 _求值方法_ 来实现。 + +根据函数的种类,后台生成的运算符会在运行时调用诸如 `eval()`、`accumulate()` 或 `retract()` 之类的求值方法。 + +这些方法必须声明为 `public` ,并带有一组定义明确的参数。 + +常规的 JVM 方法调用语义是适用的。因此可以: +- 实现重载的方法,例如 `eval(Integer)` 和 `eval(LocalDateTime)`; +- 使用变长参数,例如 `eval(Integer...)`; +- 使用对象继承,例如 `eval(Object)` 可接受 `LocalDateTime` 和 `Integer` 作为参数; +- 也可组合使用,例如 `eval(Object...)` 可接受所有类型的参数。 + +以下代码片段展示了一个重载函数的示例: + +{{< tabs "cef0f4dd-cd54-4bd8-a739-a5384709ed14" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.functions.ScalarFunction; + +// 有多个重载求值方法的函数 +public static class SumFunction extends ScalarFunction { + + public Integer eval(Integer a, Integer b) { + return a + b; + } + + public Integer eval(String a, String b) { + return Integer.valueOf(a) + Integer.valueOf(b); + } + + public Integer eval(Double... 
d) { + double result = 0; + for (double value : d) + result += value; + return (int) result; + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.functions.ScalarFunction +import scala.annotation.varargs + +// 有多个重载求值方法的函数 +class SumFunction extends ScalarFunction { + + def eval(a: Integer, b: Integer): Integer = { + a + b + } + + def eval(a: String, b: String): Integer = { + Integer.valueOf(a) + Integer.valueOf(b) + } + + @varargs // generate var-args like Java + def eval(d: Double*): Integer = { + d.sum.toInt + } +} + +``` +{{< /tab >}} +{{< /tabs >}} + +### 类型推导 + +Table(类似于 SQL 标准)是一种强类型的 API。因此,函数的参数和返回类型都必须映射到[数据类型]({%link dev/table/types.zh.md %})。 + +从逻辑角度看,Planner 需要知道数据类型、精度和小数位数;从 JVM 角度来看,Planner 在调用自定义函数时需要知道如何将内部数据结构表示为 JVM 对象。 + +术语 _类型推导_ 概括了意在验证输入值、派生出参数/返回值数据类型的逻辑。 + +Flink 自定义函数实现了自动的类型推导提取,通过反射从函数的类及其求值方法中派生数据类型。如果这种隐式的反射提取方法不成功,则可以通过使用 `@DataTypeHint` 和 `@FunctionHint` 注解相关参数、类或方法来支持提取过程,下面展示了有关如何注解函数的例子。 + +如果需要更高级的类型推导逻辑,实现者可以在每个自定义函数中显式重写 `getTypeInference()` 方法。但是,建议使用注解方式,因为它可使自定义类型推导逻辑保持在受影响位置附近,而在其他位置则保持默认状态。 + + +#### 自动类型推导 + +自动类型推导会检查函数的类和求值方法,派生出函数参数和结果的数据类型, `@DataTypeHint` 和 `@FunctionHint` 注解支持自动类型推导。 + +有关可以隐式映射到数据类型的类的完整列表,请参阅[数据类型]({%link dev/table/types.zh.md %}#数据类型注解)。 + +**`@DataTypeHint`** + +在许多情况下,需要支持以 _内联_ 方式自动提取出函数参数、返回值的类型。 + +以下例子展示了如何使用 `@DataTypeHint`,详情可参考该注解类的文档。 + +{{< tabs "f5e228c1-ded2-4892-a46d-fdb2431e6841" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.InputGroup; +import org.apache.flink.table.functions.ScalarFunction; +import org.apache.flink.types.Row; + +// 有多个重载求值方法的函数 +public static class OverloadedFunction extends ScalarFunction { + + // no hint required + public Long eval(long a, long b) { + return a + b; + } + + // 定义 decimal 的精度和小数位 + public @DataTypeHint("DECIMAL(12, 3)") BigDecimal eval(double a, double b) { + return BigDecimal.valueOf(a + b); + } + + // 定义嵌套数据类型 + @DataTypeHint("ROW") + public Row eval(int i) { + return Row.of(String.valueOf(i), Instant.ofEpochSecond(i)); + } + + // 允许任意类型的符入,并输出序列化定制后的值 + @DataTypeHint(value = "RAW", bridgedTo = ByteBuffer.class) + public ByteBuffer eval(@DataTypeHint(inputGroup = InputGroup.ANY) Object o) { + return MyUtils.serializeToByteBuffer(o); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.annotation.DataTypeHint +import org.apache.flink.table.annotation.InputGroup +import org.apache.flink.table.functions.ScalarFunction +import org.apache.flink.types.Row +import scala.annotation.varargs + +// function with overloaded evaluation methods +class OverloadedFunction extends ScalarFunction { + + // no hint required + def eval(a: Long, b: Long): Long = { + a + b + } + + // 定义 decimal 的精度和小数位 + @DataTypeHint("DECIMAL(12, 3)") + def eval(double a, double b): BigDecimal = { + java.lang.BigDecimal.valueOf(a + b) + } + + // 定义嵌套数据类型 + @DataTypeHint("ROW") + def eval(Int i): Row = { + Row.of(java.lang.String.valueOf(i), java.time.Instant.ofEpochSecond(i)) + } + + // 允许任意类型的符入,并输出定制序列化后的值 + @DataTypeHint(value = "RAW", bridgedTo = classOf[java.nio.ByteBuffer]) + def eval(@DataTypeHint(inputGroup = InputGroup.ANY) Object o): java.nio.ByteBuffer = { + MyUtils.serializeToByteBuffer(o) + } +} + +``` +{{< /tab >}} +{{< /tabs >}} + +**`@FunctionHint`** + +有时我们希望一种求值方法可以同时处理多种数据类型,有时又要求对重载的多个求值方法仅声明一次通用的结果类型。 + +`@FunctionHint` 注解可以提供从入参数据类型到结果数据类型的映射,它可以在整个函数类或求值方法上注解输入、累加器和结果的数据类型。可以在类顶部声明一个或多个注解,也可以为类的所有求值方法分别声明一个或多个注解。所有的 hint 
参数都是可选的,如果未定义参数,则使用默认的基于反射的类型提取。在函数类顶部定义的 hint 参数被所有求值方法继承。 + +以下例子展示了如何使用 `@FunctionHint`,详情可参考该注解类的文档。 + +{{< tabs "89e481b6-9709-4f8d-883b-921f0e67b041" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.FunctionHint; +import org.apache.flink.table.functions.TableFunction; +import org.apache.flink.types.Row; + +// 为函数类的所有求值方法指定同一个输出类型 +@FunctionHint(output = @DataTypeHint("ROW")) +public static class OverloadedFunction extends TableFunction { + + public void eval(int a, int b) { + collect(Row.of("Sum", a + b)); + } + + // overloading of arguments is still possible + public void eval() { + collect(Row.of("Empty args", -1)); + } +} + +// 解耦类型推导与求值方法,类型推导完全取决于 FunctionHint +@FunctionHint( + input = {@DataTypeHint("INT"), @DataTypeHint("INT")}, + output = @DataTypeHint("INT") +) +@FunctionHint( + input = {@DataTypeHint("BIGINT"), @DataTypeHint("BIGINT")}, + output = @DataTypeHint("BIGINT") +) +@FunctionHint( + input = {}, + output = @DataTypeHint("BOOLEAN") +) +public static class OverloadedFunction extends TableFunction { + + // an implementer just needs to make sure that a method exists + // that can be called by the JVM + public void eval(Object... o) { + if (o.length == 0) { + collect(false); + } + collect(o[0]); + } +} + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala + +import org.apache.flink.table.annotation.DataTypeHint +import org.apache.flink.table.annotation.FunctionHint +import org.apache.flink.table.functions.TableFunction +import org.apache.flink.types.Row + +// 为函数类的所有求值方法指定同一个输出类型 +@FunctionHint(output = new DataTypeHint("ROW")) +class OverloadedFunction extends TableFunction[Row] { + + def eval(a: Int, b: Int): Unit = { + collect(Row.of("Sum", Int.box(a + b))) + } + + // overloading of arguments is still possible + def eval(): Unit = { + collect(Row.of("Empty args", Int.box(-1))) + } +} + +// 解耦类型推导与求值方法,类型推导完全取决于 @FunctionHint +@FunctionHint( + input = Array(new DataTypeHint("INT"), new DataTypeHint("INT")), + output = new DataTypeHint("INT") +) +@FunctionHint( + input = Array(new DataTypeHint("BIGINT"), new DataTypeHint("BIGINT")), + output = new DataTypeHint("BIGINT") +) +@FunctionHint( + input = Array(), + output = new DataTypeHint("BOOLEAN") +) +class OverloadedFunction extends TableFunction[AnyRef] { + + // an implementer just needs to make sure that a method exists + // that can be called by the JVM + @varargs + def eval(o: AnyRef*) = { + if (o.length == 0) { + collect(Boolean.box(false)) + } + collect(o(0)) + } +} + +``` +{{< /tab >}} +{{< /tabs >}} + +#### 定制类型推导 + +在大多数情况下,`@DataTypeHint` 和 `@FunctionHint` 足以构建自定义函数,然而通过重写 `getTypeInference()` 定制自动类型推导逻辑,实现者可以创建任意像系统内置函数那样有用的函数。 + +以下用 Java 实现的例子展示了定制类型推导的潜力,它根据字符串参数来确定函数的结果类型。该函数带有两个字符串参数:第一个参数表示要分析的字符串,第二个参数表示目标类型。 + +{{< tabs "db47e1b8-e387-432a-8bd5-1317e12ff64b" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.catalog.DataTypeFactory; +import org.apache.flink.table.functions.ScalarFunction; +import org.apache.flink.table.types.inference.TypeInference; +import org.apache.flink.types.Row; + +public static class LiteralFunction extends ScalarFunction { + public Object eval(String s, String type) { + switch (type) { + case "INT": + return Integer.valueOf(s); + case "DOUBLE": + return Double.valueOf(s); + case "STRING": + default: + return s; + } + } + + // 禁用自动的反射式类型推导,使用如下逻辑进行类型推导 + @Override + public TypeInference getTypeInference(DataTypeFactory typeFactory) { + return 
TypeInference.newBuilder() + // 指定输入参数的类型,必要时参数会被隐式转换 + .typedArguments(DataTypes.STRING(), DataTypes.STRING()) + // specify a strategy for the result data type of the function + .outputTypeStrategy(callContext -> { + if (!callContext.isArgumentLiteral(1) || callContext.isArgumentNull(1)) { + throw callContext.newValidationError("Literal expected for second argument."); + } + // 基于字符串值返回数据类型 + final String literal = callContext.getArgumentValue(1, String.class).orElse("STRING"); + switch (literal) { + case "INT": + return Optional.of(DataTypes.INT().notNull()); + case "DOUBLE": + return Optional.of(DataTypes.DOUBLE().notNull()); + case "STRING": + default: + return Optional.of(DataTypes.STRING()); + } + }) + .build(); + } +} + +``` +{{< /tab >}} +{{< /tabs >}} + +For more examples of custom type inference, see also the `flink-examples-table` module with +{{< gh_link file="flink-examples/flink-examples-table/src/main/java/org/apache/flink/table/examples/java/functions/AdvancedFunctionsExample.java" name="advanced function implementation" >}}. + +### 运行时集成 +------------------- + +有时候自定义函数需要获取一些全局信息,或者在真正被调用之前做一些配置(setup)/清理(clean-up)的工作。自定义函数也提供了 `open()` 和 `close()` 方法,你可以重写这两个方法做到类似于 DataStream API 中 `RichFunction` 的功能。 + +open() 方法在求值方法被调用之前先调用。close() 方法在求值方法调用完之后被调用。 + +open() 方法提供了一个 FunctionContext,它包含了一些自定义函数被执行时的上下文信息,比如 metric group、分布式文件缓存,或者是全局的作业参数等。 + +下面的信息可以通过调用 `FunctionContext` 的对应的方法来获得: + +| 方法 | 描述 | +| :------------------------------------ | :----------------------------------------------------- | +| `getMetricGroup()` | 执行该函数的 subtask 的 Metric Group。 | +| `getCachedFile(name)` | 分布式文件缓存的本地临时文件副本。| +| `getJobParameter(name, defaultValue)` | 跟对应的 key 关联的全局参数值。 | + +下面的例子展示了如何在一个标量函数中通过 FunctionContext 来获取一个全局的任务参数: + +{{< tabs "39170f36-8e99-464a-bc9c-0d052568ef8a" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.api.*; +import org.apache.flink.table.functions.FunctionContext; +import org.apache.flink.table.functions.ScalarFunction; + +public static class HashCodeFunction extends ScalarFunction { + + private int factor = 0; + + @Override + public void open(FunctionContext context) throws Exception { + // 获取参数 "hashcode_factor" + // 如果不存在,则使用默认值 "12" + factor = Integer.parseInt(context.getJobParameter("hashcode_factor", "12")); + } + + public int eval(String s) { + return s.hashCode() * factor; + } +} + +TableEnvironment env = TableEnvironment.create(...); + +// 设置任务参数 +env.getConfig().addJobParameter("hashcode_factor", "31"); + +// 注册函数 +env.createTemporarySystemFunction("hashCode", HashCodeFunction.class); + +// 调用函数 +env.sqlQuery("SELECT myField, hashCode(myField) FROM MyTable"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.api._ +import org.apache.flink.table.functions.FunctionContext +import org.apache.flink.table.functions.ScalarFunction + +class HashCodeFunction extends ScalarFunction { + + private var factor: Int = 0 + + override def open(context: FunctionContext): Unit = { + // 获取参数 "hashcode_factor" + // 如果不存在,则使用默认值 "12" + factor = context.getJobParameter("hashcode_factor", "12").toInt + } + + def eval(s: String): Int = { + s.hashCode * factor + } +} + +val env = TableEnvironment.create(...) 
+
+// 设置任务参数
+env.getConfig.addJobParameter("hashcode_factor", "31")
+
+// 注册函数
+env.createTemporarySystemFunction("hashCode", classOf[HashCodeFunction])
+
+// 调用函数
+env.sqlQuery("SELECT myField, hashCode(myField) FROM MyTable")
+
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< top >}}
+
+标量函数
+----------------
+
+自定义标量函数可以把 0 到多个标量值映射成 1 个标量值,[数据类型]({{< ref "docs/dev/table/types" >}})里列出的任何数据类型都可作为求值方法的参数和返回值类型。
+
+想要实现自定义标量函数,你需要扩展 `org.apache.flink.table.functions` 里面的 `ScalarFunction` 并且实现一个或者多个求值方法。标量函数的行为取决于你写的求值方法。求值方法必须是 `public` 的,而且名字必须是 `eval`。
+
+下面的例子展示了如何实现一个求哈希值的函数并在查询里调用它,详情可参考[开发指南](#开发指南):
+
+{{< tabs "6828e58c-65f1-4bed-a35e-63fec7aa8c19" >}}
+{{< tab "Java" >}}
+```java
+import org.apache.flink.table.annotation.DataTypeHint;
+import org.apache.flink.table.annotation.InputGroup;
+import org.apache.flink.table.api.*;
+import org.apache.flink.table.functions.ScalarFunction;
+import static org.apache.flink.table.api.Expressions.*;
+
+public static class HashFunction extends ScalarFunction {
+
+  // 接受任意类型输入,返回 INT 型输出
+  public int eval(@DataTypeHint(inputGroup = InputGroup.ANY) Object o) {
+    return o.hashCode();
+  }
+}
+
+TableEnvironment env = TableEnvironment.create(...);
+
+// 在 Table API 里不经注册直接“内联”调用函数
+env.from("MyTable").select(call(HashFunction.class, $("myField")));
+
+// 注册函数
+env.createTemporarySystemFunction("HashFunction", HashFunction.class);
+
+// 在 Table API 里调用注册好的函数
+env.from("MyTable").select(call("HashFunction", $("myField")));
+
+// 在 SQL 里调用注册好的函数
+env.sqlQuery("SELECT HashFunction(myField) FROM MyTable");
+
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+import org.apache.flink.table.annotation.DataTypeHint
+import org.apache.flink.table.annotation.InputGroup
+import org.apache.flink.table.api._
+import org.apache.flink.table.functions.ScalarFunction
+
+class HashFunction extends ScalarFunction {
+
+  // 接受任意类型输入,返回 INT 型输出
+  def eval(@DataTypeHint(inputGroup = InputGroup.ANY) o: AnyRef): Int = {
+    o.hashCode()
+  }
+}
+
+val env = TableEnvironment.create(...)
+ +// 在 Table API 里不经注册直接“内联”调用函数 +env.from("MyTable").select(call(classOf[HashFunction], $"myField")) + +// 注册函数 +env.createTemporarySystemFunction("HashFunction", classOf[HashFunction]) + +// 在 Table API 里调用注册好的函数 +env.from("MyTable").select(call("HashFunction", $"myField")) + +// 在 SQL 里调用注册好的函数 +env.sqlQuery("SELECT HashFunction(myField) FROM MyTable") + +``` +{{< /tab >}} +{{< /tabs >}} + +如果你打算使用 Python 实现或调用标量函数,详情可参考 [Python 标量函数]({{< ref "docs/dev/python/table/udfs/python_udfs" >}}#scalar-functions)。 + +{{< top >}} + +表值函数 +--------------- + +跟自定义标量函数一样,自定义表值函数的输入参数也可以是 0 到多个标量。但是跟标量函数只能返回一个值不同的是,它可以返回任意多行。返回的每一行可以包含 1 到多列,如果输出行只包含 1 列,会省略结构化信息并生成标量值,这个标量值在运行阶段会隐式地包装进行里。 + +要定义一个表值函数,你需要扩展 `org.apache.flink.table.functions` 下的 `TableFunction`,可以通过实现多个名为 `eval` 的方法对求值方法进行重载。像其他函数一样,输入和输出类型也可以通过反射自动提取出来。表值函数返回的表的类型取决于 `TableFunction` 类的泛型参数 `T`,不同于标量函数,表值函数的求值方法本身不包含返回类型,而是通过 `collect(T)` 方法来发送要输出的行。 + +在 Table API 中,表值函数是通过 `.joinLateral(...)` 或者 `.leftOuterJoinLateral(...)` 来使用的。`joinLateral` 算子会把外表(算子左侧的表)的每一行跟跟表值函数返回的所有行(位于算子右侧)进行 (cross)join。`leftOuterJoinLateral` 算子也是把外表(算子左侧的表)的每一行跟表值函数返回的所有行(位于算子右侧)进行(cross)join,并且如果表值函数返回 0 行也会保留外表的这一行。 + +在 SQL 里面用 `JOIN` 或者 以 `ON TRUE` 为条件的 `LEFT JOIN` 来配合 `LATERAL TABLE()` 的使用。 + +下面的例子展示了如何实现一个分隔函数并在查询里调用它,详情可参考[开发指南](#开发指南): + +{{< tabs "1cf0d01a-2559-4b16-bb98-9b348f37bdc0" >}} +{{< tab "Java" >}} +```java +import org.apache.flink.table.annotation.DataTypeHint; +import org.apache.flink.table.annotation.FunctionHint; +import org.apache.flink.table.api.*; +import org.apache.flink.table.functions.TableFunction; +import org.apache.flink.types.Row; +import static org.apache.flink.table.api.Expressions.*; + +@FunctionHint(output = @DataTypeHint("ROW")) +public static class SplitFunction extends TableFunction { + + public void eval(String str) { + for (String s : str.split(" ")) { + // use collect(...) 
to emit a row + collect(Row.of(s, s.length())); + } + } +} + +TableEnvironment env = TableEnvironment.create(...); + +// 在 Table API 里不经注册直接“内联”调用函数 +env + .from("MyTable") + .joinLateral(call(SplitFunction.class, $("myField"))) + .select($("myField"), $("word"), $("length")); +env + .from("MyTable") + .leftOuterJoinLateral(call(SplitFunction.class, $("myField"))) + .select($("myField"), $("word"), $("length")); + +// 在 Table API 里重命名函数字段 +env + .from("MyTable") + .leftOuterJoinLateral(call(SplitFunction.class, $("myField")).as("newWord", "newLength")) + .select($("myField"), $("newWord"), $("newLength")); + +// 注册函数 +env.createTemporarySystemFunction("SplitFunction", SplitFunction.class); + +// 在 Table API 里调用注册好的函数 +env + .from("MyTable") + .joinLateral(call("SplitFunction", $("myField"))) + .select($("myField"), $("word"), $("length")); +env + .from("MyTable") + .leftOuterJoinLateral(call("SplitFunction", $("myField"))) + .select($("myField"), $("word"), $("length")); + +// 在 SQL 里调用注册好的函数 +env.sqlQuery( + "SELECT myField, word, length " + + "FROM MyTable, LATERAL TABLE(SplitFunction(myField))"); +env.sqlQuery( + "SELECT myField, word, length " + + "FROM MyTable " + + "LEFT JOIN LATERAL TABLE(SplitFunction(myField)) ON TRUE"); + +// 在 SQL 里重命名函数字段 +env.sqlQuery( + "SELECT myField, newWord, newLength " + + "FROM MyTable " + + "LEFT JOIN LATERAL TABLE(SplitFunction(myField)) AS T(newWord, newLength) ON TRUE"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import org.apache.flink.table.annotation.DataTypeHint +import org.apache.flink.table.annotation.FunctionHint +import org.apache.flink.table.api._ +import org.apache.flink.table.functions.TableFunction +import org.apache.flink.types.Row + +@FunctionHint(output = new DataTypeHint("ROW")) +class SplitFunction extends TableFunction[Row] { + + def eval(str: String): Unit = { + // use collect(...) to emit a row + str.split(" ").foreach(s => collect(Row.of(s, Int.box(s.length)))) + } +} + +val env = TableEnvironment.create(...) 
+ +// 在 Table API 里不经注册直接“内联”调用函数 +env + .from("MyTable") + .joinLateral(call(classOf[SplitFunction], $"myField") + .select($"myField", $"word", $"length") +env + .from("MyTable") + .leftOuterJoinLateral(call(classOf[SplitFunction], $"myField")) + .select($"myField", $"word", $"length") + +// 在 Table API 里重命名函数字段 +env + .from("MyTable") + .leftOuterJoinLateral(call(classOf[SplitFunction], $"myField").as("newWord", "newLength")) + .select($"myField", $"newWord", $"newLength") + +// 注册函数 +env.createTemporarySystemFunction("SplitFunction", classOf[SplitFunction]) + +// 在 Table API 里调用注册好的函数 +env + .from("MyTable") + .joinLateral(call("SplitFunction", $"myField")) + .select($"myField", $"word", $"length") +env + .from("MyTable") + .leftOuterJoinLateral(call("SplitFunction", $"myField")) + .select($"myField", $"word", $"length") + +// 在 SQL 里调用注册好的函数 +env.sqlQuery( + "SELECT myField, word, length " + + "FROM MyTable, LATERAL TABLE(SplitFunction(myField))"); +env.sqlQuery( + "SELECT myField, word, length " + + "FROM MyTable " + + "LEFT JOIN LATERAL TABLE(SplitFunction(myField)) ON TRUE") + +// 在 SQL 里重命名函数字段 +env.sqlQuery( + "SELECT myField, newWord, newLength " + + "FROM MyTable " + + "LEFT JOIN LATERAL TABLE(SplitFunction(myField)) AS T(newWord, newLength) ON TRUE") + +``` +{{< /tab >}} +{{< /tabs >}} + +如果你打算使用 Scala,不要把表值函数声明为 Scala `object`,Scala `object` 是单例对象,将导致并发问题。 + +如果你打算使用 Python 实现或调用表值函数,详情可参考 [Python 表值函数]({{< ref "docs/dev/python/table/udfs/python_udfs" >}}#table-functions)。 + +{{< top >}} + +聚合函数 +--------------------- + +自定义聚合函数(UDAGG)是把一个表(一行或者多行,每行可以有一列或者多列)聚合成一个标量值。 + +UDAGG mechanism + +上面的图片展示了一个聚合的例子。假设你有一个关于饮料的表。表里面有三个字段,分别是 `id`、`name`、`price`,表里有 5 行数据。假设你需要找到所有饮料里最贵的饮料的价格,即执行一个 `max()` 聚合。你需要遍历所有 5 行数据,而结果就只有一个数值。 + +自定义聚合函数是通过扩展 `AggregateFunction` 来实现的。`AggregateFunction` 的工作过程如下。首先,它需要一个 `accumulator`,它是一个数据结构,存储了聚合的中间结果。通过调用 `AggregateFunction` 的 `createAccumulator()` 方法创建一个空的 accumulator。接下来,对于每一行数据,会调用 `accumulate()` 方法来更新 accumulator。当所有的数据都处理完了之后,通过调用 `getValue` 方法来计算和返回最终的结果。 + +**下面几个方法是每个 `AggregateFunction` 必须要实现的:** + +- `createAccumulator()` +- `accumulate()` +- `getValue()` + +Flink 的类型推导在遇到复杂类型的时候可能会推导出错误的结果,比如那些非基本类型和普通的 POJO 类型的复杂类型。所以跟 `ScalarFunction` 和 `TableFunction` 一样,`AggregateFunction` 也提供了 `AggregateFunction#getResultType()` 和 `AggregateFunction#getAccumulatorType()` 来分别指定返回值类型和 accumulator 的类型,两个函数的返回值类型也都是 `TypeInformation`。 + +除了上面的方法,还有几个方法可以选择实现。这些方法有些可以让查询更加高效,而有些是在某些特定场景下必须要实现的。例如,如果聚合函数用在会话窗口(当两个会话窗口合并的时候需要 merge 他们的 accumulator)的话,`merge()` 方法就是必须要实现的。 + +**`AggregateFunction` 的以下方法在某些场景下是必须实现的:** + +- `retract()` 在 bounded `OVER` 窗口中是必须实现的。 +- `merge()` 在许多批式聚合和会话以及滚动窗口聚合中是必须实现的。除此之外,这个方法对于优化也很多帮助。例如,两阶段聚合优化就需要所有的 `AggregateFunction` 都实现 `merge` 方法。 +- `resetAccumulator()` 在许多批式聚合中是必须实现的。 + +`AggregateFunction` 的所有方法都必须是 `public` 的,不能是 `static` 的,而且名字必须跟上面写的一样。`createAccumulator`、`getValue`、`getResultType` 以及 `getAccumulatorType` 这几个函数是在抽象类 `AggregateFunction` 中定义的,而其他函数都是约定的方法。如果要定义一个聚合函数,你需要扩展 `org.apache.flink.table.functions.AggregateFunction`,并且实现一个(或者多个)`accumulate` 方法。`accumulate` 方法可以重载,每个方法的参数类型不同,并且支持变长参数。 + +`AggregateFunction` 的所有方法的详细文档如下。 + +{{< tabs "55ea8f05-96d8-462f-8bc4-8316da3d097b" >}} +{{< tab "Java" >}} +```java +/** + * Base class for user-defined aggregates and table aggregates. + * + * @param the type of the aggregation result. + * @param the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. 
+ */ +public abstract class UserDefinedAggregateFunction extends UserDefinedFunction { + + /** + * Creates and init the Accumulator for this (table)aggregate function. + * + * @return the accumulator with the initial value + */ + public ACC createAccumulator(); // MANDATORY + + /** + * Returns the TypeInformation of the (table)aggregate function's result. + * + * @return The TypeInformation of the (table)aggregate function's result or null if the result + * type should be automatically inferred. + */ + public TypeInformation getResultType = null; // PRE-DEFINED + + /** + * Returns the TypeInformation of the (table)aggregate function's accumulator. + * + * @return The TypeInformation of the (table)aggregate function's accumulator or null if the + * accumulator type should be automatically inferred. + */ + public TypeInformation getAccumulatorType = null; // PRE-DEFINED +} + +/** + * Base class for aggregation functions. + * + * @param the type of the aggregation result + * @param the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + * AggregateFunction represents its state using accumulator, thereby the state of the + * AggregateFunction must be put into the accumulator. + */ +public abstract class AggregateFunction extends UserDefinedAggregateFunction { + + /** Processes the input values and update the provided accumulator instance. The method + * accumulate can be overloaded with different custom types and arguments. An AggregateFunction + * requires at least one accumulate() method. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + public void accumulate(ACC accumulator, [user defined inputs]); // MANDATORY + + /** + * Retracts the input values from the accumulator instance. The current design assumes the + * inputs are the values that have been previously accumulated. The method retract can be + * overloaded with different custom types and arguments. This function must be implemented for + * datastream bounded over aggregate. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + public void retract(ACC accumulator, [user defined inputs]); // OPTIONAL + + /** + * Merges a group of accumulator instances into one accumulator instance. This function must be + * implemented for datastream session window grouping aggregate and dataset grouping aggregate. + * + * @param accumulator the accumulator which will keep the merged aggregate results. It should + * be noted that the accumulator may contain the previous aggregated + * results. Therefore user should not replace or clean this instance in the + * custom merge method. + * @param its an {@link java.lang.Iterable} pointed to a group of accumulators that will be + * merged. + */ + public void merge(ACC accumulator, java.lang.Iterable its); // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. + * The returned value could be either an early and incomplete result + * (periodically emitted as data arrive) or the final result of the + * aggregation. 
+ * + * @param accumulator the accumulator which contains the current + * aggregated results + * @return the aggregation result + */ + public T getValue(ACC accumulator); // MANDATORY + + /** + * Resets the accumulator for this [[AggregateFunction]]. This function must be implemented for + * dataset grouping aggregate. + * + * @param accumulator the accumulator which needs to be reset + */ + public void resetAccumulator(ACC accumulator); // OPTIONAL + + /** + * Returns true if this AggregateFunction can only be applied in an OVER window. + * + * @return true if the AggregateFunction requires an OVER window, false otherwise. + */ + public Boolean requiresOver = false; // PRE-DEFINED +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +/** + * Base class for user-defined aggregates and table aggregates. + * + * @tparam T the type of the aggregation result. + * @tparam ACC the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + */ +abstract class UserDefinedAggregateFunction[T, ACC] extends UserDefinedFunction { + + /** + * Creates and init the Accumulator for this (table)aggregate function. + * + * @return the accumulator with the initial value + */ + def createAccumulator(): ACC // MANDATORY + + /** + * Returns the TypeInformation of the (table)aggregate function's result. + * + * @return The TypeInformation of the (table)aggregate function's result or null if the result + * type should be automatically inferred. + */ + def getResultType: TypeInformation[T] = null // PRE-DEFINED + + /** + * Returns the TypeInformation of the (table)aggregate function's accumulator. + * + * @return The TypeInformation of the (table)aggregate function's accumulator or null if the + * accumulator type should be automatically inferred. + */ + def getAccumulatorType: TypeInformation[ACC] = null // PRE-DEFINED +} + +/** + * Base class for aggregation functions. + * + * @tparam T the type of the aggregation result + * @tparam ACC the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + * AggregateFunction represents its state using accumulator, thereby the state of the + * AggregateFunction must be put into the accumulator. + */ +abstract class AggregateFunction[T, ACC] extends UserDefinedAggregateFunction[T, ACC] { + + /** + * Processes the input values and update the provided accumulator instance. The method + * accumulate can be overloaded with different custom types and arguments. An AggregateFunction + * requires at least one accumulate() method. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + def accumulate(accumulator: ACC, [user defined inputs]): Unit // MANDATORY + + /** + * Retracts the input values from the accumulator instance. The current design assumes the + * inputs are the values that have been previously accumulated. The method retract can be + * overloaded with different custom types and arguments. This function must be implemented for + * datastream bounded over aggregate. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). 
+ */ + def retract(accumulator: ACC, [user defined inputs]): Unit // OPTIONAL + + /** + * Merges a group of accumulator instances into one accumulator instance. This function must be + * implemented for datastream session window grouping aggregate and dataset grouping aggregate. + * + * @param accumulator the accumulator which will keep the merged aggregate results. It should + * be noted that the accumulator may contain the previous aggregated + * results. Therefore user should not replace or clean this instance in the + * custom merge method. + * @param its an [[java.lang.Iterable]] pointed to a group of accumulators that will be + * merged. + */ + def merge(accumulator: ACC, its: java.lang.Iterable[ACC]): Unit // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. + * The returned value could be either an early and incomplete result + * (periodically emitted as data arrive) or the final result of the + * aggregation. + * + * @param accumulator the accumulator which contains the current + * aggregated results + * @return the aggregation result + */ + def getValue(accumulator: ACC): T // MANDATORY + + /** + * Resets the accumulator for this [[AggregateFunction]]. This function must be implemented for + * dataset grouping aggregate. + * + * @param accumulator the accumulator which needs to be reset + */ + def resetAccumulator(accumulator: ACC): Unit // OPTIONAL + + /** + * Returns true if this AggregateFunction can only be applied in an OVER window. + * + * @return true if the AggregateFunction requires an OVER window, false otherwise. + */ + def requiresOver: Boolean = false // PRE-DEFINED +} +``` +{{< /tab >}} +{{< /tabs >}} + + +下面的例子展示了如何: + +- 定义一个聚合函数来计算某一列的加权平均, +- 在 `TableEnvironment` 中注册函数, +- 在查询中使用函数。 + +为了计算加权平均值,accumulator 需要存储加权总和以及数据的条数。在我们的例子里,我们定义了一个类 `WeightedAvgAccum` 来作为 accumulator。Flink 的 checkpoint 机制会自动保存 accumulator,在失败时进行恢复,以此来保证精确一次的语义。 + +我们的 `WeightedAvg`(聚合函数)的 `accumulate` 方法有三个输入参数。第一个是 `WeightedAvgAccum` accumulator,另外两个是用户自定义的输入:输入的值 `ivalue` 和 输入的权重 `iweight`。尽管 `retract()`、`merge()`、`resetAccumulator()` 这几个方法在大多数聚合类型中都不是必须实现的,我们也在样例中提供了他们的实现。请注意我们在 Scala 样例中也是用的是 Java 的基础类型,并且定义了 `getResultType()` 和 `getAccumulatorType()`,因为 Flink 的类型推导对于 Scala 的类型推导做的不是很好。 + +{{< tabs "ee5bdf7f-9cb8-4b3a-9522-8d20cd97522c" >}} +{{< tab "Java" >}} +```java +/** + * Accumulator for WeightedAvg. + */ +public static class WeightedAvgAccum { + public long sum = 0; + public int count = 0; +} + +/** + * Weighted Average user-defined aggregate function. + */ +public static class WeightedAvg extends AggregateFunction { + + @Override + public WeightedAvgAccum createAccumulator() { + return new WeightedAvgAccum(); + } + + @Override + public Long getValue(WeightedAvgAccum acc) { + if (acc.count == 0) { + return null; + } else { + return acc.sum / acc.count; + } + } + + public void accumulate(WeightedAvgAccum acc, long iValue, int iWeight) { + acc.sum += iValue * iWeight; + acc.count += iWeight; + } + + public void retract(WeightedAvgAccum acc, long iValue, int iWeight) { + acc.sum -= iValue * iWeight; + acc.count -= iWeight; + } + + public void merge(WeightedAvgAccum acc, Iterable it) { + Iterator iter = it.iterator(); + while (iter.hasNext()) { + WeightedAvgAccum a = iter.next(); + acc.count += a.count; + acc.sum += a.sum; + } + } + + public void resetAccumulator(WeightedAvgAccum acc) { + acc.count = 0; + acc.sum = 0L; + } +} + +// 注册函数 +StreamTableEnvironment tEnv = ... 
+tEnv.registerFunction("wAvg", new WeightedAvg()); + +// 使用函数 +tEnv.sqlQuery("SELECT user, wAvg(points, level) AS avgPoints FROM userScores GROUP BY user"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.lang.{Long => JLong, Integer => JInteger} +import org.apache.flink.api.java.tuple.{Tuple1 => JTuple1} +import org.apache.flink.api.java.typeutils.TupleTypeInfo +import org.apache.flink.table.api.Types +import org.apache.flink.table.functions.AggregateFunction + +/** + * Accumulator for WeightedAvg. + */ +class WeightedAvgAccum extends JTuple1[JLong, JInteger] { + sum = 0L + count = 0 +} + +/** + * Weighted Average user-defined aggregate function. + */ +class WeightedAvg extends AggregateFunction[JLong, CountAccumulator] { + + override def createAccumulator(): WeightedAvgAccum = { + new WeightedAvgAccum + } + + override def getValue(acc: WeightedAvgAccum): JLong = { + if (acc.count == 0) { + null + } else { + acc.sum / acc.count + } + } + + def accumulate(acc: WeightedAvgAccum, iValue: JLong, iWeight: JInteger): Unit = { + acc.sum += iValue * iWeight + acc.count += iWeight + } + + def retract(acc: WeightedAvgAccum, iValue: JLong, iWeight: JInteger): Unit = { + acc.sum -= iValue * iWeight + acc.count -= iWeight + } + + def merge(acc: WeightedAvgAccum, it: java.lang.Iterable[WeightedAvgAccum]): Unit = { + val iter = it.iterator() + while (iter.hasNext) { + val a = iter.next() + acc.count += a.count + acc.sum += a.sum + } + } + + def resetAccumulator(acc: WeightedAvgAccum): Unit = { + acc.count = 0 + acc.sum = 0L + } + + override def getAccumulatorType: TypeInformation[WeightedAvgAccum] = { + new TupleTypeInfo(classOf[WeightedAvgAccum], Types.LONG, Types.INT) + } + + override def getResultType: TypeInformation[JLong] = Types.LONG +} + +// 注册函数 +val tEnv: StreamTableEnvironment = ??? +tEnv.registerFunction("wAvg", new WeightedAvg()) + +// 使用函数 +tEnv.sqlQuery("SELECT user, wAvg(points, level) AS avgPoints FROM userScores GROUP BY user") + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +''' +Java code: + +/** + * Accumulator for WeightedAvg. + */ +public static class WeightedAvgAccum { + public long sum = 0; + public int count = 0; +} + +// The java class must have a public no-argument constructor and can be founded in current java classloader. +// Java 类必须有一个 public 的无参构造函数,并且可以在当前类加载器中加载到。 + +/** + * Weighted Average user-defined aggregate function. + */ +public static class WeightedAvg extends AggregateFunction { + + @Override + public WeightedAvgAccum createAccumulator() { + return new WeightedAvgAccum(); + } + + @Override + public Long getValue(WeightedAvgAccum acc) { + if (acc.count == 0) { + return null; + } else { + return acc.sum / acc.count; + } + } + + public void accumulate(WeightedAvgAccum acc, long iValue, int iWeight) { + acc.sum += iValue * iWeight; + acc.count += iWeight; + } + + public void retract(WeightedAvgAccum acc, long iValue, int iWeight) { + acc.sum -= iValue * iWeight; + acc.count -= iWeight; + } + + public void merge(WeightedAvgAccum acc, Iterable it) { + Iterator iter = it.iterator(); + while (iter.hasNext()) { + WeightedAvgAccum a = iter.next(); + acc.count += a.count; + acc.sum += a.sum; + } + } + + public void resetAccumulator(WeightedAvgAccum acc) { + acc.count = 0; + acc.sum = 0L; + } +} +''' + +# 注册函数 +t_env = ... 
# type: StreamTableEnvironment +t_env.register_java_function("wAvg", "my.java.function.WeightedAvg") + +# 使用函数 +t_env.sql_query("SELECT user, wAvg(points, level) AS avgPoints FROM userScores GROUP BY user") + +``` +{{< /tab >}} +{{< /tabs >}} + +如果你打算使用 Python 实现或调用聚合函数,详情可参考 [Python 聚合函数]({{< ref "docs/dev/python/table/udfs/python_udfs" >}}#aggregate-functions)。 + +{{< top >}} + +表值聚合函数 +--------------------- + +自定义表值聚合函数(UDTAGG)可以把一个表(一行或者多行,每行有一列或者多列)聚合成另一张表,结果中可以有多行多列。 + +UDAGG mechanism + +上图展示了一个表值聚合函数的例子。假设你有一个饮料的表,这个表有 3 列,分别是 `id`、`name` 和 `price`,一共有 5 行。假设你需要找到价格最高的两个饮料,类似于 `top2()` 表值聚合函数。你需要遍历所有 5 行数据,结果是有 2 行数据的一个表。 + +用户自定义表值聚合函数是通过扩展 `TableAggregateFunction` 类来实现的。一个 `TableAggregateFunction` 的工作过程如下。首先,它需要一个 `accumulator`,这个 `accumulator` 负责存储聚合的中间结果。 通过调用 `TableAggregateFunction` 的 `createAccumulator` 方法来构造一个空的 accumulator。接下来,对于每一行数据,会调用 `accumulate` 方法来更新 accumulator。当所有数据都处理完之后,调用 `emitValue` 方法来计算和返回最终的结果。 + +**下面几个 `TableAggregateFunction` 的方法是必须要实现的:** + +- `createAccumulator()` +- `accumulate()` + +Flink 的类型推导在遇到复杂类型的时候可能会推导出错误的结果,比如那些非基本类型和普通的 POJO 类型的复杂类型。所以类似于 `ScalarFunction` 和 `TableFunction`,`TableAggregateFunction` 也提供了 `TableAggregateFunction#getResultType()` 和 `TableAggregateFunction#getAccumulatorType()` 方法来指定返回值类型和 accumulator 的类型,这两个方法都需要返回 `TypeInformation`。 + +除了上面的方法,还有几个其他的方法可以选择性的实现。有些方法可以让查询更加高效,而有些方法对于某些特定场景是必须要实现的。比如,在会话窗口(当两个会话窗口合并时会合并两个 accumulator)中使用聚合函数时,必须要实现`merge()` 方法。 + +**下面几个 `TableAggregateFunction` 的方法在某些特定场景下是必须要实现的:** + +- `retract()` 在 bounded `OVER` 窗口中的聚合函数必须要实现。 +- `merge()` 在许多批式聚合和以及流式会话和滑动窗口聚合中是必须要实现的。 +- `resetAccumulator()` 在许多批式聚合中是必须要实现的。 +- `emitValue()` 在批式聚合以及窗口聚合中是必须要实现的。 + +**下面的 `TableAggregateFunction` 的方法可以提升流式任务的效率:** + +- `emitUpdateWithRetract()` 在 retract 模式下,该方法负责发送被更新的值。 + +`emitValue` 方法会发送所有 accumulator 给出的结果。拿 TopN 来说,`emitValue` 每次都会发送所有的最大的 n 个值。这在流式任务中可能会有一些性能问题。为了提升性能,用户可以实现 `emitUpdateWithRetract` 方法。这个方法在 retract 模式下会增量的输出结果,比如有数据更新了,我们必须要撤回老的数据,然后再发送新的数据。如果定义了 `emitUpdateWithRetract` 方法,那它会优先于 `emitValue` 方法被使用,因为一般认为 `emitUpdateWithRetract` 会更加高效,因为它的输出是增量的。 + +`TableAggregateFunction` 的所有方法都必须是 `public` 的、非 `static` 的,而且名字必须跟上面提到的一样。`createAccumulator`、`getResultType` 和 `getAccumulatorType` 这三个方法是在抽象父类 `TableAggregateFunction` 中定义的,而其他的方法都是约定的方法。要实现一个表值聚合函数,你必须扩展 `org.apache.flink.table.functions.TableAggregateFunction`,并且实现一个(或者多个)`accumulate` 方法。`accumulate` 方法可以有多个重载的方法,也可以支持变长参数。 + +`TableAggregateFunction` 的所有方法的详细文档如下。 + +{{< tabs "be05e2c1-92b0-4373-8ef8-cf7dce67f85c" >}} +{{< tab "Java" >}} +```java + +/** + * Base class for user-defined aggregates and table aggregates. + * + * @param the type of the aggregation result. + * @param the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + */ +public abstract class UserDefinedAggregateFunction extends UserDefinedFunction { + + /** + * Creates and init the Accumulator for this (table)aggregate function. + * + * @return the accumulator with the initial value + */ + public ACC createAccumulator(); // MANDATORY + + /** + * Returns the TypeInformation of the (table)aggregate function's result. + * + * @return The TypeInformation of the (table)aggregate function's result or null if the result + * type should be automatically inferred. + */ + public TypeInformation getResultType = null; // PRE-DEFINED + + /** + * Returns the TypeInformation of the (table)aggregate function's accumulator. 
+ * + * @return The TypeInformation of the (table)aggregate function's accumulator or null if the + * accumulator type should be automatically inferred. + */ + public TypeInformation getAccumulatorType = null; // PRE-DEFINED +} + +/** + * Base class for table aggregation functions. + * + * @param the type of the aggregation result + * @param the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute a table aggregation result. + * TableAggregateFunction represents its state using accumulator, thereby the state of + * the TableAggregateFunction must be put into the accumulator. + */ +public abstract class TableAggregateFunction extends UserDefinedAggregateFunction { + + /** Processes the input values and update the provided accumulator instance. The method + * accumulate can be overloaded with different custom types and arguments. A TableAggregateFunction + * requires at least one accumulate() method. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + public void accumulate(ACC accumulator, [user defined inputs]); // MANDATORY + + /** + * Retracts the input values from the accumulator instance. The current design assumes the + * inputs are the values that have been previously accumulated. The method retract can be + * overloaded with different custom types and arguments. This function must be implemented for + * datastream bounded over aggregate. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + public void retract(ACC accumulator, [user defined inputs]); // OPTIONAL + + /** + * Merges a group of accumulator instances into one accumulator instance. This function must be + * implemented for datastream session window grouping aggregate and dataset grouping aggregate. + * + * @param accumulator the accumulator which will keep the merged aggregate results. It should + * be noted that the accumulator may contain the previous aggregated + * results. Therefore user should not replace or clean this instance in the + * custom merge method. + * @param its an {@link java.lang.Iterable} pointed to a group of accumulators that will be + * merged. + */ + public void merge(ACC accumulator, java.lang.Iterable its); // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. The returned value + * could be either an early and incomplete result (periodically emitted as data arrive) or + * the final result of the aggregation. + * + * @param accumulator the accumulator which contains the current + * aggregated results + * @param out the collector used to output data + */ + public void emitValue(ACC accumulator, Collector out); // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. The returned value + * could be either an early and incomplete result (periodically emitted as data arrive) or + * the final result of the aggregation. + * + * Different from emitValue, emitUpdateWithRetract is used to emit values that have been updated. + * This method outputs data incrementally in retract mode, i.e., once there is an update, we + * have to retract old records before sending new updated ones. 
The emitUpdateWithRetract + * method will be used in preference to the emitValue method if both methods are defined in the + * table aggregate function, because the method is treated to be more efficient than emitValue + * as it can outputvalues incrementally. + * + * @param accumulator the accumulator which contains the current + * aggregated results + * @param out the retractable collector used to output data. Use collect method + * to output(add) records and use retract method to retract(delete) + * records. + */ + public void emitUpdateWithRetract(ACC accumulator, RetractableCollector out); // OPTIONAL + + /** + * Collects a record and forwards it. The collector can output retract messages with the retract + * method. Note: only use it in {@code emitRetractValueIncrementally}. + */ + public interface RetractableCollector extends Collector { + + /** + * Retract a record. + * + * @param record The record to retract. + */ + void retract(T record); + } +} +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +/** + * Base class for user-defined aggregates and table aggregates. + * + * @tparam T the type of the aggregation result. + * @tparam ACC the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + */ +abstract class UserDefinedAggregateFunction[T, ACC] extends UserDefinedFunction { + + /** + * Creates and init the Accumulator for this (table)aggregate function. + * + * @return the accumulator with the initial value + */ + def createAccumulator(): ACC // MANDATORY + + /** + * Returns the TypeInformation of the (table)aggregate function's result. + * + * @return The TypeInformation of the (table)aggregate function's result or null if the result + * type should be automatically inferred. + */ + def getResultType: TypeInformation[T] = null // PRE-DEFINED + + /** + * Returns the TypeInformation of the (table)aggregate function's accumulator. + * + * @return The TypeInformation of the (table)aggregate function's accumulator or null if the + * accumulator type should be automatically inferred. + */ + def getAccumulatorType: TypeInformation[ACC] = null // PRE-DEFINED +} + +/** + * Base class for table aggregation functions. + * + * @tparam T the type of the aggregation result + * @tparam ACC the type of the aggregation accumulator. The accumulator is used to keep the + * aggregated values which are needed to compute an aggregation result. + * TableAggregateFunction represents its state using accumulator, thereby the state of + * the TableAggregateFunction must be put into the accumulator. + */ +abstract class TableAggregateFunction[T, ACC] extends UserDefinedAggregateFunction[T, ACC] { + + /** + * Processes the input values and update the provided accumulator instance. The method + * accumulate can be overloaded with different custom types and arguments. A TableAggregateFunction + * requires at least one accumulate() method. + * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + def accumulate(accumulator: ACC, [user defined inputs]): Unit // MANDATORY + + /** + * Retracts the input values from the accumulator instance. The current design assumes the + * inputs are the values that have been previously accumulated. The method retract can be + * overloaded with different custom types and arguments. This function must be implemented for + * datastream bounded over aggregate. 
+ * + * @param accumulator the accumulator which contains the current aggregated results + * @param [user defined inputs] the input value (usually obtained from a new arrived data). + */ + def retract(accumulator: ACC, [user defined inputs]): Unit // OPTIONAL + + /** + * Merges a group of accumulator instances into one accumulator instance. This function must be + * implemented for datastream session window grouping aggregate and dataset grouping aggregate. + * + * @param accumulator the accumulator which will keep the merged aggregate results. It should + * be noted that the accumulator may contain the previous aggregated + * results. Therefore user should not replace or clean this instance in the + * custom merge method. + * @param its an [[java.lang.Iterable]] pointed to a group of accumulators that will be + * merged. + */ + def merge(accumulator: ACC, its: java.lang.Iterable[ACC]): Unit // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. The returned value + * could be either an early and incomplete result (periodically emitted as data arrive) or + * the final result of the aggregation. + * + * @param accumulator the accumulator which contains the current + * aggregated results + * @param out the collector used to output data + */ + def emitValue(accumulator: ACC, out: Collector[T]): Unit // OPTIONAL + + /** + * Called every time when an aggregation result should be materialized. The returned value + * could be either an early and incomplete result (periodically emitted as data arrive) or + * the final result of the aggregation. + * + * Different from emitValue, emitUpdateWithRetract is used to emit values that have been updated. + * This method outputs data incrementally in retract mode, i.e., once there is an update, we + * have to retract old records before sending new updated ones. The emitUpdateWithRetract + * method will be used in preference to the emitValue method if both methods are defined in the + * table aggregate function, because the method is treated to be more efficient than emitValue + * as it can outputvalues incrementally. + * + * @param accumulator the accumulator which contains the current + * aggregated results + * @param out the retractable collector used to output data. Use collect method + * to output(add) records and use retract method to retract(delete) + * records. + */ + def emitUpdateWithRetract(accumulator: ACC, out: RetractableCollector[T]): Unit // OPTIONAL + + /** + * Collects a record and forwards it. The collector can output retract messages with the retract + * method. Note: only use it in `emitRetractValueIncrementally`. + */ + trait RetractableCollector[T] extends Collector[T] { + + /** + * Retract a record. + * + * @param record The record to retract. 
+ */ + def retract(record: T): Unit + } +} +``` +{{< /tab >}} +{{< /tabs >}} + + +下面的例子展示了如何 + +- 定义一个 `TableAggregateFunction` 来计算给定列的最大的 2 个值, +- 在 `TableEnvironment` 中注册函数, +- 在 Table API 查询中使用函数(当前只在 Table API 中支持 TableAggregateFunction)。 + +为了计算最大的 2 个值,accumulator 需要保存当前看到的最大的 2 个值。在我们的例子中,我们定义了类 `Top2Accum` 来作为 accumulator。Flink 的 checkpoint 机制会自动保存 accumulator,并且在失败时进行恢复,来保证精确一次的语义。 + +我们的 `Top2` 表值聚合函数(`TableAggregateFunction`)的 `accumulate()` 方法有两个输入,第一个是 `Top2Accum` accumulator,另一个是用户定义的输入:输入的值 `v`。尽管 `merge()` 方法在大多数聚合类型中不是必须的,我们也在样例中提供了它的实现。请注意,我们在 Scala 样例中也使用的是 Java 的基础类型,并且定义了 `getResultType()` 和 `getAccumulatorType()` 方法,因为 Flink 的类型推导对于 Scala 的类型推导支持的不是很好。 + +{{< tabs "1348443b-e70d-4e88-888a-69a21ecc7857" >}} +{{< tab "Java" >}} +```java +/** + * Accumulator for Top2. + */ +public class Top2Accum { + public Integer first; + public Integer second; +} + +/** + * The top2 user-defined table aggregate function. + */ +public static class Top2 extends TableAggregateFunction, Top2Accum> { + + @Override + public Top2Accum createAccumulator() { + Top2Accum acc = new Top2Accum(); + acc.first = Integer.MIN_VALUE; + acc.second = Integer.MIN_VALUE; + return acc; + } + + + public void accumulate(Top2Accum acc, Integer v) { + if (v > acc.first) { + acc.second = acc.first; + acc.first = v; + } else if (v > acc.second) { + acc.second = v; + } + } + + public void merge(Top2Accum acc, java.lang.Iterable iterable) { + for (Top2Accum otherAcc : iterable) { + accumulate(acc, otherAcc.first); + accumulate(acc, otherAcc.second); + } + } + + public void emitValue(Top2Accum acc, Collector> out) { + // emit the value and rank + if (acc.first != Integer.MIN_VALUE) { + out.collect(Tuple2.of(acc.first, 1)); + } + if (acc.second != Integer.MIN_VALUE) { + out.collect(Tuple2.of(acc.second, 2)); + } + } +} + +// 注册函数 +StreamTableEnvironment tEnv = ... +tEnv.registerFunction("top2", new Top2()); + +// 初始化表 +Table tab = ...; + +// 使用函数 +tab.groupBy("key") + .flatAggregate("top2(a) as (v, rank)") + .select("key, v, rank"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.lang.{Integer => JInteger} +import org.apache.flink.table.api.Types +import org.apache.flink.table.functions.TableAggregateFunction + +/** + * Accumulator for top2. + */ +class Top2Accum { + var first: JInteger = _ + var second: JInteger = _ +} + +/** + * The top2 user-defined table aggregate function. + */ +class Top2 extends TableAggregateFunction[JTuple2[JInteger, JInteger], Top2Accum] { + + override def createAccumulator(): Top2Accum = { + val acc = new Top2Accum + acc.first = Int.MinValue + acc.second = Int.MinValue + acc + } + + def accumulate(acc: Top2Accum, v: Int) { + if (v > acc.first) { + acc.second = acc.first + acc.first = v + } else if (v > acc.second) { + acc.second = v + } + } + + def merge(acc: Top2Accum, its: JIterable[Top2Accum]): Unit = { + val iter = its.iterator() + while (iter.hasNext) { + val top2 = iter.next() + accumulate(acc, top2.first) + accumulate(acc, top2.second) + } + } + + def emitValue(acc: Top2Accum, out: Collector[JTuple2[JInteger, JInteger]]): Unit = { + // emit the value and rank + if (acc.first != Int.MinValue) { + out.collect(JTuple2.of(acc.first, 1)) + } + if (acc.second != Int.MinValue) { + out.collect(JTuple2.of(acc.second, 2)) + } + } +} + +// 初始化表 +val tab = ... 
+ +// 使用函数 +tab + .groupBy('key) + .flatAggregate(top2('a) as ('v, 'rank)) + .select('key, 'v, 'rank) + +``` +{{< /tab >}} +{{< /tabs >}} + + +下面的例子展示了如何使用 `emitUpdateWithRetract` 方法来只发送更新的数据。为了只发送更新的结果,accumulator 保存了上一次的最大的2个值,也保存了当前最大的2个值。注意:如果 TopN 中的 n 非常大,这种既保存上次的结果,也保存当前的结果的方式不太高效。一种解决这种问题的方式是把输入数据直接存储到 `accumulator` 中,然后在调用 `emitUpdateWithRetract` 方法时再进行计算。 + +{{< tabs "e0d841fe-8d95-4706-9e19-e76141171966" >}} +{{< tab "Java" >}} +```java +/** + * Accumulator for Top2. + */ +public class Top2Accum { + public Integer first; + public Integer second; + public Integer oldFirst; + public Integer oldSecond; +} + +/** + * The top2 user-defined table aggregate function. + */ +public static class Top2 extends TableAggregateFunction, Top2Accum> { + + @Override + public Top2Accum createAccumulator() { + Top2Accum acc = new Top2Accum(); + acc.first = Integer.MIN_VALUE; + acc.second = Integer.MIN_VALUE; + acc.oldFirst = Integer.MIN_VALUE; + acc.oldSecond = Integer.MIN_VALUE; + return acc; + } + + public void accumulate(Top2Accum acc, Integer v) { + if (v > acc.first) { + acc.second = acc.first; + acc.first = v; + } else if (v > acc.second) { + acc.second = v; + } + } + + public void emitUpdateWithRetract(Top2Accum acc, RetractableCollector> out) { + if (!acc.first.equals(acc.oldFirst)) { + // if there is an update, retract old value then emit new value. + if (acc.oldFirst != Integer.MIN_VALUE) { + out.retract(Tuple2.of(acc.oldFirst, 1)); + } + out.collect(Tuple2.of(acc.first, 1)); + acc.oldFirst = acc.first; + } + + if (!acc.second.equals(acc.oldSecond)) { + // if there is an update, retract old value then emit new value. + if (acc.oldSecond != Integer.MIN_VALUE) { + out.retract(Tuple2.of(acc.oldSecond, 2)); + } + out.collect(Tuple2.of(acc.second, 2)); + acc.oldSecond = acc.second; + } + } +} + +// 注册函数 +StreamTableEnvironment tEnv = ... +tEnv.registerFunction("top2", new Top2()); + +// 初始化表 +Table tab = ...; + +// 使用函数 +tab.groupBy("key") + .flatAggregate("top2(a) as (v, rank)") + .select("key, v, rank"); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +import java.lang.{Integer => JInteger} +import org.apache.flink.table.api.Types +import org.apache.flink.table.functions.TableAggregateFunction + +/** + * Accumulator for top2. + */ +class Top2Accum { + var first: JInteger = _ + var second: JInteger = _ + var oldFirst: JInteger = _ + var oldSecond: JInteger = _ +} + +/** + * The top2 user-defined table aggregate function. + */ +class Top2 extends TableAggregateFunction[JTuple2[JInteger, JInteger], Top2Accum] { + + override def createAccumulator(): Top2Accum = { + val acc = new Top2Accum + acc.first = Int.MinValue + acc.second = Int.MinValue + acc.oldFirst = Int.MinValue + acc.oldSecond = Int.MinValue + acc + } + + def accumulate(acc: Top2Accum, v: Int) { + if (v > acc.first) { + acc.second = acc.first + acc.first = v + } else if (v > acc.second) { + acc.second = v + } + } + + def emitUpdateWithRetract( + acc: Top2Accum, + out: RetractableCollector[JTuple2[JInteger, JInteger]]) + : Unit = { + if (acc.first != acc.oldFirst) { + // if there is an update, retract old value then emit new value. + if (acc.oldFirst != Int.MinValue) { + out.retract(JTuple2.of(acc.oldFirst, 1)) + } + out.collect(JTuple2.of(acc.first, 1)) + acc.oldFirst = acc.first + } + if (acc.second != acc.oldSecond) { + // if there is an update, retract old value then emit new value. 
+ if (acc.oldSecond != Int.MinValue) { + out.retract(JTuple2.of(acc.oldSecond, 2)) + } + out.collect(JTuple2.of(acc.second, 2)) + acc.oldSecond = acc.second + } + } +} + +// 初始化表 +val tab = ... + +// 使用函数 +tab + .groupBy('key) + .flatAggregate(top2('a) as ('v, 'rank)) + .select('key, 'v, 'rank) + +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/legacy_planner.md b/docs/content.zh/docs/dev/table/legacy_planner.md new file mode 100644 index 0000000000000..696a8bd6cd451 --- /dev/null +++ b/docs/content.zh/docs/dev/table/legacy_planner.md @@ -0,0 +1,342 @@ +--- +title: "Legacy Planner" +weight: 1002 +type: docs +aliases: + - /zh/dev/table/legacy_planner.html +--- + + +# Legacy Planner + +Table planners are responsible for translating relational operators into an executable, optimized Flink job. +Flink supports two different planner implementations; the modern planner (sometimes referred to as `Blink`) and the legacy planner. +For production use cases, we recommend the modern planner which is the default. + +The legacy planner is in maintenance mode and no longer under active development. +The primary reason to continue using the legacy planner is [DataSet]({{< ref "docs/dev/dataset/overview" >}}) interop. + +{{< hint warning >}} +If you are not using the Legacy planner for DataSet interop, the community strongly +encourages you to consider the modern table planner. Both batch and stream processing pipelines +can be expressed in the unified `TableEnvironment`. + +** The legacy planner is deprecated and will be dropped in Flink 1.14.** +{{< /hint >}} + +This page describes how to use the Legacy planner and where its semantics differ from the +modern planner. + + + +## Setup + +### Dependencies + +When deploying to a cluster, the legacy planner is bundled in Flinks distribution by default. +If you want to run the Table API & SQL programs locally within your IDE, you must add the +following set of modules to your application. + +```xml + + org.apache.flink + flink-table-planner{{< scala_version >}} + {{< version >}} + provided + + + org.apache.flink + flink-streaming-scala{{< scala_version >}} + {{< version >}} + provided + +``` + +### Configuring the TableEnvironment + +When creating a `TableEnvironment` the Legacy planner is configured via the `EnvironmentSettings`. + +{{< tabs "95cbb391-0326-4830-b38e-a5871c87d19f" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings + .newInstance() + .useOldPlanner() + .inStreamingMode() + // or in batch mode + //.inBatchMode() + .build(); + +TableEnvironment tEnv = TableEnvironment.create(settings); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings + .newInstance() + .useOldPlanner() + .inStreamingMode() + // or in batch mode + //.inBatchMode() + .build() + +val tEnv = TableEnvironment.create(settings) +``` +{{< /tab >}} +{{< /tabs >}} + +`BatchTableEnvironment` may used for [DataSet]({{< ref "docs/dev/dataset/overview" >}}) and [DataStream]({{< ref "docs/dev/datastream/overview" >}}) interop respectively. 
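+
+For [DataStream]({{< ref "docs/dev/datastream/overview" >}}) interop, a `StreamTableEnvironment`
+created with the old-planner settings shown above can be used instead. A minimal sketch (assuming
+the `settings` object from the previous snippet) could look like the following, while the tabs
+below show the `BatchTableEnvironment` variant for DataSet programs.
+
+```java
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
+
+// sketch only: a StreamTableEnvironment for DataStream interop with the legacy planner,
+// reusing the EnvironmentSettings built with useOldPlanner() above
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+StreamTableEnvironment tEnv = StreamTableEnvironment.create(env, settings);
+```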
+ +{{< tabs "3d8abf94-f59c-4756-a5c0-7b645bf43870" >}} +{{< tab "Java" >}} +```java +ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); +BatchTableEnvironment tEnv = BatchTableEnvironment.create(env); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = ExecutionEnvironment.getExecutionEnvironment() +val tEnv = BatchTableEnvironment.create(env) +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.dataset import ExecutionEnvironment +from pyflink.table import BatchTableEnvironment + +b_env = ExecutionEnvironment.get_execution_environment() +t_env = BatchTableEnvironment.create(b_env, table_config) +``` +{{< /tab >}} +{{< /tabs >}} + +## Integration with DataSet + +The primary use case for the Legacy planner is interoperation with the DataSet API. +To translate `DataSet`s to and from tables, applications must use the `BatchTableEnvironment`. + +### Create a View from a DataSet + +A `DataSet` can be registered in a `BatchTableEnvironment` as a `View`. +The schema of the resulting view depends on the data type of the registered collection. + +**Note:** Views created from a `DataSet` can be registered as temporary views only. + +{{< tabs "02002c94-df73-4f10-b7b7-6c0f6fbf5909" >}} +{{< tab "Java" >}} +```java +BatchTableEnvironment tEnv = ...; +DataSet> dataset = ...; + +tEnv.createTemporaryView("my-table", dataset, $("myLong"), $("myString")) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```java +val tEnv: BatchTableEnvironment = ??? +val dataset: DataSet[(Long, String)] = ??? + +tEnv.createTemporaryView("my-table", dataset, $"myLong", $"myString") +``` +{{< /tab >}} +{{< /tabs >}} + +### Create a Table from a DataSet + +A `DataSet` can be directly converted to a `Table` in a `BatchTableEnvironment`. +The schema of the resulting view depends on the data type of the registered collection. + +{{< tabs "9bc01f1c-f24c-4be7-9c65-7f6575a12740" >}} +{{< tab "Java" >}} +```java +BatchTableEnvironment tEnv = ...; +DataSet> dataset = ...; + +Table myTable = tEnv.fromDataSet("my-table", dataset, $("myLong"), $("myString")) +``` +{{< /tab >}} +{{< tab "Scala" >}} +```java +val tEnv: BatchTableEnvironment = ??? +val dataset: DataSet[(Long, String)] = ??? + +val table = tEnv.fromDataSet("my-table", dataset, $"myLong", $"myString") +``` +{{< /tab >}} +{{< /tabs >}} + +### Convert a Table to a DataSet + +A `Table` can be converted to a `DataSet`. +In this way, custom DataSet programs can be run on the result of a Table API or SQL query. + +When converting from a `Table`, users must specify the data type of the results. +Often the most convenient conversion type is `Row`. +The following list gives an overview of the features of the different options. + +- **Row**: fields are mapped by position, arbitrary number of fields, support for `null` values, no type-safe access. +- **POJO**: fields are mapped by name (POJO fields must be named as `Table` fields), arbitrary number of fields, support for `null` values, type-safe access. +- **Case Class**: fields are mapped by position, no support for `null` values, type-safe access. +- **Tuple**: fields are mapped by position, limitation to 22 (Scala) or 25 (Java) fields, no support for `null` values, type-safe access. +- **Atomic Type**: `Table` must have a single field, no support for `null` values, type-safe access. 
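+
+For the `POJO` option listed above, a minimal sketch could look like the following. The `Person`
+class is a hypothetical example; its public fields must be named like the fields of the `Table`
+(here `name` and `age`), and `tableEnv` and `table` are assumed to be the `BatchTableEnvironment`
+and `Table` from the surrounding examples.
+
+```java
+// hypothetical POJO whose public field names match the Table schema ("name", "age")
+public static class Person {
+    public String name;
+    public Integer age;
+}
+
+// convert the Table into a DataSet of Person by specifying the POJO class
+DataSet<Person> dsPojo = tableEnv.toDataSet(table, Person.class);
+```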
+ +{{< tabs "c34bd733-f34e-4780-8cf0-c440465c9f58" >}} +{{< tab "Java" >}} +```java +BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env); + +Table table = tableEnv.fromValues( + DataTypes.Row( + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("age", DataTypes.INT()), + row("john", 35), + row("sarah", 32)); + +// Convert the Table into a DataSet of Row by specifying a class +DataSet dsRow = tableEnv.toDataSet(table, Row.class); + +// Convert the Table into a DataSet of Tuple2 via a TypeInformation +TupleTypeInfo> tupleType = new TupleTypeInfo<>(Types.STRING(), Types.INT()); +DataSet> dsTuple = tableEnv.toDataSet(table, tupleType); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val tableEnv = BatchTableEnvironment.create(env) + +val table = tableEnv.fromValues( + DataTypes.Row( + DataTypes.FIELD("name", DataTypes.STRING()), + DataTypes.FIELD("age", DataTypes.INT()), + row("john", 35), + row("sarah", 32)); + +// Convert the Table into a DataSet of Row +val dsRow: DataSet[Row] = tableEnv.toDataSet[Row](table) + +// Convert the Table into a DataSet of Tuple2[String, Int] +val dsTuple: DataSet[(String, Int)] = tableEnv.toDataSet[(String, Int)](table) +``` +{{< /tab >}} +{{< /tabs >}} + +Attention **Once the Table is converted to a DataSet, we must use the ExecutionEnvironment.execute method to execute the DataSet program.** + +## Data Types + +The legacy planner, introduced before Flink 1.9, primarily supports type information. +It has only limited support for data types. +It is possible to declare data types that can be translated into type information such that the legacy planner understands them. + +The following table summarizes the difference between data type and type information. +Most simple types, as well as the row type remain the same. +Time types, array types, and the decimal type need special attention. +Other hints as the ones mentioned are not allowed. + +For the *Type Information* column the table omits the prefix `org.apache.flink.table.api.Types`. + +For the *Data Type Representation* column the table omits the prefix `org.apache.flink.table.api.DataTypes`. + +| Type Information | Java Expression String | Data Type Representation | Remarks for Data Type | +|:-----------------|:-----------------------|:-------------------------|:----------------------| +| `STRING()` | `STRING` | `STRING()` | | +| `BOOLEAN()` | `BOOLEAN` | `BOOLEAN()` | | +| `BYTE()` | `BYTE` | `TINYINT()` | | +| `SHORT()` | `SHORT` | `SMALLINT()` | | +| `INT()` | `INT` | `INT()` | | +| `LONG()` | `LONG` | `BIGINT()` | | +| `FLOAT()` | `FLOAT` | `FLOAT()` | | +| `DOUBLE()` | `DOUBLE` | `DOUBLE()` | | +| `ROW(...)` | `ROW<...>` | `ROW(...)` | | +| `BIG_DEC()` | `DECIMAL` | [`DECIMAL()`] | Not a 1:1 mapping as precision and scale are ignored and Java's variable precision and scale are used. | +| `SQL_DATE()` | `SQL_DATE` | `DATE()`
    `.bridgedTo(java.sql.Date.class)` | |
+| `SQL_TIME()` | `SQL_TIME` | `TIME(0)`<br>`.bridgedTo(java.sql.Time.class)` | |
+| `SQL_TIMESTAMP()` | `SQL_TIMESTAMP` | `TIMESTAMP(3)`<br>`.bridgedTo(java.sql.Timestamp.class)` | |
+| `INTERVAL_MONTHS()` | `INTERVAL_MONTHS` | `INTERVAL(MONTH())`<br>`.bridgedTo(Integer.class)` | |
+| `INTERVAL_MILLIS()` | `INTERVAL_MILLIS` | `INTERVAL(DataTypes.SECOND(3))`<br>`.bridgedTo(Long.class)` | |
+| `PRIMITIVE_ARRAY(...)` | `PRIMITIVE_ARRAY<...>` | `ARRAY(DATATYPE.notNull()`<br>`.bridgedTo(PRIMITIVE.class))` | Applies to all JVM primitive types except for `byte`. |
+| `PRIMITIVE_ARRAY(BYTE())` | `PRIMITIVE_ARRAY` | `BYTES()` | |
+| `OBJECT_ARRAY(...)` | `OBJECT_ARRAY<...>` | `ARRAY(`
    `DATATYPE.bridgedTo(OBJECT.class))` | | +| `MULTISET(...)` | | `MULTISET(...)` | | +| `MAP(..., ...)` | `MAP<...,...>` | `MAP(...)` | | +| other generic types | | `RAW(...)` | | + +Attention If there is a problem with the new type system. Users +can fallback to type information defined in `org.apache.flink.table.api.Types` at any time. + +## Unsupported Features + +The following features are not supported by the legacy planner. + +- [Deduplication]({{< ref "docs/dev/table/sql/queries" >}}#deduplication %}) +- [Key Value Configurations]({{< ref "docs/dev/table/config" >}}#overview) +- [Streaming Aggregation Optimization]({{< ref "docs/dev/table/tuning" >}}) +- Streaming mode Grouping sets, Rollup and Cube aggregations +- [Top-N]({{< ref "docs/dev/table/sql/queries" >}}#top-n) +- [Versioned Tables]({{< ref "docs/dev/table/concepts/versioned_tables" >}}) + +## Unsupported Built-In Functions + +The following built-in functions are not supported by the legacy planner. + +- `PI` +- `ASCII(string)` +- `CHR(integer)` +- `DECODE(binary, string)` +- `ENCODE(string1, string2)` +- `INSTR(string1, string2)` +- `LEFT(string, integer)` +- `RIGHT(string, integer)` +- `LOCATE(string1, string2[, integer])` +- `PARSE_URL(string1, string2[, string3])` +- `REGEXP(string1, string2)` +- `REVERSE(string)` +- `SPLIT_INDEX(string1, string2, integer1)` +- `STR_TO_MAP(string1[, string2, string3]])` +- `SUBSTR(string[, integer1[, integer2]])` +- `CONVERT_TZ(string1, string2, string3)` +- `FROM_UNIXTIME(numeric[, string])` +- `UNIX_TIMESTAMP()` +- `UNIX_TIMESTAMP(string1[, string2])` +- `TO_DATE(string1[, string2])` +- `TO_TIMESTAMP(string1[, string2])` +- `NOW()` +- `IF(condition, true_value, false_value)` +- `IS_ALPHA(string)` +- `IS_DECIMAL(string)` +- `IS_DIGIT(string)` +- `VARIANCE([ ALL | DISTINCT ] expression)` +- `RANK()` +- `DENSE_RANK()` +- `ROW_NUMBER()` +- `LEAD(expression [, offset] [, default] )` +- `LAG(expression [, offset] [, default])` +- `FIRST_VALUE(expression)` +- `LAST_VALUE(expression)` +- `LISTAGG(expression [, separator])` + +{{< hint danger >}} +`DATE_FORMAT(timestamp, string)` is available in the legacy planner but has serious bugs and should not be used. +Please implement a custom UDF instead or use `EXTRACT` as a workaround. +{{< /hint >}} + + + diff --git a/docs/content.zh/docs/dev/table/modules.md b/docs/content.zh/docs/dev/table/modules.md new file mode 100644 index 0000000000000..b0072f5c9cb39 --- /dev/null +++ b/docs/content.zh/docs/dev/table/modules.md @@ -0,0 +1,716 @@ +--- +title: "模块" +is_beta: true +weight: 71 +type: docs +aliases: + - /zh/dev/table/modules.html +--- + + +# Modules + +Modules allow users to extend Flink's built-in objects, such as defining functions that behave like Flink +built-in functions. They are pluggable, and while Flink provides a few pre-built modules, users can write +their own. + +For example, users can define their own geo functions and plug them into Flink as built-in functions to be used in +Flink SQL and Table APIs. Another example is users can load an out-of-shelf Hive module to use Hive built-in +functions as Flink built-in functions. + +## Module Types + +### CoreModule + +`CoreModule` contains all of Flink's system (built-in) functions and is loaded and enabled by default. + +### HiveModule + +The `HiveModule` provides Hive built-in functions as Flink's system functions to SQL and Table API users. +Flink's [Hive documentation]({{< ref "docs/connectors/table/hive/hive_functions" >}}) provides full details on setting up the module. 
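+
+As a minimal sketch (the Hive version `"2.3.6"` is only a hypothetical example, and the
+`flink-connector-hive` dependency is assumed to be on the classpath), the module can be loaded
+programmatically like this:
+
+```java
+import org.apache.flink.table.api.EnvironmentSettings;
+import org.apache.flink.table.api.TableEnvironment;
+import org.apache.flink.table.module.hive.HiveModule;
+
+TableEnvironment tableEnv = TableEnvironment.create(EnvironmentSettings.newInstance().build());
+
+// load the Hive module for a specific (hypothetical) Hive version; afterwards, Hive built-in
+// functions can be resolved like Flink system functions
+tableEnv.loadModule("hive", new HiveModule("2.3.6"));
+```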
+ +### User-Defined Module + +Users can develop custom modules by implementing the `Module` interface. +To use custom modules in SQL CLI, users should develop both a module and its corresponding module factory by implementing +the `ModuleFactory` interface. + +A module factory defines a set of properties for configuring the module when the SQL CLI bootstraps. +Properties are passed to a discovery service where the service tries to match the properties to + a `ModuleFactory` and instantiate a corresponding module instance. + +## Module Lifecycle and Resolution Order + +A module can be loaded, enabled, disabled and unloaded. When TableEnvironment loads a module initially, it enables the module by default. Flink supports multiple modules and keeps track of the loading order to resolve metadata. +Besides, Flink only resolves the functions among enabled modules. *E.g.*, when there are two functions of the same name residing in two modules, there will be three conditions. +- If both of the modules are enabled, then Flink resolves the function according to the resolution order of the modules. +- If one of them is disabled, then Flink resolves the function to the enabled module. +- If both of the modules are disabled, then Flink cannot resolve the function. + +Users can change the resolution order by using modules in a different declared order. *E.g.*, users can specify Flink to find functions first in Hive by `USE MODULES hive, core`. + +Besides, users can also disable modules by not declaring them. *E.g.*, users can specify Flink to disable core module by `USE MODULES hive` (However, it is strongly not recommended disabling core module). Disable a module does not unload it, and users can enable it again by using it. *E.g.*, users can bring back core module and place it in the first by `USE MODULES core, hive`. A module can be enabled only when it is loaded already. Using an unloaded module will throw an Exception. Eventually, users can unload a module. + +The difference between disabling and unloading a module is that TableEnvironment still keeps the disabled modules, and users can list all loaded modules to view the disabled modules. + +## Namespace + +Objects provided by modules are considered part of Flink's system (built-in) objects; thus, they don't have any namespaces. + +## How to Load, Unload, Use and List Modules + +### Using SQL + +Users can use SQL to load/unload/use/list modules in both Table API and SQL CLI. 
+ +{{< tabs "SQL snippets" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build(); +TableEnvironment tableEnv = TableEnvironment.create(setting); + +// Show initially loaded and enabled modules +tableEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// +-------------+------+ + +// Load a hive module +tableEnv.executeSql("LOAD MODULE hive WITH ('hive-version' = '...')"); + +// Show all enabled modules +tableEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +// Show all loaded modules with both name and use status +tableEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// | hive | true | +// +-------------+------+ + +// Change resolution order +tableEnv.executeSql("USE MODULES hive, core"); +tableEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// | core | +// +-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | hive | true | +// | core | true | +// +-------------+------+ + +// Disable core module +tableEnv.executeSql("USE MODULES hive"); +tableEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// +-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ + +// Unload hive module +tableEnv.executeSql("UNLOAD MODULE hive"); +tableEnv.executeSql("SHOW MODULES").print(); +// Empty set +tableEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build() +val tableEnv = TableEnvironment.create(setting) + +// Show initially loaded and enabled modules +tableEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// +-------------+------+ + +// Load a hive module +tableEnv.executeSql("LOAD MODULE hive WITH ('hive-version' = '...')") + +// Show all enabled modules +tableEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +// Show all loaded modules with both name and use status +tableEnv.executeSql("SHOW FULL MODULES") +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// | hive | true | +// +-------------+------+ + +// Change resolution order +tableEnv.executeSql("USE MODULES hive, core") +tableEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// | core | +// 
+-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | hive | true | +// | core | true | +// +-------------+------+ + +// Disable core module +tableEnv.executeSql("USE MODULES hive") +tableEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// +-------------+ +tableEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ + +// Unload hive module +tableEnv.executeSql("UNLOAD MODULE hive") +tableEnv.executeSql("SHOW MODULES").print() +// Empty set +tableEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.table import * + +# environment configuration +settings = EnvironmentSettings.new_instance().use_blink_planner().build() +t_env = TableEnvironment.create(settings) + +# Show initially loaded and enabled modules +t_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# +-------------+ +t_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | core | true | +# +-------------+------+ + +# Load a hive module +t_env.execute_sql("LOAD MODULE hive WITH ('hive-version' = '...')") + +# Show all enabled modules +t_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# | hive | +# +-------------+ + +# Show all loaded modules with both name and use status +t_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | core | true | +# | hive | true | +# +-------------+------+ + +# Change resolution order +t_env.execute_sql("USE MODULES hive, core") +t_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | hive | +# | core | +# +-------------+ +t_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | hive | true | +# | core | true | +# +-------------+------+ + +# Disable core module +t_env.execute_sql("USE MODULES hive") +t_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | hive | +# +-------------+ +t_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | hive | true | +# | core | false | +# +-------------+-------+ + +# Unload hive module +t_env.execute_sql("UNLOAD MODULE hive") +t_env.execute_sql("SHOW MODULES").print() +# Empty set +t_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | hive | false | +# +-------------+-------+ +``` +{{< /tab >}} +{{< tab "SQL Client" >}} +```sql +-- Show initially loaded and enabled modules +Flink SQL> SHOW MODULES; ++-------------+ +| module name | ++-------------+ +| core | ++-------------+ +1 row in set +Flink SQL> SHOW FULL MODULES; ++-------------+------+ +| module name | used | ++-------------+------+ +| core | true | ++-------------+------+ +1 row in set + +-- Load a hive module +Flink SQL> LOAD MODULE hive WITH 
('hive-version' = '...'); + +-- Show all enabled modules +Flink SQL> SHOW MODULES; ++-------------+ +| module name | ++-------------+ +| core | +| hive | ++-------------+ +2 rows in set + +-- Show all loaded modules with both name and use status +Flink SQL> SHOW FULL MODULES; ++-------------+------+ +| module name | used | ++-------------+------+ +| core | true | +| hive | true | ++-------------+------+ +2 rows in set + +-- Change resolution order +Flink SQL> USE MODULES hive, core ; +Flink SQL> SHOW MODULES; ++-------------+ +| module name | ++-------------+ +| hive | +| core | ++-------------+ +2 rows in set +Flink SQL> SHOW FULL MODULES; ++-------------+------+ +| module name | used | ++-------------+------+ +| hive | true | +| core | true | ++-------------+------+ +2 rows in set + +-- Unload hive module +Flink SQL> UNLOAD MODULE hive; +Flink SQL> SHOW MODULES; +Empty set +Flink SQL> SHOW FULL MODULES; ++-------------+-------+ +| module name | used | ++-------------+-------+ +| hive | false | ++-------------+-------+ +1 row in set +``` +{{< /tab >}} +{{< tab "YAML" >}} + +All modules defined using YAML must provide a `type` property that specifies the type. +The following types are supported out of the box. + + + + + + + + + + + + + + + + + + +
+| Module | Type Value |
+|:-----------|:-----------|
+| CoreModule | core |
+| HiveModule | hive |
    + +```yaml +modules: + - name: core + type: core + - name: hive + type: hive +``` +{{< /tab >}} +{{< /tabs >}} +{{< hint warning >}} +When using SQL, module name is used to perform the module discovery. It is parsed as a simple identifier and case-sensitive. +{{< /hint >}} + +### Using Java, Scala or Python + +Users can use Java, Scala or Python to load/unload/use/list modules programmatically. + +{{< tabs "API snippets" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build(); +TableEnvironment tableEnv = TableEnvironment.create(setting); + +// Show initially loaded and enabled modules +tableEnv.listModules(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ +tableEnv.listFullModules(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// +-------------+------+ + +// Load a hive module +tableEnv.loadModule("hive", new HiveModule()); + +// Show all enabled modules +tableEnv.listModules(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +// Show all loaded modules with both name and use status +tableEnv.listFullModules(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// | hive | true | +// +-------------+------+ + +// Change resolution order +tableEnv.useModules("hive", "core"); +tableEnv.listModules(); +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// | core | +// +-------------+ +tableEnv.listFullModules(); +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | hive | true | +// | core | true | +// +-------------+------+ + +// Disable core module +tableEnv.useModules("hive"); +tableEnv.listModules(); +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// +-------------+ +tableEnv.listFullModules(); +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ + +// Unload hive module +tableEnv.unloadModule("hive"); +tableEnv.listModules(); +// Empty set +tableEnv.listFullModules(); +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build() +val tableEnv = TableEnvironment.create(setting) + +// Show initially loaded and enabled modules +tableEnv.listModules() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ +tableEnv.listFullModules() +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// +-------------+------+ + +// Load a hive module +tableEnv.loadModule("hive", new HiveModule()) + +// Show all enabled modules +tableEnv.listModules() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +// Show all loaded modules with both name and use status +tableEnv.listFullModules() +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | core | true | +// | hive | true | +// +-------------+------+ + +// Change resolution order +tableEnv.useModules("hive", "core") +tableEnv.listModules() +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// | core | +// 
+-------------+ +tableEnv.listFullModules() +// +-------------+------+ +// | module name | used | +// +-------------+------+ +// | hive | true | +// | core | true | +// +-------------+------+ + +// Disable core module +tableEnv.useModules("hive") +tableEnv.listModules() +// +-------------+ +// | module name | +// +-------------+ +// | hive | +// +-------------+ +tableEnv.listFullModules() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ + +// Unload hive module +tableEnv.unloadModule("hive") +tableEnv.listModules() +// Empty set +tableEnv.listFullModules() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +from pyflink.table import * + +# environment configuration +settings = EnvironmentSettings.new_instance().use_blink_planner().build() +t_env = TableEnvironment.create(settings) + +# Show initially loaded and enabled modules +t_env.list_modules() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# +-------------+ +t_env.list_full_modules() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | core | true | +# +-------------+------+ + +# Load a hive module +t_env.load_module("hive", HiveModule()) + +# Show all enabled modules +t_env.list_modules() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# | hive | +# +-------------+ + +# Show all loaded modules with both name and use status +t_env.list_full_modules() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | core | true | +# | hive | true | +# +-------------+------+ + +# Change resolution order +t_env.use_modules("hive", "core") +t_env.list_modules() +# +-------------+ +# | module name | +# +-------------+ +# | hive | +# | core | +# +-------------+ +t_env.list_full_modules() +# +-------------+------+ +# | module name | used | +# +-------------+------+ +# | hive | true | +# | core | true | +# +-------------+------+ + +# Disable core module +t_env.use_modules("hive") +t_env.list_modules() +# +-------------+ +# | module name | +# +-------------+ +# | hive | +# +-------------+ +t_env.list_full_modules() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | hive | true | +# | core | false | +# +-------------+-------+ + +# Unload hive module +t_env.unload_module("hive") +t_env.list_modules() +# Empty set +t_env.list_full_modules() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | hive | false | +# +-------------+-------+ +``` +{{< /tab >}} +{{< /tabs >}} +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/overview.md b/docs/content.zh/docs/dev/table/overview.md new file mode 100644 index 0000000000000..c5eb5289559bf --- /dev/null +++ b/docs/content.zh/docs/dev/table/overview.md @@ -0,0 +1,147 @@ +--- +title: "概览" +is_beta: false +weight: 1 +type: docs +aliases: + - /zh/dev/table/ +--- + + +# Table API & SQL + +Apache Flink 有两种关系型 API 来做流批统一处理:Table API 和 SQL。Table API 是用于 Scala 和 Java 语言的查询API,它可以用一种非常直观的方式来组合使用选取、过滤、join 等关系型算子。Flink SQL 是基于 [Apache Calcite](https://calcite.apache.org) 来实现的标准 SQL。这两种 API 中的查询对于批(DataSet)和流(DataStream)的输入有相同的语义,也会产生同样的计算结果。 + +Table API 和 SQL 两种 API 是紧密集成的,以及 DataStream 和 DataSet API。你可以在这些 API 之间,以及一些基于这些 API 的库之间轻松的切换。比如,你可以先用 [CEP]({{< ref "docs/libs/cep" >}}) 从 DataStream 中做模式匹配,然后用 Table API 
来分析匹配的结果;或者你可以用 SQL 来扫描、过滤、聚合一个批式的表,然后再跑一个 [Gelly 图算法]({{< ref "docs/libs/gelly/overview" >}}) 来处理已经预处理好的数据。 + +**注意:Table API 和 SQL 现在还处于活跃开发阶段,还没有完全实现所有的特性。不是所有的 \[Table API,SQL\] 和 \[流,批\] 的组合都是支持的。** + +依赖图 +-------------------- + +从1.9开始,Flink 提供了两个 Table Planner 实现来执行 Table API 和 SQL 程序:Blink Planner 和 Old Planner,Old Planner 在1.9之前就已经存在了。 +Planner 的作用主要是把关系型的操作翻译成可执行的、经过优化的 Flink 任务。两种 Planner 所使用的优化规则以及运行时类都不一样。 +它们在支持的功能上也有些差异。 + +注意 对于生产环境,我们建议使用在1.11版本之后已经变成默认的Blink Planner。 + +所有的 Table API 和 SQL 的代码都在 `flink-table` 或者 `flink-table-blink` Maven artifacts 下。 + +下面是各个依赖: + +* `flink-table-common`: 公共模块,比如自定义函数、格式等需要依赖的。 +* `flink-table-api-java`: Table 和 SQL API,使用 Java 语言编写的,给纯 table 程序使用(还在早期开发阶段,不建议使用) +* `flink-table-api-scala`: Table 和 SQL API,使用 Scala 语言编写的,给纯 table 程序使用(还在早期开发阶段,不建议使用) +* `flink-table-api-java-bridge`: Table 和 SQL API 结合 DataStream/DataSet API 一起使用,给 Java 语言使用。 +* `flink-table-api-scala-bridge`: Table 和 SQL API 结合 DataStream/DataSet API 一起使用,给 Scala 语言使用。 +* `flink-table-planner`: table Planner 和运行时。这是在1.9之前 Flink 的唯一的 Planner,但是从1.11版本开始我们不推荐继续使用。 +* `flink-table-planner-blink`: 新的 Blink Planner,从1.11版本开始成为默认的 Planner。 +* `flink-table-runtime-blink`: 新的 Blink 运行时。 +* `flink-table-uber`: 把上述模块以及 Old Planner 打包到一起,可以在大部分 Table & SQL API 场景下使用。打包到一起的 jar 文件 `flink-table-*.jar` 默认会直接放到 Flink 发行版的 `/lib` 目录下。 +* `flink-table-uber-blink`: 把上述模块以及 Blink Planner 打包到一起,可以在大部分 Table & SQL API 场景下使用。打包到一起的 jar 文件 `flink-table-blink-*.jar` 默认会放到 Flink 发行版的 `/lib` 目录下。 + +关于如何使用 Old Planner 以及 Blink Planner,可以参考[公共 API](common.html)。 + +### Table 程序依赖 + +取决于你使用的编程语言,选择 Java 或者 Scala API 来构建你的 Table API 和 SQL 程序: + +```xml + + + org.apache.flink + flink-table-api-java-bridge{{< scala_version >}} + {{< version >}} + provided + + + + org.apache.flink + flink-table-api-scala-bridge{{< scala_version >}} + {{< version >}} + provided + +``` + +除此之外,如果你想在 IDE 本地运行你的程序,你需要添加下面的模块,具体用哪个取决于你使用哪个 Planner: + +```xml + + + org.apache.flink + flink-table-planner{{< scala_version >}} + {{< version >}} + provided + + + + org.apache.flink + flink-table-planner-blink{{< scala_version >}} + {{< version >}} + provided + +``` + +内部实现上,部分 table 相关的代码是用 Scala 实现的。所以,下面的依赖也需要添加到你的程序里,不管是批式还是流式的程序: + +```xml + + org.apache.flink + flink-streaming-scala{{< scala_version >}} + {{< version >}} + provided + +``` + +### 扩展依赖 + +如果你想实现[自定义格式]({{< ref "docs/dev/table/sourcesSinks" >}}#define-a-tablefactory)来解析 Kafka 数据,或者[自定义函数]({{< ref "docs/dev/table/functions/systemFunctions" >}}),下面的依赖就足够了,编译出来的 jar 文件可以直接给 SQL Client 使用: + +```xml + + org.apache.flink + flink-table-common + {{< version >}} + provided + +``` + +当前,本模块包含以下可以扩展的接口: +- `SerializationSchemaFactory` +- `DeserializationSchemaFactory` +- `ScalarFunction` +- `TableFunction` +- `AggregateFunction` + +{{< top >}} + +接下来? 
+----------------- + +* [公共概念和 API]({{< ref "docs/dev/table/common" >}}): Table API 和 SQL 公共概念以及 API。 +* [数据类型]({{< ref "docs/dev/table/types" >}}): 内置数据类型以及它们的属性 +* [流式概念]({{< ref "docs/dev/table/concepts/overview" >}}): Table API 和 SQL 中流式相关的文档,比如配置时间属性和如何处理更新结果。 +* [连接外部系统]({{< ref "docs/connectors/table/overview" >}}): 读写外部系统的连接器和格式。 +* [Table API]({{< ref "docs/dev/table/tableApi" >}}): Table API 支持的操作。 +* [SQL]({{< ref "docs/dev/table/sql/overview" >}}): SQL 支持的操作和语法。 +* [内置函数]({{< ref "docs/dev/table/functions/systemFunctions" >}}): Table API 和 SQL 中的内置函数。 +* [SQL Client]({{< ref "docs/dev/table/sqlClient" >}}): 不用编写代码就可以尝试 Flink SQL,可以直接提交 SQL 任务到集群上。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sourcesSinks.md b/docs/content.zh/docs/dev/table/sourcesSinks.md new file mode 100644 index 0000000000000..b2932f57b6932 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sourcesSinks.md @@ -0,0 +1,823 @@ +--- +title: "User-defined Sources & Sinks" +weight: 131 +type: docs +aliases: + - /dev/table/sourceSinks.html +--- + + +# User-defined Sources & Sinks + +_Dynamic tables_ are the core concept of Flink's Table & SQL API for processing both bounded and unbounded +data in a unified fashion. + +Because dynamic tables are only a logical concept, Flink does not own the data itself. Instead, the content +of a dynamic table is stored in external systems (such as databases, key-value stores, message queues) or files. + +_Dynamic sources_ and _dynamic sinks_ can be used to read and write data from and to an external system. In +the documentation, sources and sinks are often summarized under the term _connector_. + +Flink provides pre-defined connectors for Kafka, Hive, and different file systems. See the [connector section]({{< ref "docs/connectors/table/overview" >}}) +for more information about built-in table sources and sinks. + +This page focuses on how to develop a custom, user-defined connector. + +Overview +-------- + +In many cases, implementers don't need to create a new connector from scratch but would like to slightly +modify existing connectors or hook into the existing stack. In other cases, implementers would like to +create specialized connectors. + +This section helps for both kinds of use cases. It explains the general architecture of table connectors +from pure declaration in the API to runtime code that will be executed on the cluster. + +The filled arrows show how objects are transformed to other objects from one stage to the next stage during +the translation process. + +{{< img width="90%" src="/fig/table_connectors.svg" alt="Translation of table connectors" >}} + +### Metadata + +Both Table API and SQL are declarative APIs. This includes the declaration of tables. Thus, executing +a `CREATE TABLE` statement results in updated metadata in the target catalog. + +For most catalog implementations, physical data in the external system is not modified for such an +operation. Connector-specific dependencies don't have to be present in the classpath yet. The options declared +in the `WITH` clause are neither validated nor otherwise interpreted. + +The metadata for dynamic tables (created via DDL or provided by the catalog) is represented as instances +of `CatalogTable`. A table name will be resolved into a `CatalogTable` internally when necessary. 
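+
+For illustration only, such a declaration might look like the following sketch. The table name and
+schema are hypothetical and simply mirror the options referenced in the sections below; `tableEnv`
+is assumed to be an existing `TableEnvironment`.
+
+```java
+// executing the DDL only updates metadata in the target catalog; the 'custom' connector
+// and its options are neither validated nor otherwise interpreted at this point
+tableEnv.executeSql(
+    "CREATE TABLE UserScores (name STRING, score INT) " +
+    "WITH (" +
+    "  'connector' = 'custom'," +
+    "  'port' = '5022'" +
+    ")");
+```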
+ +### Planning + +When it comes to planning and optimization of the table program, a `CatalogTable` needs to be resolved +into a `DynamicTableSource` (for reading in a `SELECT` query) and `DynamicTableSink` (for writing in +an `INSERT INTO` statement). + +`DynamicTableSourceFactory` and `DynamicTableSinkFactory` provide connector-specific logic for translating +the metadata of a `CatalogTable` into instances of `DynamicTableSource` and `DynamicTableSink`. In most +of the cases, a factory's purpose is to validate options (such as `'port' = '5022'` in the example), +configure encoding/decoding formats (if required), and create a parameterized instance of the table +connector. + +By default, instances of `DynamicTableSourceFactory` and `DynamicTableSinkFactory` are discovered using +Java's [Service Provider Interfaces (SPI)](https://docs.oracle.com/javase/tutorial/sound/SPI-intro.html). The +`connector` option (such as `'connector' = 'custom'` in the example) must correspond to a valid factory +identifier. + +Although it might not be apparent in the class naming, `DynamicTableSource` and `DynamicTableSink` +can also be seen as stateful factories that eventually produce concrete runtime implementation for reading/writing +the actual data. + +The planner uses the source and sink instances to perform connector-specific bidirectional communication +until an optimal logical plan could be found. Depending on the optionally declared ability interfaces (e.g. +`SupportsProjectionPushDown` or `SupportsOverwrite`), the planner might apply changes to an instance and +thus mutate the produced runtime implementation. + +### Runtime + +Once the logical planning is complete, the planner will obtain the _runtime implementation_ from the table +connector. Runtime logic is implemented in Flink's core connector interfaces such as `InputFormat` or `SourceFunction`. + +Those interfaces are grouped by another level of abstraction as subclasses of `ScanRuntimeProvider`, +`LookupRuntimeProvider`, and `SinkRuntimeProvider`. + +For example, both `OutputFormatProvider` (providing `org.apache.flink.api.common.io.OutputFormat`) and `SinkFunctionProvider` (providing `org.apache.flink.streaming.api.functions.sink.SinkFunction`) are concrete instances of `SinkRuntimeProvider` +that the planner can handle. + +{{< top >}} + +Extension Points +---------------- + +This section explains the available interfaces for extending Flink's table connectors. + +### Dynamic Table Factories + +Dynamic table factories are used to configure a dynamic table connector for an external storage system from catalog +and session information. + +`org.apache.flink.table.factories.DynamicTableSourceFactory` can be implemented to construct a `DynamicTableSource`. + +`org.apache.flink.table.factories.DynamicTableSinkFactory` can be implemented to construct a `DynamicTableSink`. + +By default, the factory is discovered using the value of the `connector` option as the factory identifier +and Java's Service Provider Interface. + +In JAR files, references to new implementations can be added to the service file: + +`META-INF/services/org.apache.flink.table.factories.Factory` + +The framework will check for a single matching factory that is uniquely identified by factory identifier +and requested base class (e.g. `DynamicTableSourceFactory`). + +The factory discovery process can be bypassed by the catalog implementation if necessary. 
For this, a +catalog needs to return an instance that implements the requested base class in `org.apache.flink.table.catalog.Catalog#getFactory`. + +### Dynamic Table Source + +By definition, a dynamic table can change over time. + +When reading a dynamic table, the content can either be considered as: +- A changelog (finite or infinite) for which all changes are consumed continuously until the changelog + is exhausted. This is represented by the `ScanTableSource` interface. +- A continuously changing or very large external table whose content is usually never read entirely + but queried for individual values when necessary. This is represented by the `LookupTableSource` + interface. + +A class can implement both of these interfaces at the same time. The planner decides about their usage depending +on the specified query. + +#### Scan Table Source + +A `ScanTableSource` scans all rows from an external storage system during runtime. + +The scanned rows don't have to contain only insertions but can also contain updates and deletions. Thus, +the table source can be used to read a (finite or infinite) changelog. The returned _changelog mode_ indicates +the set of changes that the planner can expect during runtime. + +For regular batch scenarios, the source can emit a bounded stream of insert-only rows. + +For regular streaming scenarios, the source can emit an unbounded stream of insert-only rows. + +For change data capture (CDC) scenarios, the source can emit bounded or unbounded streams with insert, +update, and delete rows. + +A table source can implement further ability interfaces such as `SupportsProjectionPushDown` that might +mutate an instance during planning. All abilities can be found in the `org.apache.flink.table.connector.source.abilities` +package and are listed in the [source abilities table](#source-abilities). + +The runtime implementation of a `ScanTableSource` must produce internal data structures. Thus, records +must be emitted as `org.apache.flink.table.data.RowData`. The framework provides runtime converters such +that a source can still work on common data structures and perform a conversion at the end. + +#### Lookup Table Source + +A `LookupTableSource` looks up rows of an external storage system by one or more keys during runtime. + +Compared to `ScanTableSource`, the source does not have to read the entire table and can lazily fetch individual +values from a (possibly continuously changing) external table when necessary. + +Compared to `ScanTableSource`, a `LookupTableSource` does only support emitting insert-only changes currently. + +Further abilities are not supported. See the documentation of `org.apache.flink.table.connector.source.LookupTableSource` +for more information. + +The runtime implementation of a `LookupTableSource` is a `TableFunction` or `AsyncTableFunction`. The function +will be called with values for the given lookup keys during runtime. + +#### Source Abilities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    InterfaceDescription
    SupportsFilterPushDownEnables to push down the filter into the DynamicTableSource. For efficiency, a source can + push filters further down in order to be close to the actual data generation.
    SupportsLimitPushDownEnables to push down a limit (the expected maximum number of produced records) into a DynamicTableSource.
    SupportsPartitionPushDownEnables to pass available partitions to the planner and push down partitions into a DynamicTableSource. + During the runtime, the source will only read data from the passed partition list for efficiency.
    SupportsProjectionPushDown Enables to push down a (possibly nested) projection into a DynamicTableSource. For efficiency, + a source can push a projection further down in order to be close to the actual data generation. If the source + also implements SupportsReadingMetadata, the source will also read the required metadata only. +
    SupportsReadingMetadataEnables to read metadata columns from a DynamicTableSource. The source + is responsible to add the required metadata at the end of the produced rows. This includes + potentially forwarding metadata column from contained formats.
    SupportsWatermarkPushDownEnables to push down a watermark strategy into a DynamicTableSource. The watermark + strategy is a builder/factory for timestamp extraction and watermark generation. During the runtime, the + watermark generator is located inside the source and is able to generate per-partition watermarks.
    SupportsSourceWatermarkEnables to fully rely on the watermark strategy provided by the ScanTableSource + itself. Thus, a CREATE TABLE DDL is able to use SOURCE_WATERMARK() which + is a built-in marker function that will be detected by the planner and translated into a call + to this interface if available.
    + +Attention The interfaces above are currently only available for +`ScanTableSource`, not for `LookupTableSource`. + +### Dynamic Table Sink + +By definition, a dynamic table can change over time. + +When writing a dynamic table, the content can always be considered as a changelog (finite or infinite) +for which all changes are written out continuously until the changelog is exhausted. The returned _changelog mode_ +indicates the set of changes that the sink accepts during runtime. + +For regular batch scenarios, the sink can solely accept insert-only rows and write out bounded streams. + +For regular streaming scenarios, the sink can solely accept insert-only rows and can write out unbounded streams. + +For change data capture (CDC) scenarios, the sink can write out bounded or unbounded streams with insert, +update, and delete rows. + +A table sink can implement further ability interfaces such as `SupportsOverwrite` that might mutate an +instance during planning. All abilities can be found in the `org.apache.flink.table.connector.sink.abilities` +package and are listed in the [sink abilities table](#sink-abilities). + +The runtime implementation of a `DynamicTableSink` must consume internal data structures. Thus, records +must be accepted as `org.apache.flink.table.data.RowData`. The framework provides runtime converters such +that a sink can still work on common data structures and perform a conversion at the beginning. + +#### Sink Abilities + + + + + + + + + + + + + + + + + + + + + + +
    InterfaceDescription
    SupportsOverwriteEnables to overwrite existing data in a DynamicTableSink. By default, if + this interface is not implemented, existing tables or partitions cannot be overwritten using + e.g. the SQL INSERT OVERWRITE clause.
    SupportsPartitioningEnables to write partitioned data in a DynamicTableSink.
    SupportsWritingMetadataEnables to write metadata columns into a DynamicTableSource. A table sink is + responsible for accepting requested metadata columns at the end of consumed rows and persist + them. This includes potentially forwarding metadata columns to contained formats.
    + +### Encoding / Decoding Formats + +Some table connectors accept different formats that encode and decode keys and/or values. + +Formats work similar to the pattern `DynamicTableSourceFactory -> DynamicTableSource -> ScanRuntimeProvider`, +where the factory is responsible for translating options and the source is responsible for creating runtime logic. + +Because formats might be located in different modules, they are discovered using Java's Service Provider +Interface similar to [table factories](#dynamic-table-factories). In order to discover a format factory, +the dynamic table factory searches for a factory that corresponds to a factory identifier and connector-specific +base class. + +For example, the Kafka table source requires a `DeserializationSchema` as runtime interface for a decoding +format. Therefore, the Kafka table source factory uses the value of the `value.format` option to discover +a `DeserializationFormatFactory`. + +The following format factories are currently supported: + +``` +org.apache.flink.table.factories.DeserializationFormatFactory +org.apache.flink.table.factories.SerializationFormatFactory +``` + +The format factory translates the options into an `EncodingFormat` or a `DecodingFormat`. Those interfaces are +another kind of factory that produce specialized format runtime logic for the given data type. + +For example, for a Kafka table source factory, the `DeserializationFormatFactory` would return an `EncodingFormat` +that can be passed into the Kafka table source. + +{{< top >}} + +Full Stack Example +------------------ + +This section sketches how to implement a scan table source with a decoding format that supports changelog +semantics. The example illustrates how all of the mentioned components play together. It can serve as +a reference implementation. + +In particular, it shows how to +- create factories that parse and validate options, +- implement table connectors, +- implement and discover custom formats, +- and use provided utilities such as data structure converters and the `FactoryUtil`. + +The table source uses a simple single-threaded `SourceFunction` to open a socket that listens for incoming +bytes. The raw bytes are decoded into rows by a pluggable format. The format expects a changelog flag +as the first column. + +We will use most of the interfaces mentioned above to enable the following DDL: + +```sql +CREATE TABLE UserScores (name STRING, score INT) +WITH ( + 'connector' = 'socket', + 'hostname' = 'localhost', + 'port' = '9999', + 'byte-delimiter' = '10', + 'format' = 'changelog-csv', + 'changelog-csv.column-delimiter' = '|' +); +``` + +Because the format supports changelog semantics, we are able to ingest updates during runtime and create +an updating view that can continuously evaluate changing data: + +```sql +SELECT name, SUM(score) FROM UserScores GROUP BY name; +``` + +Use the following command to ingest data in a terminal: +```text +> nc -lk 9999 +INSERT|Alice|12 +INSERT|Bob|5 +DELETE|Alice|12 +INSERT|Alice|18 +``` + +### Factories + +This section illustrates how to translate metadata coming from the catalog to concrete connector instances. + +Both factories have been added to the `META-INF/services` directory. + +**`SocketDynamicTableFactory`** + +The `SocketDynamicTableFactory` translates the catalog table to a table source. Because the table source +requires a decoding format, we are discovering the format using the provided `FactoryUtil` for convenience. 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.connector.format.DecodingFormat; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.factories.DeserializationFormatFactory; +import org.apache.flink.table.factories.DynamicTableSourceFactory; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.types.DataType; + +public class SocketDynamicTableFactory implements DynamicTableSourceFactory { + + // define all options statically + public static final ConfigOption HOSTNAME = ConfigOptions.key("hostname") + .stringType() + .noDefaultValue(); + + public static final ConfigOption PORT = ConfigOptions.key("port") + .intType() + .noDefaultValue(); + + public static final ConfigOption BYTE_DELIMITER = ConfigOptions.key("byte-delimiter") + .intType() + .defaultValue(10); // corresponds to '\n' + + @Override + public String factoryIdentifier() { + return "socket"; // used for matching to `connector = '...'` + } + + @Override + public Set> requiredOptions() { + final Set> options = new HashSet<>(); + options.add(HOSTNAME); + options.add(PORT); + options.add(FactoryUtil.FORMAT); // use pre-defined option for format + return options; + } + + @Override + public Set> optionalOptions() { + final Set> options = new HashSet<>(); + options.add(BYTE_DELIMITER); + return options; + } + + @Override + public DynamicTableSource createDynamicTableSource(Context context) { + // either implement your custom validation logic here ... + // or use the provided helper utility + final FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context); + + // discover a suitable decoding format + final DecodingFormat> decodingFormat = helper.discoverDecodingFormat( + DeserializationFormatFactory.class, + FactoryUtil.FORMAT); + + // validate all options + helper.validate(); + + // get the validated options + final ReadableConfig options = helper.getOptions(); + final String hostname = options.get(HOSTNAME); + final int port = options.get(PORT); + final byte byteDelimiter = (byte) (int) options.get(BYTE_DELIMITER); + + // derive the produced data type (excluding computed columns) from the catalog table + final DataType producedDataType = context.getCatalogTable().getSchema().toPhysicalRowDataType(); + + // create and return dynamic table source + return new SocketDynamicTableSource(hostname, port, byteDelimiter, decodingFormat, producedDataType); + } +} +``` + +**`ChangelogCsvFormatFactory`** + +The `ChangelogCsvFormatFactory` translates format-specific options to a format. The `FactoryUtil` in `SocketDynamicTableFactory` +takes care of adapting the option keys accordingly and handles the prefixing like `changelog-csv.column-delimiter`. + +Because this factory implements `DeserializationFormatFactory`, it could also be used for other connectors +that support deserialization formats such as the Kafka connector. 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.connector.format.DecodingFormat; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.factories.FactoryUtil; +import org.apache.flink.table.factories.DeserializationFormatFactory; +import org.apache.flink.table.factories.DynamicTableFactory; + +public class ChangelogCsvFormatFactory implements DeserializationFormatFactory { + + // define all options statically + public static final ConfigOption COLUMN_DELIMITER = ConfigOptions.key("column-delimiter") + .stringType() + .defaultValue("|"); + + @Override + public String factoryIdentifier() { + return "changelog-csv"; + } + + @Override + public Set> requiredOptions() { + return Collections.emptySet(); + } + + @Override + public Set> optionalOptions() { + final Set> options = new HashSet<>(); + options.add(COLUMN_DELIMITER); + return options; + } + + @Override + public DecodingFormat> createDecodingFormat( + DynamicTableFactory.Context context, + ReadableConfig formatOptions) { + // either implement your custom validation logic here ... + // or use the provided helper method + FactoryUtil.validateFactoryOptions(this, formatOptions); + + // get the validated options + final String columnDelimiter = formatOptions.get(COLUMN_DELIMITER); + + // create and return the format + return new ChangelogCsvFormat(columnDelimiter); + } +} +``` + +### Table Source and Decoding Format + +This section illustrates how to translate from instances of the planning layer to runtime instances that +are shipped to the cluster. + +**`SocketDynamicTableSource`** + +The `SocketDynamicTableSource` is used during planning. In our example, we don't implement any of the +available ability interfaces. Therefore, the main logic can be found in `getScanRuntimeProvider(...)` +where we instantiate the required `SourceFunction` and its `DeserializationSchema` for runtime. Both +instances are parameterized to return internal data structures (i.e. `RowData`). 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.streaming.api.functions.source.SourceFunction; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.format.DecodingFormat; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.connector.source.SourceFunctionProvider; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; + +public class SocketDynamicTableSource implements ScanTableSource { + + private final String hostname; + private final int port; + private final byte byteDelimiter; + private final DecodingFormat> decodingFormat; + private final DataType producedDataType; + + public SocketDynamicTableSource( + String hostname, + int port, + byte byteDelimiter, + DecodingFormat> decodingFormat, + DataType producedDataType) { + this.hostname = hostname; + this.port = port; + this.byteDelimiter = byteDelimiter; + this.decodingFormat = decodingFormat; + this.producedDataType = producedDataType; + } + + @Override + public ChangelogMode getChangelogMode() { + // in our example the format decides about the changelog mode + // but it could also be the source itself + return decodingFormat.getChangelogMode(); + } + + @Override + public ScanRuntimeProvider getScanRuntimeProvider(ScanContext runtimeProviderContext) { + + // create runtime classes that are shipped to the cluster + + final DeserializationSchema deserializer = decodingFormat.createRuntimeDecoder( + runtimeProviderContext, + producedDataType); + + final SourceFunction sourceFunction = new SocketSourceFunction( + hostname, + port, + byteDelimiter, + deserializer); + + return SourceFunctionProvider.of(sourceFunction, false); + } + + @Override + public DynamicTableSource copy() { + return new SocketDynamicTableSource(hostname, port, byteDelimiter, decodingFormat, producedDataType); + } + + @Override + public String asSummaryString() { + return "Socket Table Source"; + } +} +``` + +**`ChangelogCsvFormat`** + +The `ChangelogCsvFormat` is a decoding format that uses a `DeserializationSchema` during runtime. It +supports emitting `INSERT` and `DELETE` changes. 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.table.connector.ChangelogMode; +import org.apache.flink.table.connector.format.DecodingFormat; +import org.apache.flink.table.connector.source.DynamicTableSource; +import org.apache.flink.table.connector.source.DynamicTableSource.DataStructureConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.types.RowKind; + +public class ChangelogCsvFormat implements DecodingFormat> { + + private final String columnDelimiter; + + public ChangelogCsvFormat(String columnDelimiter) { + this.columnDelimiter = columnDelimiter; + } + + @Override + @SuppressWarnings("unchecked") + public DeserializationSchema createRuntimeDecoder( + DynamicTableSource.Context context, + DataType producedDataType) { + // create type information for the DeserializationSchema + final TypeInformation producedTypeInfo = (TypeInformation) context.createTypeInformation( + producedDataType); + + // most of the code in DeserializationSchema will not work on internal data structures + // create a converter for conversion at the end + final DataStructureConverter converter = context.createDataStructureConverter(producedDataType); + + // use logical types during runtime for parsing + final List parsingTypes = producedDataType.getLogicalType().getChildren(); + + // create runtime class + return new ChangelogCsvDeserializer(parsingTypes, converter, producedTypeInfo, columnDelimiter); + } + + @Override + public ChangelogMode getChangelogMode() { + // define that this format can produce INSERT and DELETE rows + return ChangelogMode.newBuilder() + .addContainedKind(RowKind.INSERT) + .addContainedKind(RowKind.DELETE) + .build(); + } +} +``` + +### Runtime + +For completeness, this section illustrates the runtime logic for both `SourceFunction` and `DeserializationSchema`. + +**ChangelogCsvDeserializer** + +The `ChangelogCsvDeserializer` contains a simple parsing logic for converting bytes into `Row` of `Integer` +and `String` with a row kind. The final conversion step converts those into internal data structures. 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.table.connector.RuntimeConverter.Context; +import org.apache.flink.table.connector.source.DynamicTableSource.DataStructureConverter; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeRoot; +import org.apache.flink.types.Row; +import org.apache.flink.types.RowKind; + +public class ChangelogCsvDeserializer implements DeserializationSchema { + + private final List parsingTypes; + private final DataStructureConverter converter; + private final TypeInformation producedTypeInfo; + private final String columnDelimiter; + + public ChangelogCsvDeserializer( + List parsingTypes, + DataStructureConverter converter, + TypeInformation producedTypeInfo, + String columnDelimiter) { + this.parsingTypes = parsingTypes; + this.converter = converter; + this.producedTypeInfo = producedTypeInfo; + this.columnDelimiter = columnDelimiter; + } + + @Override + public TypeInformation getProducedType() { + // return the type information required by Flink's core interfaces + return producedTypeInfo; + } + + @Override + public void open(InitializationContext context) { + // converters must be open + converter.open(Context.create(ChangelogCsvDeserializer.class.getClassLoader())); + } + + @Override + public RowData deserialize(byte[] message) { + // parse the columns including a changelog flag + final String[] columns = new String(message).split(Pattern.quote(columnDelimiter)); + final RowKind kind = RowKind.valueOf(columns[0]); + final Row row = new Row(kind, parsingTypes.size()); + for (int i = 0; i < parsingTypes.size(); i++) { + row.setField(i, parse(parsingTypes.get(i).getTypeRoot(), columns[i + 1])); + } + // convert to internal data structure + return (RowData) converter.toInternal(row); + } + + private static Object parse(LogicalTypeRoot root, String value) { + switch (root) { + case INTEGER: + return Integer.parseInt(value); + case VARCHAR: + return value; + default: + throw new IllegalArgumentException(); + } + } + + @Override + public boolean isEndOfStream(RowData nextElement) { + return false; + } +} +``` + +**SocketSourceFunction** + +The `SocketSourceFunction` opens a socket and consumes bytes. It splits records by the given byte +delimiter (`\n` by default) and delegates the decoding to a pluggable `DeserializationSchema`. The +source function can only work with a parallelism of 1. 
+ +```java +import org.apache.flink.api.common.serialization.DeserializationSchema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.streaming.api.functions.source.RichSourceFunction; +import org.apache.flink.table.data.RowData; + +public class SocketSourceFunction extends RichSourceFunction implements ResultTypeQueryable { + + private final String hostname; + private final int port; + private final byte byteDelimiter; + private final DeserializationSchema deserializer; + + private volatile boolean isRunning = true; + private Socket currentSocket; + + public SocketSourceFunction(String hostname, int port, byte byteDelimiter, DeserializationSchema deserializer) { + this.hostname = hostname; + this.port = port; + this.byteDelimiter = byteDelimiter; + this.deserializer = deserializer; + } + + @Override + public TypeInformation getProducedType() { + return deserializer.getProducedType(); + } + + @Override + public void open(Configuration parameters) throws Exception { + deserializer.open(() -> getRuntimeContext().getMetricGroup()); + } + + @Override + public void run(SourceContext ctx) throws Exception { + while (isRunning) { + // open and consume from socket + try (final Socket socket = new Socket()) { + currentSocket = socket; + socket.connect(new InetSocketAddress(hostname, port), 0); + try (InputStream stream = socket.getInputStream()) { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + int b; + while ((b = stream.read()) >= 0) { + // buffer until delimiter + if (b != byteDelimiter) { + buffer.write(b); + } + // decode and emit record + else { + ctx.collect(deserializer.deserialize(buffer.toByteArray())); + buffer.reset(); + } + } + } + } catch (Throwable t) { + t.printStackTrace(); // print and continue + } + Thread.sleep(1000); + } + } + + @Override + public void cancel() { + isRunning = false; + try { + currentSocket.close(); + } catch (Throwable t) { + // ignore + } + } +} +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/_index.md b/docs/content.zh/docs/dev/table/sql/_index.md new file mode 100644 index 0000000000000..f123f26f90509 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/_index.md @@ -0,0 +1,23 @@ +--- +title: SQL +bookCollapseSection: true +weight: 32 +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/table/sql/alter.md b/docs/content.zh/docs/dev/table/sql/alter.md new file mode 100644 index 0000000000000..c968bb31e3308 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/alter.md @@ -0,0 +1,197 @@ +--- +title: "ALTER 语句" +weight: 6 +type: docs +aliases: + - /zh/dev/table/sql/alter.html +--- + + +# ALTER 语句 + + + +ALTER 语句用于修改一个已经在 [Catalog]({{< ref "docs/dev/table/catalogs" >}}) 中注册的表、视图或函数定义。 + +Flink SQL 目前支持以下 ALTER 语句: + +- ALTER TABLE +- ALTER DATABASE +- ALTER FUNCTION + +## 执行 ALTER 语句 + +{{< tabs "explain" >}} +{{< tab "Java" >}} +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 ALTER 语句。 若 ALTER 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 ALTER 语句。 +{{< /tab >}} +{{< tab "Scala" >}} +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 ALTER 语句。 若 ALTER 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 ALTER 语句。 +{{< /tab >}} +{{< tab "Python" >}} + +可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行 ALTER 语句。 若 ALTER 操作执行成功,`execute_sql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 
中执行一个 ALTER 语句。 + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +可以在 [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}) 中执行 ALTER 语句。 + +以下的例子展示了如何在 SQL CLI 中执行一个 ALTER 语句。 + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "147c58e0-44d1-4f78-b995-88b3edba7bec" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance()... +TableEnvironment tableEnv = TableEnvironment.create(settings); + +// 注册名为 “Orders” 的表 +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); + +// 字符串数组: ["Orders"] +String[] tables = tableEnv.listTables(); +// or tableEnv.executeSql("SHOW TABLES").print(); + +// 把 “Orders” 的表名改为 “NewOrders” +tableEnv.executeSql("ALTER TABLE Orders RENAME TO NewOrders;"); + +// 字符串数组:["NewOrders"] +String[] tables = tableEnv.listTables(); +// or tableEnv.executeSql("SHOW TABLES").print(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance()... +val tableEnv = TableEnvironment.create(settings) + +// 注册名为 “Orders” 的表 +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); + +// 字符串数组: ["Orders"] +val tables = tableEnv.listTables() +// or tableEnv.executeSql("SHOW TABLES").print() + +// 把 “Orders” 的表名改为 “NewOrders” +tableEnv.executeSql("ALTER TABLE Orders RENAME TO NewOrders;") + +// 字符串数组:["NewOrders"] +val tables = tableEnv.listTables() +// or tableEnv.executeSql("SHOW TABLES").print() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = TableEnvironment.create(settings) + +# 字符串数组: ["Orders"] +tables = table_env.list_tables() +# or table_env.execute_sql("SHOW TABLES").print() + +# 把 “Orders” 的表名改为 “NewOrders” +table_env.execute_sql("ALTER TABLE Orders RENAME TO NewOrders;") + +# 字符串数组:["NewOrders"] +tables = table_env.list_tables() +# or table_env.execute_sql("SHOW TABLES").print() +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...); +[INFO] Table has been created. + +Flink SQL> SHOW TABLES; +Orders + +Flink SQL> ALTER TABLE Orders RENAME TO NewOrders; +[INFO] Table has been removed. + +Flink SQL> SHOW TABLES; +NewOrders +``` +{{< /tab >}} +{{< /tabs >}} + +## ALTER TABLE + +* 重命名表 + +```sql +ALTER TABLE [catalog_name.][db_name.]table_name RENAME TO new_table_name +``` + +把原有的表名更改为新的表名。 + +* 设置或修改表属性 + +```sql +ALTER TABLE [catalog_name.][db_name.]table_name SET (key1=val1, key2=val2, ...) +``` + +为指定的表设置一个或多个属性。若个别属性已经存在于表中,则使用新的值覆盖旧的值。 + +## ALTER DATABASE + +```sql +ALTER DATABASE [catalog_name.]db_name SET (key1=val1, key2=val2, ...) 
+``` + +在数据库中设置一个或多个属性。若个别属性已经在数据库中设定,将会使用新值覆盖旧值。 + +## ALTER FUNCTION + +{% highlight sql%} +ALTER [TEMPORARY|TEMPORARY SYSTEM] FUNCTION + [IF EXISTS] [catalog_name.][db_name.]function_name + AS identifier [LANGUAGE JAVA|SCALA|PYTHON] +``` + +修改一个有 catalog 和数据库命名空间的 catalog function ,需要指定一个新的 identifier ,可指定 language tag 。若函数不存在,删除会抛出异常。 + +如果 language tag 是 JAVA 或者 SCALA ,则 identifier 是 UDF 实现类的全限定名。关于 JAVA/SCALA UDF 的实现,请参考 [自定义函数]({{< ref "docs/dev/table/functions/udfs" >}})。 + +如果 language tag 是 PYTHON , 则 identifier 是 UDF 对象的全限定名,例如 `pyflink.table.tests.test_udf.add`。关于 PYTHON UDF 的实现,请参考 [Python UDFs]({{< ref "docs/dev/python/table/udfs/python_udfs" >}})。 + +**TEMPORARY** + +修改一个有 catalog 和数据库命名空间的临时 catalog function ,并覆盖原有的 catalog function 。 + +**TEMPORARY SYSTEM** + +修改一个没有数据库命名空间的临时系统 catalog function ,并覆盖系统内置的函数。 + +**IF EXISTS** + +若函数不存在,则不进行任何操作。 + +**LANGUAGE JAVA\|SCALA\|PYTHON** + +Language tag 用于指定 Flink runtime 如何执行这个函数。目前,只支持 JAVA,SCALA 和 PYTHON,且函数的默认语言为 JAVA。 diff --git a/docs/content.zh/docs/dev/table/sql/create.md b/docs/content.zh/docs/dev/table/sql/create.md new file mode 100644 index 0000000000000..6004dbb0e3292 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/create.md @@ -0,0 +1,607 @@ +--- +title: "CREATE 语句" +weight: 4 +type: docs +aliases: + - /zh/dev/table/sql/create.html +--- + + +# CREATE 语句 + + + +CREATE 语句用于向当前或指定的 [Catalog]({{< ref "docs/dev/table/catalogs" >}}) 中注册表、视图或函数。注册后的表、视图和函数可以在 SQL 查询中使用。 + +目前 Flink SQL 支持下列 CREATE 语句: + +- CREATE TABLE +- CREATE DATABASE +- CREATE VIEW +- CREATE FUNCTION + +## 执行 CREATE 语句 + +{{< tabs "execute" >}} +{{< tab "Java" >}} + +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 CREATE 语句。 若 CREATE 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 CREATE 语句。 +{{< /tab >}} +{{< tab "Scala" >}} + +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 CREATE 语句。 若 CREATE 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 CREATE 语句。 + +{{< /tab >}} +{{< tab "Python" >}} + +可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行 CREATE 语句。 若 CREATE 操作执行成功,`execute_sql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 CREATE 语句。 + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +可以在 [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}) 中执行 CREATE 语句。 + +以下的例子展示了如何在 SQL CLI 中执行一个 CREATE 语句。 + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "43d7f18a-0f7f-4b9c-8367-d731238d4d41" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance()... +TableEnvironment tableEnv = TableEnvironment.create(settings); + +// 对已注册的表进行 SQL 查询 +// 注册名为 “Orders” 的表 +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); +// 在表上执行 SQL 查询,并把得到的结果作为一个新的表 +Table result = tableEnv.sqlQuery( + "SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); + +// 对已注册的表进行 INSERT 操作 +// 注册 TableSink +tableEnv.executeSql("CREATE TABLE RubberOrders(product STRING, amount INT) WITH (...)"); +// 在表上执行 INSERT 语句并向 TableSink 发出结果 +tableEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance()... 
+val tableEnv = TableEnvironment.create(settings) + +// 对已注册的表进行 SQL 查询 +// 注册名为 “Orders” 的表 +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); +// 在表上执行 SQL 查询,并把得到的结果作为一个新的表 +val result = tableEnv.sqlQuery( + "SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); + +// 对已注册的表进行 INSERT 操作 +// 注册 TableSink +tableEnv.executeSql("CREATE TABLE RubberOrders(product STRING, amount INT) WITH ('connector.path'='/path/to/file' ...)"); +// 在表上执行 INSERT 语句并向 TableSink 发出结果 +tableEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = TableEnvironment.create(settings) + +# 对已经注册的表进行 SQL 查询 +# 注册名为 “Orders” 的表 +table_env.execute_sql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); +# 在表上执行 SQL 查询,并把得到的结果作为一个新的表 +result = table_env.sql_query( + "SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); + +# 对已注册的表进行 INSERT 操作 +# 注册 TableSink +table_env.execute_sql("CREATE TABLE RubberOrders(product STRING, amount INT) WITH (...)") +# 在表上执行 INSERT 语句并向 TableSink 发出结果 +table_env \ + .execute_sql("INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...); +[INFO] Table has been created. + +Flink SQL> CREATE TABLE RubberOrders (product STRING, amount INT) WITH (...); +[INFO] Table has been created. + +Flink SQL> INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'; +[INFO] Submitting SQL update statement to the cluster... +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## CREATE TABLE + +```text +CREATE TABLE [IF NOT EXISTS] [catalog_name.][db_name.]table_name + ( + { | | }[ , ...n] + [ ] + [ ][ , ...n] + ) + [COMMENT table_comment] + [PARTITIONED BY (partition_column_name1, partition_column_name2, ...)] + WITH (key1=val1, key2=val2, ...) + [ LIKE source_table [( )] ] + +: + column_name column_type [ ] [COMMENT column_comment] + +: + [CONSTRAINT constraint_name] PRIMARY KEY NOT ENFORCED + +: + [CONSTRAINT constraint_name] PRIMARY KEY (column_name, ...) NOT ENFORCED + +: + column_name column_type METADATA [ FROM metadata_key ] [ VIRTUAL ] + +: + column_name AS computed_column_expression [COMMENT column_comment] + +: + WATERMARK FOR rowtime_column_name AS watermark_strategy_expression + +: + [catalog_name.][db_name.]table_name + +: +{ + { INCLUDING | EXCLUDING } { ALL | CONSTRAINTS | PARTITIONS } + | { INCLUDING | EXCLUDING | OVERWRITING } { GENERATED | OPTIONS | WATERMARKS } +}[, ...] + +``` + +根据指定的表名创建一个表,如果同名表已经在 catalog 中存在了,则无法注册。 + +### Columns + +**Physical / Regular Columns** + +Physical columns are regular columns known from databases. They define the names, the types, and the +order of fields in the physical data. Thus, physical columns represent the payload that is read from +and written to an external system. Connectors and formats use these columns (in the defined order) +to configure themselves. Other kinds of columns can be declared between physical columns but will not +influence the final physical schema. + +The following statement creates a table with only regular columns: + +```sql +CREATE TABLE MyTable ( + `user_id` BIGINT, + `name` STRING +) WITH ( + ... 
+);
+```
+
+**Metadata Columns**
+
+Metadata columns are an extension to the SQL standard and allow access to connector and/or format specific
+fields for every row of a table. A metadata column is indicated by the `METADATA` keyword. For example,
+a metadata column can be used to read and write the timestamp from and to Kafka records for time-based
+operations. The [connector and format documentation]({{< ref "docs/connectors/table/overview" >}}) lists the
+available metadata fields for every component. However, declaring a metadata column in a table's schema
+is optional.
+
+The following statement creates a table with an additional metadata column that references the metadata field `timestamp`:
+
+```sql
+CREATE TABLE MyTable (
+  `user_id` BIGINT,
+  `name` STRING,
+  `record_time` TIMESTAMP_LTZ(3) METADATA FROM 'timestamp' -- reads and writes a Kafka record's timestamp
+) WITH (
+  'connector' = 'kafka'
+  ...
+);
+```
+
+Every metadata field is identified by a string-based key and has a documented data type. For example,
+the Kafka connector exposes a metadata field with key `timestamp` and data type `TIMESTAMP_LTZ(3)`
+that can be used for both reading and writing records.
+
+In the example above, the metadata column `record_time` becomes part of the table's schema and can be
+transformed and stored like a regular column:
+
+```sql
+INSERT INTO MyTable SELECT user_id, name, record_time + INTERVAL '1' SECOND FROM MyTable;
+```
+
+For convenience, the `FROM` clause can be omitted if the column name should be used as the identifying metadata key:
+
+```sql
+CREATE TABLE MyTable (
+  `user_id` BIGINT,
+  `name` STRING,
+  `timestamp` TIMESTAMP_LTZ(3) METADATA -- use column name as metadata key
+) WITH (
+  'connector' = 'kafka'
+  ...
+);
+```
+
+For convenience, the runtime will perform an explicit cast if the data type of the column differs from
+the data type of the metadata field. Of course, this requires that the two data types are compatible.
+
+```sql
+CREATE TABLE MyTable (
+  `user_id` BIGINT,
+  `name` STRING,
+  `timestamp` BIGINT METADATA -- cast the timestamp as BIGINT
+) WITH (
+  'connector' = 'kafka'
+  ...
+);
+```
+
+By default, the planner assumes that a metadata column can be used for both reading and writing. However,
+in many cases an external system provides more read-only metadata fields than writable fields. Therefore,
+it is possible to exclude metadata columns from persisting using the `VIRTUAL` keyword.
+
+```sql
+CREATE TABLE MyTable (
+  `timestamp` BIGINT METADATA,       -- part of the query-to-sink schema
+  `offset` BIGINT METADATA VIRTUAL,  -- not part of the query-to-sink schema
+  `user_id` BIGINT,
+  `name` STRING
+) WITH (
+  'connector' = 'kafka'
+  ...
+);
+```
+
+In the example above, the `offset` is a read-only metadata column and excluded from the query-to-sink
+schema. Thus, source-to-query schema (for `SELECT`) and query-to-sink (for `INSERT INTO`) schema differ:
+
+```text
+source-to-query schema:
+MyTable(`timestamp` BIGINT, `offset` BIGINT, `user_id` BIGINT, `name` STRING)
+
+query-to-sink schema:
+MyTable(`timestamp` BIGINT, `user_id` BIGINT, `name` STRING)
+```
+
+**Computed Columns**
+
+Computed columns are virtual columns that are generated using the syntax `column_name AS computed_column_expression`.
+
+A computed column evaluates an expression that can reference other columns declared in the same table.
+Both physical columns and metadata columns can be accessed. The column itself is not physically stored
+within the table. The column's data type is derived automatically from the given expression and does
+not have to be declared manually.
+
+The planner will transform computed columns into a regular projection after the source. For optimization
+or [watermark strategy push down]({{< ref "docs/dev/table/sourcesSinks" >}}), the evaluation might be spread
+across operators, performed multiple times, or skipped if not needed for the given query.
+
+For example, a computed column could be defined as:
+```sql
+CREATE TABLE MyTable (
+  `user_id` BIGINT,
+  `price` DOUBLE,
+  `quantity` DOUBLE,
+  `cost` AS price * quantity  -- evaluate expression and supply the result to queries
+) WITH (
+  'connector' = 'kafka'
+  ...
+);
+```
+
+The expression may contain any combination of columns, constants, or functions. The expression cannot
+contain a subquery.
+
+Computed columns are commonly used in Flink for defining [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}})
+in `CREATE TABLE` statements.
+- A [processing time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}#processing-time)
+can be defined easily via `proc AS PROCTIME()` using the system's `PROCTIME()` function.
+- An [event time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}#event-time) timestamp
+can be pre-processed before the `WATERMARK` declaration. For example, the computed column can be used
+if the original field is not `TIMESTAMP(3)` type or is nested in a JSON string.
+
+Similar to virtual metadata columns, computed columns are excluded from persisting. Therefore, a computed
+column cannot be the target of an `INSERT INTO` statement. Thus, source-to-query schema (for `SELECT`)
+and query-to-sink (for `INSERT INTO`) schema differ:
+
+```text
+source-to-query schema:
+MyTable(`user_id` BIGINT, `price` DOUBLE, `quantity` DOUBLE, `cost` DOUBLE)
+
+query-to-sink schema:
+MyTable(`user_id` BIGINT, `price` DOUBLE, `quantity` DOUBLE)
+```
+
+### `WATERMARK`
+
+`WATERMARK` 定义了表的事件时间属性,其形式为 `WATERMARK FOR rowtime_column_name AS watermark_strategy_expression` 。
+
+`rowtime_column_name` 把一个现有的列定义为一个为表标记事件时间的属性。该列的类型必须为 `TIMESTAMP(3)`,且是 schema 中的顶层列,它也可以是一个计算列。
+
+`watermark_strategy_expression` 定义了 watermark 的生成策略。它允许使用包括计算列在内的任意非查询表达式来计算 watermark ;表达式的返回类型必须是 `TIMESTAMP(3)`,表示了从 Epoch 以来的经过的时间。
+返回的 watermark 只有当其不为空且其值大于之前发出的本地 watermark 时才会被发出(以保证 watermark 递增)。每条记录的 watermark 生成表达式计算都会由框架完成。
+框架会定期发出所生成的最大的 watermark ,如果当前 watermark 仍然与前一个 watermark 相同、为空、或返回的 watermark 的值小于最后一个发出的 watermark ,则新的 watermark 不会被发出。
+Watermark 根据 [`pipeline.auto-watermark-interval`]({{< ref "docs/deployment/config" >}}#pipeline-auto-watermark-interval) 中所配置的间隔发出。
+若 watermark 的间隔是 `0ms` ,那么每条记录都会产生一个 watermark,且 watermark 会在不为空并大于上一个发出的 watermark 时发出。
+
+使用事件时间语义时,表必须包含事件时间属性和 watermark 策略。
+
+Flink 提供了几种常用的 watermark 策略。
+
+- 严格递增时间戳: `WATERMARK FOR rowtime_column AS rowtime_column`。
+
+  发出到目前为止已观察到的最大时间戳的 watermark ,时间戳大于最大时间戳的行被认为没有迟到。
+
+- 递增时间戳: `WATERMARK FOR rowtime_column AS rowtime_column - INTERVAL '0.001' SECOND`。
+
+  发出到目前为止已观察到的最大时间戳减 1 的 watermark ,时间戳大于或等于最大时间戳的行被认为没有迟到。
+
+- 有界乱序时间戳: `WATERMARK FOR rowtime_column AS rowtime_column - INTERVAL 'string' timeUnit`。
+
+  发出到目前为止已观察到的最大时间戳减去指定延迟的 watermark ,例如, `WATERMARK FOR rowtime_column AS rowtime_column - INTERVAL '5' SECOND` 是一个 5 秒延迟的 watermark 策略。
+
+```sql
+CREATE TABLE Orders (
+    `user` BIGINT,
+    product STRING,
+    order_time TIMESTAMP(3),
+    WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND
+) WITH ( . . .
); +``` + + +### `PRIMARY KEY` + +主键用作 Flink 优化的一种提示信息。主键限制表明一张表或视图的某个(些)列是唯一的并且不包含 Null 值。 +主键声明的列都是非 nullable 的。因此主键可以被用作表行级别的唯一标识。 + +主键可以和列的定义一起声明,也可以独立声明为表的限制属性,不管是哪种方式,主键都不可以重复定义,否则 Flink 会报错。 + +**有效性检查** + +SQL 标准主键限制可以有两种模式:`ENFORCED` 或者 `NOT ENFORCED`。 它申明了是否输入/出数据会做合法性检查(是否唯一)。Flink 不存储数据因此只支持 `NOT ENFORCED` 模式,即不做检查,用户需要自己保证唯一性。 + +Flink 假设声明了主键的列都是不包含 Null 值的,Connector 在处理数据时需要自己保证语义正确。 + +**Notes:** 在 CREATE TABLE 语句中,创建主键会修改列的 nullable 属性,主键声明的列默认都是非 Nullable 的。 + +### `PARTITIONED BY` + +根据指定的列对已经创建的表进行分区。若表使用 filesystem sink ,则将会为每个分区创建一个目录。 + +### `WITH` Options + +表属性用于创建 table source/sink ,一般用于寻找和创建底层的连接器。 + +表达式 `key1=val1` 的键和值必须为字符串文本常量。请参考 [连接外部系统]({{< ref "docs/connectors/table/overview" >}}) 了解不同连接器所支持的属性。 + +**注意:** 表名可以为以下三种格式 1. `catalog_name.db_name.table_name` 2. `db_name.table_name` 3. `table_name`。使用`catalog_name.db_name.table_name` 的表将会与名为 "catalog_name" 的 catalog 和名为 "db_name" 的数据库一起注册到 metastore 中。使用 `db_name.table_name` 的表将会被注册到当前执行的 table environment 中的 catalog 且数据库会被命名为 "db_name";对于 `table_name`, 数据表将会被注册到当前正在运行的catalog和数据库中。 + +**注意:** 使用 `CREATE TABLE` 语句注册的表均可用作 table source 和 table sink。 在被 DML 语句引用前,我们无法决定其实际用于 source 抑或是 sink。 + +### `LIKE` + +`LIKE` 子句来源于两种 SQL 特性的变体/组合(Feature T171,“表定义中的 LIKE 语法” 和 Feature T173,“表定义中的 LIKE 语法扩展”)。LIKE 子句可以基于现有表的定义去创建新表,并且可以扩展或排除原始表中的某些部分。与 SQL 标准相反,LIKE 子句必须在 CREATE 语句中定义,并且是基于 CREATE 语句的更上层定义,这是因为 LIKE 子句可以用于定义表的多个部分,而不仅仅是 schema 部分。 + +你可以使用该子句,重用(或改写)指定的连接器配置属性或者可以向外部表添加 watermark 定义,例如可以向 Apache Hive 中定义的表添加 watermark 定义。 + +示例如下: + +```sql +CREATE TABLE Orders ( + `user` BIGINT, + product STRING, + order_time TIMESTAMP(3) +) WITH ( + 'connector' = 'kafka', + 'scan.startup.mode' = 'earliest-offset' +); + +CREATE TABLE Orders_with_watermark ( + -- 添加 watermark 定义 + WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND +) WITH ( + -- 改写 startup-mode 属性 + 'scan.startup.mode' = 'latest-offset' +) +LIKE Orders; +``` + +结果表 `Orders_with_watermark` 等效于使用以下语句创建的表: + +```sql +CREATE TABLE Orders_with_watermark ( + `user` BIGINT, + product STRING, + order_time TIMESTAMP(3), + WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + 'scan.startup.mode' = 'latest-offset' +); +``` + +表属性的合并逻辑可以用 `like options` 来控制。 + +可以控制合并的表属性如下: + +* CONSTRAINTS - 主键和唯一键约束 +* GENERATED - 计算列 +* OPTIONS - 连接器信息、格式化方式等配置项 +* PARTITIONS - 表分区信息 +* WATERMARKS - watermark 定义 + +并且有三种不同的表属性合并策略: + +* INCLUDING - 新表包含源表(source table)所有的表属性,如果和源表的表属性重复则会直接失败,例如新表和源表存在相同 key 的属性。 +* EXCLUDING - 新表不包含源表指定的任何表属性。 +* OVERWRITING - 新表包含源表的表属性,但如果出现重复项,则会用新表的表属性覆盖源表中的重复表属性,例如,两个表中都存在相同 key 的属性,则会使用当前语句中定义的 key 的属性值。 + +并且你可以使用 `INCLUDING/EXCLUDING ALL` 这种声明方式来指定使用怎样的合并策略,例如使用 `EXCLUDING ALL INCLUDING WATERMARKS`,那么代表只有源表的 WATERMARKS 属性才会被包含进新表。 + +示例如下: +```sql + +-- 存储在文件系统的源表 +CREATE TABLE Orders_in_file ( + `user` BIGINT, + product STRING, + order_time_string STRING, + order_time AS to_timestamp(order_time) + +) +PARTITIONED BY (`user`) +WITH ( + 'connector' = 'filesystem', + 'path' = '...' +); + +-- 对应存储在 kafka 的源表 +CREATE TABLE Orders_in_kafka ( + -- 添加 watermark 定义 + WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND +) WITH ( + 'connector' = 'kafka', + ... 
+) +LIKE Orders_in_file ( + -- 排除需要生成 watermark 的计算列之外的所有内容。 + -- 去除不适用于 kafka 的所有分区和文件系统的相关属性。 + EXCLUDING ALL + INCLUDING GENERATED +); +``` + +如果未提供 like 配置项(like options),默认将使用 `INCLUDING ALL OVERWRITING OPTIONS` 的合并策略。 + +**注意:** 您无法选择物理列的合并策略,当物理列进行合并时就如使用了 `INCLUDING` 策略。 + +**注意:** 源表 `source_table` 可以是一个组合 ID。您可以指定不同 catalog 或者 DB 的表作为源表: 例如,`my_catalog.my_db.MyTable` 指定了源表 `MyTable` 来源于名为 `MyCatalog` 的 catalog 和名为 `my_db` 的 DB ,`my_db.MyTable` 指定了源表 `MyTable` 来源于当前 catalog 和名为 `my_db` 的 DB。 + +{{< top >}} + +## CREATE CATALOG + +```sql +CREATE CATALOG catalog_name + WITH (key1=val1, key2=val2, ...) +``` + +Create a catalog with the given catalog properties. If a catalog with the same name already exists, an exception is thrown. + +**WITH OPTIONS** + +Catalog properties used to store extra information related to this catalog. +The key and value of expression `key1=val1` should both be string literal. + +Check out more details at [Catalogs]({{< ref "docs/dev/table/catalogs" >}}). + +{{< top >}} + +## CREATE DATABASE + +```sql +CREATE DATABASE [IF NOT EXISTS] [catalog_name.]db_name + [COMMENT database_comment] + WITH (key1=val1, key2=val2, ...) +``` + +根据给定的表属性创建数据库。若数据库中已存在同名表会抛出异常。 + +**IF NOT EXISTS** + +若数据库已经存在,则不会进行任何操作。 + +**WITH OPTIONS** + +数据库属性一般用于存储关于这个数据库额外的信息。 +表达式 `key1=val1` 中的键和值都需要是字符串文本常量。 + +{{< top >}} + +## CREATE VIEW +```sql +CREATE [TEMPORARY] VIEW [IF NOT EXISTS] [catalog_name.][db_name.]view_name + [{columnName [, columnName ]* }] [COMMENT view_comment] + AS query_expression +``` + +根据给定的 query 语句创建一个视图。若数据库中已经存在同名视图会抛出异常. + +**TEMPORARY** + +创建一个有 catalog 和数据库命名空间的临时视图,并覆盖原有的视图。 + +**IF NOT EXISTS** + +若该视图已经存在,则不会进行任何操作。 + +{{< top >}} + +## CREATE FUNCTION +{% highlight sql%} +CREATE [TEMPORARY|TEMPORARY SYSTEM] FUNCTION + [IF NOT EXISTS] [[catalog_name.]db_name.]function_name + AS identifier [LANGUAGE JAVA|SCALA|PYTHON] +``` + +创建一个有 catalog 和数据库命名空间的 catalog function ,需要指定一个 identifier ,可指定 language tag 。 若 catalog 中,已经有同名的函数注册了,则无法注册。 + +如果 language tag 是 JAVA 或者 SCALA ,则 identifier 是 UDF 实现类的全限定名。关于 JAVA/SCALA UDF 的实现,请参考 [自定义函数]({{< ref "docs/dev/table/functions/udfs" >}})。 + +如果 language tag 是 PYTHON,则 identifier 是 UDF 对象的全限定名,例如 `pyflink.table.tests.test_udf.add`。关于 PYTHON UDF 的实现,请参考 [Python UDFs]({{< ref "docs/dev/python/table/udfs/python_udfs" >}})。 + +如果 language tag 是 PYTHON,而当前程序是 Java/Scala 程序或者纯 SQL 程序,则需要[配置 Python 相关的依赖]({{< ref "docs/dev/python/dependency_management" >}}#python-dependency-in-javascala-program)。 + +**TEMPORARY** + +创建一个有 catalog 和数据库命名空间的临时 catalog function ,并覆盖原有的 catalog function 。 + +**TEMPORARY SYSTEM** + +创建一个没有数据库命名空间的临时系统 catalog function ,并覆盖系统内置的函数。 + +**IF NOT EXISTS** + +若该函数已经存在,则不会进行任何操作。 + +**LANGUAGE JAVA\|SCALA\|PYTHON** + +Language tag 用于指定 Flink runtime 如何执行这个函数。目前,只支持 JAVA, SCALA 和 PYTHON,且函数的默认语言为 JAVA。 + diff --git a/docs/content.zh/docs/dev/table/sql/describe.md b/docs/content.zh/docs/dev/table/sql/describe.md new file mode 100644 index 0000000000000..eb53a60df5f86 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/describe.md @@ -0,0 +1,229 @@ +--- +title: "DESCRIBE Statements" +weight: 8 +type: docs +aliases: + - /dev/table/sql/describe.html +--- + + +# DESCRIBE Statements + +DESCRIBE statements are used to describe the schema of a table or a view. + + +## Run a DESCRIBE statement + +{{< tabs "describe" >}} +{{< tab "Java" >}} +DESCRIBE statements can be executed with the `executeSql()` method of the `TableEnvironment`. 
The `executeSql()` method returns the schema of given table for a successful DESCRIBE operation, otherwise will throw an exception. + +The following examples show how to run a DESCRIBE statement in `TableEnvironment`. +{{< /tab >}} +{{< tab "Scala" >}} +DESCRIBE statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns the schema of given table for a successful DESCRIBE operation, otherwise will throw an exception. + +The following examples show how to run a DESCRIBE statement in `TableEnvironment`. +{{< /tab >}} +{{< tab "Python" >}} + +DESCRIBE statements can be executed with the `execute_sql()` method of the `TableEnvironment`. The `execute_sql()` method returns the schema of given table for a successful DESCRIBE operation, otherwise will throw an exception. + +The following examples show how to run a DESCRIBE statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +DESCRIBE statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run a DESCRIBE statement in SQL CLI. + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "a5de1760-e363-4b8d-9d6f-0bacb35b9dcf" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance()... +TableEnvironment tableEnv = TableEnvironment.create(settings); + +// register a table named "Orders" +tableEnv.executeSql( + "CREATE TABLE Orders (" + + " `user` BIGINT NOT NULl," + + " product VARCHAR(32)," + + " amount INT," + + " ts TIMESTAMP(3)," + + " ptime AS PROCTIME()," + + " PRIMARY KEY(`user`) NOT ENFORCED," + + " WATERMARK FOR ts AS ts - INTERVAL '1' SECONDS" + + ") with (...)"); + +// print the schema +tableEnv.executeSql("DESCRIBE Orders").print(); + +// print the schema +tableEnv.executeSql("DESC Orders").print(); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance()... +val tableEnv = TableEnvironment.create(settings) + +// register a table named "Orders" + tableEnv.executeSql( + "CREATE TABLE Orders (" + + " `user` BIGINT NOT NULl," + + " product VARCHAR(32)," + + " amount INT," + + " ts TIMESTAMP(3)," + + " ptime AS PROCTIME()," + + " PRIMARY KEY(`user`) NOT ENFORCED," + + " WATERMARK FOR ts AS ts - INTERVAL '1' SECONDS" + + ") with (...)") + +// print the schema +tableEnv.executeSql("DESCRIBE Orders").print() + +// print the schema +tableEnv.executeSql("DESC Orders").print() +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = TableEnvironment.create(settings) + +# register a table named "Orders" +table_env.execute_sql( \ + "CREATE TABLE Orders (" + " `user` BIGINT NOT NULl," + " product VARCHAR(32)," + " amount INT," + " ts TIMESTAMP(3)," + " ptime AS PROCTIME()," + " PRIMARY KEY(`user`) NOT ENFORCED," + " WATERMARK FOR ts AS ts - INTERVAL '1' SECONDS" + ") with (...)"); + +# print the schema +table_env.execute_sql("DESCRIBE Orders").print() + +# print the schema +table_env.execute_sql("DESC Orders").print() +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> CREATE TABLE Orders ( +> `user` BIGINT NOT NULl, +> product VARCHAR(32), +> amount INT, +> ts TIMESTAMP(3), +> ptime AS PROCTIME(), +> PRIMARY KEY(`user`) NOT ENFORCED, +> WATERMARK FOR ts AS ts - INTERVAL '1' SECONDS +> ) with ( +> ... +> ); +[INFO] Table has been created. 
+ +Flink SQL> DESCRIBE Orders; + +Flink SQL> DESC Orders; +``` +{{< /tab >}} +{{< /tabs >}} + +The result of the above example is: +{{< tabs "c20da697-b9fc-434b-b7e5-3b51510eee5b" >}} +{{< tab "Java" >}} +```text + ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| name | type | null | key | computed column | watermark | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| user | BIGINT | false | PRI(user) | | | +| product | VARCHAR(32) | true | | | | +| amount | INT | true | | | | +| ts | TIMESTAMP(3) *ROWTIME* | true | | | `ts` - INTERVAL '1' SECOND | +| ptime | TIMESTAMP(3) NOT NULL *PROCTIME* | false | | PROCTIME() | | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +5 rows in set + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```text + ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| name | type | null | key | computed column | watermark | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| user | BIGINT | false | PRI(user) | | | +| product | VARCHAR(32) | true | | | | +| amount | INT | true | | | | +| ts | TIMESTAMP(3) *ROWTIME* | true | | | `ts` - INTERVAL '1' SECOND | +| ptime | TIMESTAMP(3) NOT NULL *PROCTIME* | false | | PROCTIME() | | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +5 rows in set + +``` +{{< /tab >}} +{{< tab "Python" >}} +```text + ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| name | type | null | key | computed column | watermark | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +| user | BIGINT | false | PRI(user) | | | +| product | VARCHAR(32) | true | | | | +| amount | INT | true | | | | +| ts | TIMESTAMP(3) *ROWTIME* | true | | | `ts` - INTERVAL '1' SECOND | +| ptime | TIMESTAMP(3) NOT NULL *PROCTIME* | false | | PROCTIME() | | ++---------+----------------------------------+-------+-----------+-----------------+----------------------------+ +5 rows in set + +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```text + +root + |-- user: BIGINT NOT NULL + |-- product: VARCHAR(32) + |-- amount: INT + |-- ts: TIMESTAMP(3) *ROWTIME* + |-- ptime: TIMESTAMP(3) NOT NULL *PROCTIME* AS PROCTIME() + |-- WATERMARK FOR ts AS `ts` - INTERVAL '1' SECOND + |-- CONSTRAINT PK_3599338 PRIMARY KEY (user) + +``` +{{< /tab >}} +{{< /tabs >}} + + +{{< top >}} + +## Syntax + +```sql +{ DESCRIBE | DESC } [catalog_name.][db_name.]table_name +``` diff --git a/docs/content.zh/docs/dev/table/sql/drop.md b/docs/content.zh/docs/dev/table/sql/drop.md new file mode 100644 index 0000000000000..64b13559bafaf --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/drop.md @@ -0,0 +1,221 @@ +--- +title: "DROP 语句" +weight: 5 +type: docs +aliases: + - /zh/dev/table/sql/drop.html +--- + + +# DROP 语句 + + + +DROP 语句用于从当前或指定的 [Catalog]({{< ref "docs/dev/table/catalogs" >}}) 中删除一个已经注册的表、视图或函数。 + +Flink SQL 目前支持以下 DROP 语句: + +- DROP TABLE +- DROP DATABASE +- DROP VIEW +- DROP FUNCTION + +## 执行 DROP 语句 + +{{< tabs "execute" >}} +{{< tab "Java" >}} +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 DROP 语句。 若 DROP 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 
中执行一个 DROP 语句。
+
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 DROP 语句。 若 DROP 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。
+
+以下的例子展示了如何在 `TableEnvironment` 中执行一个 DROP 语句。
+
+{{< /tab >}}
+{{< tab "Python" >}}
+
+可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行 DROP 语句。 若 DROP 操作执行成功,`execute_sql()` 方法返回 'OK',否则会抛出异常。
+
+以下的例子展示了如何在 `TableEnvironment` 中执行一个 DROP 语句。
+{{< /tab >}}
+{{< tab "SQL CLI" >}}
+
+可以在 [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}) 中执行 DROP 语句。
+
+以下的例子展示了如何在 SQL CLI 中执行一个 DROP 语句。
+
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< tabs "9f224244-9f9f-4af3-85b9-61225ec3fc0b" >}}
+{{< tab "Java" >}}
+```java
+EnvironmentSettings settings = EnvironmentSettings.newInstance()...
+TableEnvironment tableEnv = TableEnvironment.create(settings);
+
+// 注册名为 “Orders” 的表
+tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)");
+
+// 字符串数组: ["Orders"]
+String[] tables = tableEnv.listTables();
+// or tableEnv.executeSql("SHOW TABLES").print();
+
+// 从 catalog 删除 “Orders” 表
+tableEnv.executeSql("DROP TABLE Orders");
+
+// 空字符串数组
+String[] tables = tableEnv.listTables();
+// or tableEnv.executeSql("SHOW TABLES").print();
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val settings = EnvironmentSettings.newInstance()...
+val tableEnv = TableEnvironment.create(settings)
+
+// 注册名为 “Orders” 的表
+tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)")
+
+// 字符串数组: ["Orders"]
+val tables = tableEnv.listTables()
+// or tableEnv.executeSql("SHOW TABLES").print()
+
+// 从 catalog 删除 “Orders” 表
+tableEnv.executeSql("DROP TABLE Orders")
+
+// 空字符串数组
+val tables = tableEnv.listTables()
+// or tableEnv.executeSql("SHOW TABLES").print()
+```
+{{< /tab >}}
+{{< tab "Python" >}}
+```python
+settings = EnvironmentSettings.new_instance()...
+table_env = TableEnvironment.create(settings)
+
+# 注册名为 “Orders” 的表
+table_env.execute_sql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)")
+
+# 字符串数组: ["Orders"]
+tables = table_env.list_tables()
+# or table_env.execute_sql("SHOW TABLES").print()
+
+# 从 catalog 删除 “Orders” 表
+table_env.execute_sql("DROP TABLE Orders")
+
+# 空字符串数组
+tables = table_env.list_tables()
+# or table_env.execute_sql("SHOW TABLES").print()
+```
+{{< /tab >}}
+{{< tab "SQL CLI" >}}
+```sql
+Flink SQL> CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...);
+[INFO] Table has been created.
+
+Flink SQL> SHOW TABLES;
+Orders
+
+Flink SQL> DROP TABLE Orders;
+[INFO] Table has been removed.
+
+Flink SQL> SHOW TABLES;
+[INFO] Result was empty.
+``` +{{< /tab >}} +{{< /tabs >}} + +## DROP TABLE + +```sql +DROP TABLE [IF EXISTS] [catalog_name.][db_name.]table_name +``` + +根据给定的表名删除某个表。若需要删除的表不存在,则抛出异常。 + +**IF EXISTS** + +表不存在时不会进行任何操作。 + +## DROP DATABASE + +```sql +DROP DATABASE [IF EXISTS] [catalog_name.]db_name [ (RESTRICT | CASCADE) ] +``` + +根据给定的表名删除数据库。若需要删除的数据库不存在会抛出异常 。 + +**IF EXISTS** + +若数据库不存在,不执行任何操作。 + +**RESTRICT** + +当删除一个非空数据库时,会触发异常。(默认为开) + +**CASCADE** + +删除一个非空数据库时,把相关联的表与函数一并删除。 + +## DROP VIEW + +```sql +DROP [TEMPORARY] VIEW [IF EXISTS] [catalog_name.][db_name.]view_name +``` + +删除一个有 catalog 和数据库命名空间的视图。若需要删除的视图不存在,则会产生异常。 + +**TEMPORARY** + +删除一个有 catalog 和数据库命名空间的临时视图。 + +**IF EXISTS** + +若视图不存在,则不会进行任何操作。 + +**依赖管理** +Flink 没有使用 CASCADE / RESTRICT 关键字来维护视图的依赖关系,当前的方案是在用户使用视图时再提示错误信息,比如在视图的底层表已经被删除等场景。 + +## DROP FUNCTION + +{% highlight sql%} +DROP [TEMPORARY|TEMPORARY SYSTEM] FUNCTION [IF EXISTS] [catalog_name.][db_name.]function_name; +``` + +删除一个有 catalog 和数据库命名空间的 catalog function。若需要删除的函数不存在,则会产生异常。 + +**TEMPORARY** + +删除一个有 catalog 和数据库命名空间的临时 catalog function。 + +**TEMPORARY SYSTEM** + +删除一个没有数据库命名空间的临时系统函数。 + +**IF EXISTS** + +若函数不存在,则不会进行任何操作。 diff --git a/docs/content.zh/docs/dev/table/sql/explain.md b/docs/content.zh/docs/dev/table/sql/explain.md new file mode 100644 index 0000000000000..fbdc277921c67 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/explain.md @@ -0,0 +1,227 @@ +--- +title: "EXPLAIN Statements" +weight: 9 +type: docs +aliases: + - /dev/table/sql/explain.html +--- + + +# EXPLAIN Statements + + + +EXPLAIN statements are used to explain the logical and optimized query plans of a query or an INSERT statement. + +## Run an EXPLAIN statement + +{{< tabs "explain" >}} +{{< tab "Java" >}} + +EXPLAIN statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns explain result for a successful EXPLAIN operation, otherwise will throw an exception. + +The following examples show how to run an EXPLAIN statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "Scala" >}} + +EXPLAIN statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns explain result for a successful EXPLAIN operation, otherwise will throw an exception. + +The following examples show how to run an EXPLAIN statement in `TableEnvironment`. +{{< /tab >}} +{{< tab "Python" >}} + +EXPLAIN statements can be executed with the `execute_sql()` method of the `TableEnvironment`. The `execute_sql()` method returns explain result for a successful EXPLAIN operation, otherwise will throw an exception. + +The following examples show how to run an EXPLAIN statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +EXPLAIN statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run an EXPLAIN statement in SQL CLI. 
+
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< tabs "c83a84e4-4b3a-420b-9a27-94dc640dfcce" >}}
+{{< tab "Java" >}}
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);
+
+// register tables named "MyTable1" and "MyTable2"
+tEnv.executeSql("CREATE TABLE MyTable1 (`count` bigint, word VARCHAR(256)) WITH (...)");
+tEnv.executeSql("CREATE TABLE MyTable2 (`count` bigint, word VARCHAR(256)) WITH (...)");
+
+// explain SELECT statement through TableEnvironment.explainSql()
+String explanation = tEnv.explainSql(
+  "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' " +
+  "UNION ALL " +
+  "SELECT `count`, word FROM MyTable2");
+System.out.println(explanation);
+
+// explain SELECT statement through TableEnvironment.executeSql()
+TableResult tableResult = tEnv.executeSql(
+  "EXPLAIN PLAN FOR " +
+  "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' " +
+  "UNION ALL " +
+  "SELECT `count`, word FROM MyTable2");
+tableResult.print();
+
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+val tEnv = StreamTableEnvironment.create(env)
+
+// register tables named "MyTable1" and "MyTable2"
+tEnv.executeSql("CREATE TABLE MyTable1 (`count` bigint, word VARCHAR(256)) WITH (...)")
+tEnv.executeSql("CREATE TABLE MyTable2 (`count` bigint, word VARCHAR(256)) WITH (...)")
+
+// explain SELECT statement through TableEnvironment.explainSql()
+val explanation = tEnv.explainSql(
+  "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' " +
+  "UNION ALL " +
+  "SELECT `count`, word FROM MyTable2")
+println(explanation)
+
+// explain SELECT statement through TableEnvironment.executeSql()
+val tableResult = tEnv.executeSql(
+  "EXPLAIN PLAN FOR " +
+  "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' " +
+  "UNION ALL " +
+  "SELECT `count`, word FROM MyTable2")
+tableResult.print()
+
+```
+{{< /tab >}}
+{{< tab "Python" >}}
+```python
+settings = EnvironmentSettings.new_instance()...
+t_env = StreamTableEnvironment.create(env, settings)
+
+# register tables named "MyTable1" and "MyTable2"
+t_env.execute_sql("CREATE TABLE MyTable1 (`count` bigint, word VARCHAR(256)) WITH (...)")
+t_env.execute_sql("CREATE TABLE MyTable2 (`count` bigint, word VARCHAR(256)) WITH (...)")
+
+# explain SELECT statement through TableEnvironment.explain_sql()
+explanation1 = t_env.explain_sql(
+    "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' "
+    "UNION ALL "
+    "SELECT `count`, word FROM MyTable2")
+print(explanation1)
+
+# explain SELECT statement through TableEnvironment.execute_sql()
+table_result = t_env.execute_sql(
+    "EXPLAIN PLAN FOR "
+    "SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' "
+    "UNION ALL "
+    "SELECT `count`, word FROM MyTable2")
+table_result.print()
+
+```
+{{< /tab >}}
+{{< tab "SQL CLI" >}}
+```sql
+Flink SQL> CREATE TABLE MyTable1 (`count` bigint, word VARCHAR(256));
+[INFO] Table has been created.
+
+Flink SQL> CREATE TABLE MyTable2 (`count` bigint, word VARCHAR(256));
+[INFO] Table has been created.
+ +Flink SQL> EXPLAIN PLAN FOR SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%' +> UNION ALL +> SELECT `count`, word FROM MyTable2; + +``` +{{< /tab >}} +{{< /tabs >}} + +The `EXPLAIN` result is: +{{< tabs "6ee087b2-3a49-4d75-a803-f436a2166c92" >}} +{{< tab "Blink Planner" >}} +```text +== Abstract Syntax Tree == +LogicalUnion(all=[true]) + LogicalFilter(condition=[LIKE($1, _UTF-16LE'F%')]) + LogicalTableScan(table=[[default_catalog, default_database, MyTable1]], fields=[count, word]) + LogicalTableScan(table=[[default_catalog, default_database, MyTable2]], fields=[count, word]) + +== Optimized Physical Plan == +Union(all=[true], union all=[count, word]) + Calc(select=[count, word], where=[LIKE(word, _UTF-16LE'F%')]) + TableSourceScan(table=[[default_catalog, default_database, MyTable1]], fields=[count, word]) + TableSourceScan(table=[[default_catalog, default_database, MyTable2]], fields=[count, word]) + +== Optimized Execution Plan == +Union(all=[true], union all=[count, word]) + Calc(select=[count, word], where=[LIKE(word, _UTF-16LE'F%')]) + TableSourceScan(table=[[default_catalog, default_database, MyTable1]], fields=[count, word]) + TableSourceScan(table=[[default_catalog, default_database, MyTable2]], fields=[count, word]) +``` +{{< /tab >}} +{{< tab "Legacy Planner" >}} +```text +== Abstract Syntax Tree == +LogicalUnion(all=[true]) + LogicalFilter(condition=[LIKE($1, _UTF-16LE'F%')]) + FlinkLogicalTableSourceScan(table=[[default_catalog, default_database, MyTable1]], fields=[count, word]) + FlinkLogicalTableSourceScan(table=[[default_catalog, default_database, MyTable2]], fields=[count, word]) + +== Optimized Logical Plan == +DataStreamUnion(all=[true], union all=[count, word]) + DataStreamCalc(select=[count, word], where=[LIKE(word, _UTF-16LE'F%')]) + TableSourceScan(table=[[default_catalog, default_database, MyTable1]], fields=[count, word]) + TableSourceScan(table=[[default_catalog, default_database, MyTable2]], fields=[count, word]) + +== Physical Execution Plan == +Stage 1 : Data Source + content : collect elements with CollectionInputFormat + +Stage 2 : Data Source + content : collect elements with CollectionInputFormat + + Stage 3 : Operator + content : from: (count, word) + ship_strategy : REBALANCE + + Stage 4 : Operator + content : where: (LIKE(word, _UTF-16LE'F%')), select: (count, word) + ship_strategy : FORWARD + + Stage 5 : Operator + content : from: (count, word) + ship_strategy : REBALANCE +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## Syntax + +```sql +EXPLAIN PLAN FOR +``` + +For query syntax, please refer to [Queries]({{< ref "docs/dev/table/sql/queries" >}}#supported-syntax) page. +For INSERT, please refer to [INSERT]({{< ref "docs/dev/table/sql/insert" >}}) page. 
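+
+As a minimal sketch (reusing the example tables `MyTable1` and `MyTable2` registered above; the actual plan that is printed depends on the chosen connector and planner), an INSERT statement can be explained in the same way:
+
+```sql
+-- hedged example: assumes the MyTable1 and MyTable2 tables from the examples above
+EXPLAIN PLAN FOR INSERT INTO MyTable2 SELECT `count`, word FROM MyTable1 WHERE word LIKE 'F%';
+```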
diff --git a/docs/content.zh/docs/dev/table/sql/gettingStarted.md b/docs/content.zh/docs/dev/table/sql/gettingStarted.md new file mode 100644 index 0000000000000..17ee4f11bc27a --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/gettingStarted.md @@ -0,0 +1,167 @@ +--- +title: "入门" +weight: 2 +type: docs +aliases: + - /zh/dev/table/sql/gettingStarted.html +--- + + +# 入门 + +Flink SQL 使得使用标准 SQL 开发流应用程序变的简单。如果你曾经在工作中使用过兼容 ANSI-SQL 2011 的数据库或类似的 SQL 系统,那么就很容易学习 Flink。本教程将帮助你在 Flink SQL 开发环境下快速入门。 + +### 先决条件 + +你只需要具备 SQL 的基础知识即可,不需要其他编程经验。 + +### 安装 + +安装 Flink 有多种方式。对于实验而言,最常见的选择是下载二进制包并在本地运行。你可以按照[本地模式安装]({{< ref "docs/try-flink/local_installation" >}})中的步骤为本教程的剩余部分设置环境。 + +完成所有设置后,在安装文件夹中使用以下命令启动本地集群: + +```bash +./bin/start-cluster.sh +``` + +启动完成后,就可以在本地访问 Flink WebUI [localhost:8081](localhost:8081),通过它,你可以监控不同的作业。 + +### SQL 客户端 + +[SQL 客户端]({{< ref "docs/dev/table/sqlClient" >}})是一个交互式的客户端,用于向 Flink 提交 SQL 查询并将结果可视化。 +在安装文件夹中运行 `sql-client` 脚本来启动 SQL 客户端。 + + ```bash +./bin/sql-client.sh + ``` + +### Hello World + +SQL 客户端(我们的查询编辑器)启动并运行后,就可以开始编写查询了。 +让我们使用以下简单查询打印出 'Hello World': + +```sql +SELECT 'Hello World'; +``` + +运行 `HELP` 命令会列出所有支持的 SQL 语句。让我们运行一个 `SHOW` 命令,来查看 Flink [内置函数]({{< ref "docs/dev/table/functions/systemFunctions" >}})的完整列表。 + +```sql +SHOW FUNCTIONS; +``` + +这些函数为用户在开发 SQL 查询时提供了一个功能强大的工具箱。 +例如,`CURRENT_TIMESTAMP` 将在执行时打印出机器的当前系统时间。 + +```sql +SELECT CURRENT_TIMESTAMP; +``` + +--------------- + +{{< top >}} + +## Source 表 + +与所有 SQL 引擎一样,Flink 查询操作是在表上进行。与传统数据库不同,Flink 不在本地管理静态数据;相反,它的查询在外部表上连续运行。 + +Flink 数据处理流水线开始于 source 表。source 表产生在查询执行期间可以被操作的行;它们是查询时 `FROM` 子句中引用的表。这些表可能是 Kafka 的 topics,数据库,文件系统,或者任何其它 Flink 知道如何消费的系统。 + +可以通过 SQL 客户端或使用环境配置文件来定义表。SQL 客户端支持类似于传统 SQL 的 [SQL DDL 命令]({{< ref "docs/dev/table/sql/overview" >}})。标准 SQL DDL 用于[创建]({{< ref "docs/dev/table/sql/create" >}}),[修改]({{< ref "docs/dev/table/sql/alter" >}}),[删除]({{< ref "docs/dev/table/sql/drop" >}})表。 + +Flink 支持不同的[连接器]({{< ref "docs/connectors/table/overview" >}})和[格式]({{< ref "docs/connectors/table/formats/overview" >}})相结合以定义表。下面是一个示例,定义一个以 [CSV 文件]({{< ref "docs/connectors/table/formats/csv" >}})作为存储格式的 source 表,其中 `emp_id`,`name`,`dept_id` 作为 `CREATE` 表语句中的列。 + +```sql +CREATE TABLE employee_information ( + emp_id INT, + name VARCHAR, + dept_id INT +) WITH ( + 'connector' = 'filesystem', + 'path' = '/path/to/something.csv', + 'format' = 'csv' +); +``` + +可以从该表中定义一个连续查询,当新行可用时读取并立即输出它们的结果。 +例如,我们可以过滤出只在部门 `1` 中工作的员工。 + +```sql +SELECT * from employee_information WHERE DeptId = 1; +``` + +--------------- + +{{< top >}} + +## 连续查询 + +虽然最初设计时没有考虑流语义,但 SQL 是用于构建连续数据流水线的强大工具。Flink SQL 与传统数据库查询的不同之处在于,Flink SQL 持续消费到达的行并对其结果进行更新。 + +一个[连续查询]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}#continuous-queries)永远不会终止,并会产生一个动态表作为结果。[动态表]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}#continuous-queries)是 Flink 中 Table API 和 SQL 对流数据支持的核心概念。 + +连续流上的聚合需要在查询执行期间不断地存储聚合的结果。例如,假设你需要从传入的数据流中计算每个部门的员工人数。查询需要维护每个部门最新的计算总数,以便在处理新行时及时输出结果。 + + ```sql +SELECT + dept_id, + COUNT(*) as emp_count +FROM employee_information +GROUP BY dep_id; + ``` + +这样的查询被认为是 _有状态的_。Flink 的高级容错机制将维持内部状态和一致性,因此即使遇到硬件故障,查询也始终返回正确结果。 + +## Sink 表 + +当运行此查询时,SQL 客户端实时但是以只读方式提供输出。存储结果,作为报表或仪表板的数据来源,需要写到另一个表。这可以使用 `INSERT INTO` 语句来实现。本节中引用的表称为 sink 表。`INSERT INTO` 语句将作为一个独立查询被提交到 Flink 集群中。 + + ```sql +INSERT INTO department_counts + SELECT + dept_id, + COUNT(*) as emp_count +FROM employee_information; + ``` + +提交后,它将运行并将结果直接存储到 sink 表中,而不是将结果加载到系统内存中。 + +--------------- + +{{< top >}} + +## 寻求帮助! 
+ +如果你有疑惑,可以查阅[社区支持资源](https://flink.apache.org/zh/community.html)。 +特别是,Apache Flink 的[用户邮件列表](https://flink.apache.org/zh/community.html#mailing-lists)一直被评为是任何 Apache 项目中最活跃的项目之一,也是快速获得帮助的好方法。 + +## 了解更多资源 + +* [SQL]({{< ref "docs/dev/table/sql/overview" >}}):SQL 支持的操作和语法。 +* [SQL 客户端]({{< ref "docs/dev/table/sqlClient" >}}):不用编写代码就可以尝试 Flink SQL,可以直接提交 SQL 任务到集群上。 +* [概念与通用 API]({{< ref "docs/dev/table/common" >}}):Table API 和 SQL 公共概念以及 API。 +* [流式概念]({{< ref "docs/dev/table/concepts/overview" >}}):Table API 和 SQL 中流式相关的文档,比如配置时间属性和如何处理更新结果。 +* [内置函数]({{< ref "docs/dev/table/functions/systemFunctions" >}}):Table API 和 SQL 中的内置函数。 +* [连接外部系统]({{< ref "docs/connectors/table/overview" >}}):读写外部系统的连接器和格式。 + +--------------- + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/insert.md b/docs/content.zh/docs/dev/table/sql/insert.md new file mode 100644 index 0000000000000..f8cd439cca146 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/insert.md @@ -0,0 +1,265 @@ +--- +title: "INSERT 语句" +weight: 7 +type: docs +aliases: + - /zh/dev/table/sql/insert.html +--- + + +# INSERT 语句 + + + +INSERT 语句用来向表中添加行。 + +## 执行 INSERT 语句 + +{{< tabs "execute" >}} +{{< tab "Java" >}} + +单条 INSERT 语句,可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行。`executeSql()` 方法执行 INSERT 语句时会立即提交一个 Flink 作业,并且返回一个 TableResult 对象,通过该对象可以获取 JobClient 方便的操作提交的作业。 +多条 INSERT 语句,使用 `TableEnvironment` 中的 `createStatementSet` 创建一个 `StatementSet` 对象,然后使用 `StatementSet` 中的 `addInsertSql()` 方法添加多条 INSERT 语句,最后通过 `StatementSet` 中的 `execute()` 方法来执行。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一条 INSERT 语句,或者通过 `StatementSet` 执行多条 INSERT 语句。 +{{< /tab >}} +{{< tab "Scala" >}} + +单条 INSERT 语句,可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行。`executeSql()` 方法执行 INSERT 语句时会立即提交一个 Flink 作业,并且返回一个 TableResult 对象,通过该对象可以获取 JobClient 方便的操作提交的作业。 +多条 INSERT 语句,使用 `TableEnvironment` 中的 `createStatementSet` 创建一个 `StatementSet` 对象,然后使用 `StatementSet` 中的 `addInsertSql()` 方法添加多条 INSERT 语句,最后通过 `StatementSet` 中的 `execute()` 方法来执行。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一条 INSERT 语句,或者通过 `StatementSet` 执行多条 INSERT 语句。 + +{{< /tab >}} +{{< tab "Python" >}} + +单条 INSERT 语句,可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行。`execute_sql()` 方法执行 INSERT 语句时会立即提交一个 Flink 作业,并且返回一个 TableResult 对象,通过该对象可以获取 JobClient 方便的操作提交的作业。 +多条 INSERT 语句,使用 `TableEnvironment` 中的 `create_statement_set` 创建一个 `StatementSet` 对象,然后使用 `StatementSet` 中的 `add_insert_sql()` 方法添加多条 INSERT 语句,最后通过 `StatementSet` 中的 `execute()` 方法来执行。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一条 INSERT 语句,或者通过 `StatementSet` 执行多条 INSERT 语句。 + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +可以在 [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}) 中执行 INSERT 语句 + +以下的例子展示了如何在 SQL CLI 中执行一条 INSERT 语句。 + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "77ed5a01-effa-432c-b089-f922c3964c88" >}} +{{< tab "Java" >}} +```java +EnvironmentSettings settings = EnvironmentSettings.newInstance()... 
+TableEnvironment tEnv = TableEnvironment.create(settings); + +// 注册一个 "Orders" 源表,和 "RubberOrders" 结果表 +tEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product VARCHAR, amount INT) WITH (...)"); +tEnv.executeSql("CREATE TABLE RubberOrders(product VARCHAR, amount INT) WITH (...)"); + +// 运行一条 INSERT 语句,将源表的数据输出到结果表中 +TableResult tableResult1 = tEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); +// 通过 TableResult 来获取作业状态 +System.out.println(tableResult1.getJobClient().get().getJobStatus()); + +//---------------------------------------------------------------------------- +// 注册一个 "GlassOrders" 结果表用于运行多 INSERT 语句 +tEnv.executeSql("CREATE TABLE GlassOrders(product VARCHAR, amount INT) WITH (...)"); + +// 运行多条 INSERT 语句,将原表数据输出到多个结果表中 +StatementSet stmtSet = tEnv.createStatementSet(); +// `addInsertSql` 方法每次只接收单条 INSERT 语句 +stmtSet.addInsertSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); +stmtSet.addInsertSql( + "INSERT INTO GlassOrders SELECT product, amount FROM Orders WHERE product LIKE '%Glass%'"); +// 执行刚刚添加的所有 INSERT 语句 +TableResult tableResult2 = stmtSet.execute(); +// 通过 TableResult 来获取作业状态 +System.out.println(tableResult1.getJobClient().get().getJobStatus()); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val settings = EnvironmentSettings.newInstance()... +val tEnv = TableEnvironment.create(settings) + +// 注册一个 "Orders" 源表,和 "RubberOrders" 结果表 +tEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)") +tEnv.executeSql("CREATE TABLE RubberOrders(product STRING, amount INT) WITH (...)") + +// 运行一个 INSERT 语句,将源表的数据输出到结果表中 +val tableResult1 = tEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +// 通过 TableResult 来获取作业状态 +println(tableResult1.getJobClient().get().getJobStatus()) + +//---------------------------------------------------------------------------- +// 注册一个 "GlassOrders" 结果表用于运行多 INSERT 语句 +tEnv.executeSql("CREATE TABLE GlassOrders(product VARCHAR, amount INT) WITH (...)"); + +// 运行多个 INSERT 语句,将原表数据输出到多个结果表中 +val stmtSet = tEnv.createStatementSet() +// `addInsertSql` 方法每次只接收单条 INSERT 语句 +stmtSet.addInsertSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +stmtSet.addInsertSql( + "INSERT INTO GlassOrders SELECT product, amount FROM Orders WHERE product LIKE '%Glass%'") +// 执行刚刚添加的所有 INSERT 语句 +val tableResult2 = stmtSet.execute() +// 通过 TableResult 来获取作业状态 +println(tableResult1.getJobClient().get().getJobStatus()) + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... 
+table_env = TableEnvironment.create(settings) + +# 注册一个 "Orders" 源表,和 "RubberOrders" 结果表 +table_env.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)") +table_env.executeSql("CREATE TABLE RubberOrders(product STRING, amount INT) WITH (...)") + +# 运行一条 INSERT 语句,将源表的数据输出到结果表中 +table_result1 = table_env \ + .executeSql("INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +# 通过 TableResult 来获取作业状态 +print(table_result1.get_job_client().get_job_status()) + +#---------------------------------------------------------------------------- +# 注册一个 "GlassOrders" 结果表用于运行多 INSERT 语句 +table_env.execute_sql("CREATE TABLE GlassOrders(product VARCHAR, amount INT) WITH (...)") + +# 运行多条 INSERT 语句,将原表数据输出到多个结果表中 +stmt_set = table_env.create_statement_set() +# `add_insert_sql` 方法每次只接收单条 INSERT 语句 +stmt_set \ + .add_insert_sql("INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +stmt_set \ + .add_insert_sql("INSERT INTO GlassOrders SELECT product, amount FROM Orders WHERE product LIKE '%Glass%'") +# 执行刚刚添加的所有 INSERT 语句 +table_result2 = stmt_set.execute() +# 通过 TableResult 来获取作业状态 +print(table_result2.get_job_client().get_job_status()) + + +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...); +[INFO] Table has been created. + +Flink SQL> CREATE TABLE RubberOrders(product STRING, amount INT) WITH (...); + +Flink SQL> SHOW TABLES; +Orders +RubberOrders + +Flink SQL> INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'; +[INFO] Submitting SQL update statement to the cluster... +[INFO] Table update statement has been successfully submitted to the cluster: +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## 将 SELECT 查询数据插入表中 + +通过 INSERT 语句,可以将查询的结果插入到表中, + +### 语法 + +```sql + +INSERT { INTO | OVERWRITE } [catalog_name.][db_name.]table_name [PARTITION part_spec] select_statement + +part_spec: + (part_col_name1=val1 [, part_col_name2=val2, ...]) + +``` + +**OVERWRITE** + +`INSERT OVERWRITE` 将会覆盖表中或分区中的任何已存在的数据。否则,新数据会追加到表中或分区中。 + +**PARTITION** + +`PARTITION` 语句应该包含需要插入的静态分区列与值。 + +### 示例 + +```sql +-- 创建一个分区表 +CREATE TABLE country_page_view (user STRING, cnt INT, date STRING, country STRING) +PARTITIONED BY (date, country) +WITH (...) + +-- 追加行到该静态分区中 (date='2019-8-30', country='China') +INSERT INTO country_page_view PARTITION (date='2019-8-30', country='China') + SELECT user, cnt FROM page_view_source; + +-- 追加行到分区 (date, country) 中,其中 date 是静态分区 '2019-8-30';country 是动态分区,其值由每一行动态决定 +INSERT INTO country_page_view PARTITION (date='2019-8-30') + SELECT user, cnt, country FROM page_view_source; + +-- 覆盖行到静态分区 (date='2019-8-30', country='China') +INSERT OVERWRITE country_page_view PARTITION (date='2019-8-30', country='China') + SELECT user, cnt FROM page_view_source; + +-- 覆盖行到分区 (date, country) 中,其中 date 是静态分区 '2019-8-30';country 是动态分区,其值由每一行动态决定 +INSERT OVERWRITE country_page_view PARTITION (date='2019-8-30') + SELECT user, cnt, country FROM page_view_source; +``` + +## 将值插入表中 + +通过 INSERT 语句,也可以直接将值插入到表中, + +### 语法 + +```sql +INSERT { INTO | OVERWRITE } [catalog_name.][db_name.]table_name VALUES values_row [, values_row ...] 
+ +values_row: + : (val1 [, val2, ...]) +``` + +**OVERWRITE** + +`INSERT OVERWRITE` 将会覆盖表中的任何已存在的数据。否则,新数据会追加到表中。 + +### 示例 + +```sql + +CREATE TABLE students (name STRING, age INT, gpa DECIMAL(3, 2)) WITH (...); + +INSERT INTO students + VALUES ('fred flintstone', 35, 1.28), ('barney rubble', 32, 2.32); + +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/load.md b/docs/content.zh/docs/dev/table/sql/load.md new file mode 100644 index 0000000000000..adc9fe0028ba5 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/load.md @@ -0,0 +1,143 @@ +--- +title: "LOAD 语句" +weight: 12 +type: docs +aliases: + - /zh/dev/table/sql/load.html +--- + + +# LOAD Statements + +LOAD statements are used to load a built-in or user-defined module. + +## Run a LOAD statement + +{{< tabs "load statement" >}} +{{< tab "Java" >}} + +LOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise, it will throw an exception. + +The following examples show how to run a LOAD statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "Scala" >}} + +LOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise, it will throw an exception. + +The following examples show how to run a LOAD statement in `TableEnvironment`. +{{< /tab >}} +{{< tab "Python" >}} + +LOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise, it will throw an exception. + +The following examples show how to run a LOAD statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +LOAD statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run a LOAD statement in SQL CLI. + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "load modules" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + +// load a hive module +tEnv.executeSql("LOAD MODULE hive WITH ('hive-version' = '3.1.2')"); +tEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +val tEnv = StreamTableEnvironment.create(env) + +// load a hive module +tEnv.executeSql("LOAD MODULE hive WITH ('hive-version' = '3.1.2')") +tEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// | hive | +// +-------------+ + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = StreamTableEnvironment.create(env, settings) + +# load a hive module +table_env.execute_sql("LOAD MODULE hive WITH ('hive-version' = '3.1.2')") +table_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# | hive | +# +-------------+ + +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> LOAD MODULE hive WITH ('hive-version' = '3.1.2'); +[INFO] Load module succeeded! 
+ +Flink SQL> SHOW MODULES; ++-------------+ +| module name | ++-------------+ +| core | +| hive | ++-------------+ + +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## LOAD MODULE + +The following grammar gives an overview of the available syntax: +```sql +LOAD MODULE module_name [WITH ('key1' = 'val1', 'key2' = 'val2', ...)] +``` +{{< hint warning >}} +`module_name` is a simple identifier. It is case-sensitive and should be identical to the module type defined in the module factory because it is used to perform module discovery. +Properties `('key1' = 'val1', 'key2' = 'val2', ...)` is a map that contains a set of key-value pairs (except for the key `'type'`) and passed to the discovery service to instantiate the corresponding module. +{{< /hint >}} diff --git a/docs/content.zh/docs/dev/table/sql/overview.md b/docs/content.zh/docs/dev/table/sql/overview.md new file mode 100644 index 0000000000000..204eb5c86e7d3 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/overview.md @@ -0,0 +1,70 @@ +--- +title: "概览" +weight: 1 +type: docs +aliases: + - /zh/dev/table/sql/ +--- + + +# SQL + +本页面描述了 Flink 所支持的 SQL 语言,包括数据定义语言(Data Definition Language,DDL)、数据操纵语言(Data Manipulation Language,DML)以及查询语言。Flink 对 SQL 的支持基于实现了 SQL 标准的 [Apache Calcite](https://calcite.apache.org/)。 + +本页面列出了目前 Flink SQL 所支持的所有语句: + +- [SELECT (Queries)]({{< ref "docs/dev/table/sql/queries" >}}) +- [CREATE TABLE, DATABASE, VIEW, FUNCTION]({{< ref "docs/dev/table/sql/create" >}}) +- [DROP TABLE, DATABASE, VIEW, FUNCTION]({{< ref "docs/dev/table/sql/drop" >}}) +- [ALTER TABLE, DATABASE, FUNCTION]({{< ref "docs/dev/table/sql/alter" >}}) +- [INSERT]({{< ref "docs/dev/table/sql/insert" >}}) +- [SQL HINTS]({{< ref "docs/dev/table/sql/queries/hints" >}}) +- [DESCRIBE]({{< ref "docs/dev/table/sql/describe" >}}) +- [EXPLAIN]({{< ref "docs/dev/table/sql/explain" >}}) +- [USE]({{< ref "docs/dev/table/sql/use" >}}) +- [SHOW]({{< ref "docs/dev/table/sql/show" >}}) +- [LOAD]({{< ref "docs/dev/table/sql/load" >}}) +- [UNLOAD]({{< ref "docs/dev/table/sql/unload" >}}) + +## 数据类型 + +请参考专门描述该主题的页面 [数据类型]({{< ref "docs/dev/table/types" >}})。 + +通用类型与(嵌套的)复合类型 (如:POJO、tuples、rows、Scala case 类) 都可以作为行的字段。 + +复合类型的字段任意的嵌套可被 [值访问函数]({{< ref "docs/dev/table/functions/systemFunctions" >}}#value-access-functions) 访问。 + +通用类型将会被视为一个黑箱,且可以被 [用户自定义函数]({{< ref "docs/dev/table/functions/udfs" >}}) 传递或引用。 + +对于 DDL 语句而言,我们支持所有在 [数据类型]({{< ref "docs/dev/table/types" >}}) 页面中定义的数据类型。 + +**注意:** SQL查询不支持部分数据类型(cast 表达式或字符常量值)。如:`STRING`, `BYTES`, `RAW`, `TIME(p) WITHOUT TIME ZONE`, `TIME(p) WITH LOCAL TIME ZONE`, `TIMESTAMP(p) WITHOUT TIME ZONE`, `TIMESTAMP(p) WITH LOCAL TIME ZONE`, `ARRAY`, `MULTISET`, `ROW`. 
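+
+下面给出一个简单的示例(仅作示意,表名 `sample_table` 与连接器配置均为假设):在 DDL 中声明包括嵌套 `ROW` 在内的数据类型,并通过值访问函数读取嵌套字段。
+
+```sql
+-- 仅作示意:连接器及其配置在此省略
+CREATE TABLE sample_table (
+  id BIGINT,
+  name STRING,
+  scores ARRAY<DOUBLE>,
+  address ROW<city STRING, zip INT>,
+  ts TIMESTAMP(3)
+) WITH (...);
+
+-- 通过值访问函数读取嵌套字段
+SELECT id, address.city FROM sample_table;
+```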
+ +{{< top >}} + +## 保留关键字 + +虽然 SQL 的特性并未完全实现,但是一些字符串的组合却已经被预留为关键字以备未来使用。如果你希望使用以下字符串作为你的字段名,请在使用时使用反引号将该字段名包起来(如 `` `value` ``, `` `count` `` )。 + + A, ABS, ABSOLUTE, ACTION, ADA, ADD, ADMIN, AFTER, ALL, ALLOCATE, ALLOW, ALTER, ALWAYS, AND, ANY, ARE, ARRAY, AS, ASC, ASENSITIVE, ASSERTION, ASSIGNMENT, ASYMMETRIC, AT, ATOMIC, ATTRIBUTE, ATTRIBUTES, AUTHORIZATION, AVG, BEFORE, BEGIN, BERNOULLI, BETWEEN, BIGINT, BINARY, BIT, BLOB, BOOLEAN, BOTH, BREADTH, BY, BYTES, C, CALL, CALLED, CARDINALITY, CASCADE, CASCADED, CASE, CAST, CATALOG, CATALOG_NAME, CEIL, CEILING, CENTURY, CHAIN, CHAR, CHARACTER, CHARACTERISTICS, CHARACTERS, CHARACTER_LENGTH, CHARACTER_SET_CATALOG, CHARACTER_SET_NAME, CHARACTER_SET_SCHEMA, CHAR_LENGTH, CHECK, CLASS_ORIGIN, CLOB, CLOSE, COALESCE, COBOL, COLLATE, COLLATION, COLLATION_CATALOG, COLLATION_NAME, COLLATION_SCHEMA, COLLECT, COLUMN, COLUMN_NAME, COMMAND_FUNCTION, COMMAND_FUNCTION_CODE, COMMIT, COMMITTED, CONDITION, CONDITION_NUMBER, CONNECT, CONNECTION, CONNECTION_NAME, CONSTRAINT, CONSTRAINTS, CONSTRAINT_CATALOG, CONSTRAINT_NAME, CONSTRAINT_SCHEMA, CONSTRUCTOR, CONTAINS, CONTINUE, CONVERT, CORR, CORRESPONDING, COUNT, COVAR_POP, COVAR_SAMP, CREATE, CROSS, CUBE, CUME_DIST, CURRENT, CURRENT_CATALOG, CURRENT_DATE, CURRENT_DEFAULT_TRANSFORM_GROUP, CURRENT_PATH, CURRENT_ROLE, CURRENT_SCHEMA, CURRENT_TIME, CURRENT_TIMESTAMP, CURRENT_TRANSFORM_GROUP_FOR_TYPE, CURRENT_USER, CURSOR, CURSOR_NAME, CYCLE, DATA, DATABASE, DATE, DATETIME_INTERVAL_CODE, DATETIME_INTERVAL_PRECISION, DAY, DEALLOCATE, DEC, DECADE, DECIMAL, DECLARE, DEFAULT, DEFAULTS, DEFERRABLE, DEFERRED, DEFINED, DEFINER, DEGREE, DELETE, DENSE_RANK, DEPTH, DEREF, DERIVED, DESC, DESCRIBE, DESCRIPTION, DESCRIPTOR, DETERMINISTIC, DIAGNOSTICS, DISALLOW, DISCONNECT, DISPATCH, DISTINCT, DOMAIN, DOUBLE, DOW, DOY, DROP, DYNAMIC, DYNAMIC_FUNCTION, DYNAMIC_FUNCTION_CODE, EACH, ELEMENT, ELSE, END, END-EXEC, EPOCH, EQUALS, ESCAPE, EVERY, EXCEPT, EXCEPTION, EXCLUDE, EXCLUDING, EXEC, EXECUTE, EXISTS, EXP, EXPLAIN, EXTEND, EXTERNAL, EXTRACT, FALSE, FETCH, FILTER, FINAL, FIRST, FIRST_VALUE, FLOAT, FLOOR, FOLLOWING, FOR, FOREIGN, FORTRAN, FOUND, FRAC_SECOND, FREE, FROM, FULL, FUNCTION, FUSION, G, GENERAL, GENERATED, GET, GLOBAL, GO, GOTO, GRANT, GRANTED, GROUP, GROUPING, HAVING, HIERARCHY, HOLD, HOUR, IDENTITY, IMMEDIATE, IMPLEMENTATION, IMPORT, IN, INCLUDING, INCREMENT, INDICATOR, INITIALLY, INNER, INOUT, INPUT, INSENSITIVE, INSERT, INSTANCE, INSTANTIABLE, INT, INTEGER, INTERSECT, INTERSECTION, INTERVAL, INTO, INVOKER, IS, ISOLATION, JAVA, JOIN, K, KEY, KEY_MEMBER, KEY_TYPE, LABEL, LANGUAGE, LARGE, LAST, LAST_VALUE, LATERAL, LEADING, LEFT, LENGTH, LEVEL, LIBRARY, LIKE, LIMIT, LN, LOCAL, LOCALTIME, LOCALTIMESTAMP, LOCATOR, LOWER, M, MAP, MATCH, MATCHED, MAX, MAXVALUE, MEMBER, MERGE, MESSAGE_LENGTH, MESSAGE_OCTET_LENGTH, MESSAGE_TEXT, METHOD, MICROSECOND, MILLENNIUM, MIN, MINUTE, MINVALUE, MOD, MODIFIES, MODULE, MODULES, MONTH, MORE, MULTISET, MUMPS, NAME, NAMES, NATIONAL, NATURAL, NCHAR, NCLOB, NESTING, NEW, NEXT, NO, NONE, NORMALIZE, NORMALIZED, NOT, NULL, NULLABLE, NULLIF, NULLS, NUMBER, NUMERIC, OBJECT, OCTETS, OCTET_LENGTH, OF, OFFSET, OLD, ON, ONLY, OPEN, OPTION, OPTIONS, OR, ORDER, ORDERING, ORDINALITY, OTHERS, OUT, OUTER, OUTPUT, OVER, OVERLAPS, OVERLAY, OVERRIDING, PAD, PARAMETER, PARAMETER_MODE, PARAMETER_NAME, PARAMETER_ORDINAL_POSITION, PARAMETER_SPECIFIC_CATALOG, PARAMETER_SPECIFIC_NAME, PARAMETER_SPECIFIC_SCHEMA, PARTIAL, PARTITION, PASCAL, PASSTHROUGH, PATH, PERCENTILE_CONT, PERCENTILE_DISC, PERCENT_RANK, PLACING, PLAN, 
PLI, POSITION, POWER, PRECEDING, PRECISION, PREPARE, PRESERVE, PRIMARY, PRIOR, PRIVILEGES, PROCEDURE, PUBLIC, QUARTER, RANGE, RANK, RAW, READ, READS, REAL, RECURSIVE, REF, REFERENCES, REFERENCING, REGR_AVGX, REGR_AVGY, REGR_COUNT, REGR_INTERCEPT, REGR_R2, REGR_SLOPE, REGR_SXX, REGR_SXY, REGR_SYY, RELATIVE, RELEASE, REPEATABLE, RESET, RESTART, RESTRICT, RESULT, RETURN, RETURNED_CARDINALITY, RETURNED_LENGTH, RETURNED_OCTET_LENGTH, RETURNED_SQLSTATE, RETURNS, REVOKE, RIGHT, ROLE, ROLLBACK, ROLLUP, ROUTINE, ROUTINE_CATALOG, ROUTINE_NAME, ROUTINE_SCHEMA, ROW, ROWS, ROW_COUNT, ROW_NUMBER, SAVEPOINT, SCALE, SCHEMA, SCHEMA_NAME, SCOPE, SCOPE_CATALOGS, SCOPE_NAME, SCOPE_SCHEMA, SCROLL, SEARCH, SECOND, SECTION, SECURITY, SELECT, SELF, SENSITIVE, SEQUENCE, SERIALIZABLE, SERVER, SERVER_NAME, SESSION, SESSION_USER, SET, SETS, SIMILAR, SIMPLE, SIZE, SMALLINT, SOME, SOURCE, SPACE, SPECIFIC, SPECIFICTYPE, SPECIFIC_NAME, SQL, SQLEXCEPTION, SQLSTATE, SQLWARNING, SQL_TSI_DAY, SQL_TSI_FRAC_SECOND, SQL_TSI_HOUR, SQL_TSI_MICROSECOND, SQL_TSI_MINUTE, SQL_TSI_MONTH, SQL_TSI_QUARTER, SQL_TSI_SECOND, SQL_TSI_WEEK, SQL_TSI_YEAR, SQRT, START, STATE, STATEMENT, STATIC, STDDEV_POP, STDDEV_SAMP, STREAM, STRING, STRUCTURE, STYLE, SUBCLASS_ORIGIN, SUBMULTISET, SUBSTITUTE, SUBSTRING, SUM, SYMMETRIC, SYSTEM, SYSTEM_USER, TABLE, TABLESAMPLE, TABLE_NAME, TEMPORARY, THEN, TIES, TIME, TIMESTAMP, TIMESTAMPADD, TIMESTAMPDIFF, TIMEZONE_HOUR, TIMEZONE_MINUTE, TINYINT, TO, TOP_LEVEL_COUNT, TRAILING, TRANSACTION, TRANSACTIONS_ACTIVE, TRANSACTIONS_COMMITTED, TRANSACTIONS_ROLLED_BACK, TRANSFORM, TRANSFORMS, TRANSLATE, TRANSLATION, TREAT, TRIGGER, TRIGGER_CATALOG, TRIGGER_NAME, TRIGGER_SCHEMA, TRIM, TRUE, TYPE, UESCAPE, UNBOUNDED, UNCOMMITTED, UNDER, UNION, UNIQUE, UNKNOWN, UNNAMED, UNNEST, UPDATE, UPPER, UPSERT, USAGE, USER, USER_DEFINED_TYPE_CATALOG, USER_DEFINED_TYPE_CODE, USER_DEFINED_TYPE_NAME, USER_DEFINED_TYPE_SCHEMA, USING, VALUE, VALUES, VARBINARY, VARCHAR, VARYING, VAR_POP, VAR_SAMP, VERSION, VIEW, WEEK, WHEN, WHENEVER, WHERE, WIDTH_BUCKET, WINDOW, WITH, WITHIN, WITHOUT, WORK, WRAPPER, WRITE, XML, YEAR, ZONE + + +{{< top >}} + diff --git a/docs/content.zh/docs/dev/table/sql/queries/_index.md b/docs/content.zh/docs/dev/table/sql/queries/_index.md new file mode 100644 index 0000000000000..b3c1bc5c02507 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/_index.md @@ -0,0 +1,24 @@ +--- +title: "Queries 查询" +weight: 3 +type: docs +bookCollapseSection: true +--- + \ No newline at end of file diff --git a/docs/content.zh/docs/dev/table/sql/queries/deduplication.md b/docs/content.zh/docs/dev/table/sql/queries/deduplication.md new file mode 100644 index 0000000000000..78cf583081ab6 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/deduplication.md @@ -0,0 +1,76 @@ +--- +title: "去重" +weight: 16 +type: docs +--- + + +# Deduplication +{{< label Batch >}} {{< label Streaming >}} + +Deduplication removes rows that duplicate over a set of columns, keeping only the first one or the last one. In some cases, the upstream ETL jobs are not end-to-end exactly-once; this may result in duplicate records in the sink in case of failover. However, the duplicate records will affect the correctness of downstream analytical jobs - e.g. `SUM`, `COUNT` - so deduplication is needed before further analysis. + +Flink uses `ROW_NUMBER()` to remove duplicates, just like the way of Top-N query. In theory, deduplication is a special case of Top-N in which the N is one and order by the processing time or event time. 
+
+The following shows the syntax of the Deduplication statement:
+
+```sql
+SELECT [column_list]
+FROM (
+   SELECT [column_list],
+     ROW_NUMBER() OVER ([PARTITION BY col1[, col2...]]
+       ORDER BY time_attr [asc|desc]) AS rownum
+   FROM table_name)
+WHERE rownum = 1
+```
+
+**Parameter Specification:**
+
+- `ROW_NUMBER()`: Assigns a unique, sequential number to each row, starting with one.
+- `PARTITION BY col1[, col2...]`: Specifies the partition columns, i.e. the deduplicate key.
+- `ORDER BY time_attr [asc|desc]`: Specifies the ordering column; it must be a [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}). Currently Flink supports [processing time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}#processing-time) and [event time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}#event-time). Ordering by ASC means keeping the first row, ordering by DESC means keeping the last row.
+- `WHERE rownum = 1`: The `rownum = 1` is required for Flink to recognize this query is deduplication.
+
+{{< hint info >}}
+Note: the above pattern must be followed exactly, otherwise the optimizer won’t be able to translate the query.
+{{< /hint >}}
+
+The following examples show how to specify SQL queries with Deduplication on streaming tables.
+
+```sql
+CREATE TABLE Orders (
+  order_id    STRING,
+  `user`      STRING,
+  product     STRING,
+  num         BIGINT,
+  proctime AS PROCTIME()
+) WITH (...);
+
+-- remove duplicate rows on order_id and keep the first occurrence row,
+-- because there shouldn't be two orders with the same order_id.
+SELECT order_id, `user`, product, num
+FROM (
+  SELECT *,
+    ROW_NUMBER() OVER (PARTITION BY order_id ORDER BY proctime ASC) AS row_num
+  FROM Orders)
+WHERE row_num = 1
+```
+
+{{< top >}}
diff --git a/docs/content.zh/docs/dev/table/sql/queries/group-agg.md b/docs/content.zh/docs/dev/table/sql/queries/group-agg.md
new file mode 100644
index 0000000000000..70926381856cd
--- /dev/null
+++ b/docs/content.zh/docs/dev/table/sql/queries/group-agg.md
@@ -0,0 +1,162 @@
+---
+title: "分组聚合"
+weight: 8
+type: docs
+---
+
+
+# Group Aggregation
+{{< label Batch >}} {{< label Streaming >}}
+
+Like most data systems, Apache Flink supports aggregate functions, both built-in and user-defined. [User-defined functions]({{< ref "docs/dev/table/functions/udfs" >}}) must be registered in a catalog before use.
+
+An aggregate function computes a single result from multiple input rows. For example, there are aggregates to compute the `COUNT`, `SUM`, `AVG` (average), `MAX` (maximum) and `MIN` (minimum) over a set of rows.
+
+```sql
+SELECT COUNT(*) FROM Orders
+```
+
+For streaming queries, it is important to understand that Flink runs continuous queries that never terminate. Instead, they update their result table according to the updates on their input tables. For the above query, Flink will output an updated count each time a new row is inserted into the `Orders` table.
+
+Apache Flink supports the standard `GROUP BY` clause for aggregating data.
+
+```sql
+SELECT COUNT(*)
+FROM Orders
+GROUP BY order_id
+```
+
+For streaming queries, the required state for computing the query result might grow infinitely. State size depends on the number of groups and the number and type of aggregation functions. For example `MIN`/`MAX` are heavy on state size while `COUNT` is cheap. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result.
See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details. + +Apache Flink provides a set of performance tuning ways for Group Aggregation, see more [Performance Tuning]({{< ref "docs/dev/table/tuning" >}}). + +## DISTINCT Aggregation + +Distinct aggregates remove duplicate values before applying an aggregation function. The following example counts the number of distinct order_ids instead of the total number of rows in the Orders table. + +```sql +SELECT COUNT(DISTINCT order_id) FROM Orders +``` + +For streaming queries, the required state for computing the query result might grow infinitely. State size is mostly depends on the number of distinct rows and the time that a group is maintained, short lived group by windows are not a problem. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details. + +## GROUPING SETS + +Grouping sets allow for more complex grouping operations than those describable by a standard `GROUP BY`. Rows are grouped separately by each specified grouping set and aggregates are computed for each group just as for simple `GROUP BY` clauses. + +```sql +SELECT supplier_id, rating, COUNT(*) AS total +FROM (VALUES + ('supplier1', 'product1', 4), + ('supplier1', 'product2', 3), + ('supplier2', 'product3', 3), + ('supplier2', 'product4', 4)) +AS Products(supplier_id, product_id, rating) +GROUP BY GROUPING SETS ((supplier_id, rating), (supplier_id), ()) +``` + +Results: + +``` ++-------------+--------+-------+ +| supplier_id | rating | total | ++-------------+--------+-------+ +| supplier1 | 4 | 1 | +| supplier1 | (NULL) | 2 | +| (NULL) | (NULL) | 4 | +| supplier1 | 3 | 1 | +| supplier2 | 3 | 1 | +| supplier2 | (NULL) | 2 | +| supplier2 | 4 | 1 | ++-------------+--------+-------+ +``` + +Each sublist of `GROUPING SETS` may specify zero or more columns or expressions and is interpreted the same way as though it was used directly in the `GROUP BY` clause. An empty grouping set means that all rows are aggregated down to a single group, which is output even if no input rows were present. + +References to the grouping columns or expressions are replaced by null values in result rows for grouping sets in which those columns do not appear. + +For streaming queries, the required state for computing the query result might grow infinitely. State size depends on number of group sets and type of aggregation functions. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details. + +### ROLLUP + +`ROLLUP` is a shorthand notation for specifying a common type of grouping set. It represents the given list of expressions and all prefixes of the list, including the empty list. + +For example, the following query is equivalent to the one above. + +```sql +SELECT supplier_id, rating, COUNT(*) +FROM (VALUES + ('supplier1', 'product1', 4), + ('supplier1', 'product2', 3), + ('supplier2', 'product3', 3), + ('supplier2', 'product4', 4)) +AS Products(supplier_id, product_id, rating) +GROUP BY ROLLUP (supplier_id, rating) +``` + +### CUBE + +`CUBE` is a shorthand notation for specifying a common type of grouping set. 
It represents the given list and all of its possible subsets - the power set. + +For example, the following two queries are equivalent. + +```sql +SELECT supplier_id, rating, product_id, COUNT(*) +FROM (VALUES + ('supplier1', 'product1', 4), + ('supplier1', 'product2', 3), + ('supplier2', 'product3', 3), + ('supplier2', 'product4', 4)) +AS Products(supplier_id, product_id, rating) +GROUP BY CUBE (supplier_id, rating, product_id) + +SELECT supplier_id, rating, product_id, COUNT(*) +FROM (VALUES + ('supplier1', 'product1', 4), + ('supplier1', 'product2', 3), + ('supplier2', 'product3', 3), + ('supplier2', 'product4', 4)) +AS Products(supplier_id, product_id, rating) +GROUP BY GROUPING SET ( + ( supplier_id, product_id, rating ), + ( supplier_id, product_id ), + ( supplier_id, rating ), + ( supplier_id ), + ( product_id, rating ), + ( product_id ), + ( rating ), + ( ) +) +``` + +## HAVING + +`HAVING` eliminates group rows that do not satisfy the condition. `HAVING` is different from `WHERE`: `WHERE` filters individual rows before the `GROUP BY` while `HAVING` filters group rows created by `GROUP BY`. Each column referenced in condition must unambiguously reference a grouping column unless it appears within an aggregate function. + +```sql +SELECT SUM(amount) +FROM Orders +GROUP BY users +HAVING SUM(amount) > 50 +``` + +The presence of `HAVING` turns a query into a grouped query even if there is no `GROUP BY` clause. It is the same as what happens when the query contains aggregate functions but no `GROUP BY` clause. The query considers all selected rows to form a single group, and the `SELECT` list and `HAVING` clause can only reference table columns from within aggregate functions. Such a query will emit a single row if the `HAVING` condition is true, zero rows if it is not true. 
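+
+As a small sketch of this behavior (assuming the same `Orders` table as in the examples above), the following query has no `GROUP BY` clause but is still treated as a grouped query; it emits a single row with the overall sum when the condition holds and no row otherwise:
+
+```sql
+-- all selected rows form one implicit group
+SELECT SUM(amount)
+FROM Orders
+HAVING SUM(amount) > 50
+```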
+ +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/hints.md b/docs/content.zh/docs/dev/table/sql/queries/hints.md new file mode 100644 index 0000000000000..7b630ef5262ed --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/hints.md @@ -0,0 +1,84 @@ +--- +title: "Hints" +weight: 2 +type: docs +aliases: + - /zh/dev/table/sql/hints.html +--- + + +# Hints + +{{< label Batch >}} {{< label Streaming >}} + + +SQL hints 是和 SQL 语句一起使用来改变执行计划的。本章介绍如何使用 SQL hints 增强各种方法。 + +SQL hints 一般可以用于以下: + +- 增强 planner:没有完美的 planner,所以实现 SQL hints 让用户更好地控制执行是非常有意义的; +- 增加元数据(或者统计信息):如"已扫描的表索引"和"一些混洗键(shuffle keys)的倾斜信息"的一些统计数据对于查询来说是动态的,用 hints 来配置它们会非常方便,因为我们从 planner 获得的计划元数据通常不那么准确; +- 算子(Operator)资源约束:在许多情况下,我们会为执行算子提供默认的资源配置,即最小并行度或托管内存(UDF 资源消耗)或特殊资源需求(GPU 或 SSD 磁盘)等,可以使用 SQL hints 非常灵活地为每个查询(非作业)配置资源。 + + +## 动态表(Dynamic Table)选项 +动态表选项允许动态地指定或覆盖表选项,不同于用 SQL DDL 或 连接 API 定义的静态表选项,这些选项可以在每个查询的每个表范围内灵活地指定。 + +因此,它非常适合用于交互式终端中的特定查询,例如,在 SQL-CLI 中,你可以通过添加动态选项`/*+ OPTIONS('csv.ignore-parse-errors'='true') */`来指定忽略 CSV 源的解析错误。 + +注意:动态表选项默认值禁止使用,因为它可能会更改查询的语义。你需要将配置项 `table.dynamic-table-options.enabled` 显式设置为 `true`(默认值为 false),请参阅 [Configuration]({{< ref "docs/dev/table/config" >}}) 了解有关如何设置配置选项的详细信息。 + + +### 语法 +为了不破坏 SQL 兼容性,我们使用 Oracle 风格的 SQL hints 语法: +```sql +table_path /*+ OPTIONS(key=val [, key=val]*) */ + +key: + stringLiteral +val: + stringLiteral + +``` + + +### 示例 + +```sql + +CREATE TABLE kafka_table1 (id BIGINT, name STRING, age INT) WITH (...); +CREATE TABLE kafka_table2 (id BIGINT, name STRING, age INT) WITH (...); + +-- 覆盖查询语句中源表的选项 +select id, name from kafka_table1 /*+ OPTIONS('scan.startup.mode'='earliest-offset') */; + +-- 覆盖 join 中源表的选项 +select * from + kafka_table1 /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ t1 + join + kafka_table2 /*+ OPTIONS('scan.startup.mode'='earliest-offset') */ t2 + on t1.id = t2.id; + +-- 覆盖插入语句中结果表的选项 +insert into kafka_table1 /*+ OPTIONS('sink.partitioner'='round-robin') */ select * from kafka_table2; + +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/joins.md b/docs/content.zh/docs/dev/table/sql/queries/joins.md new file mode 100644 index 0000000000000..005b602dd72ea --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/joins.md @@ -0,0 +1,336 @@ +--- +title: "Join" +weight: 10 +type: docs +aliases: + - /dev/table/streaming/joins.html +--- + + +# Joins + +{{< label Batch >}} {{< label Streaming >}} + +Flink SQL supports complex and flexible join operations over dynamic tables. +There are several different types of joins to account for the wide variety of semantics queries may require. + +By default, the order of joins is not optimized. Tables are joined in the order in which they are specified in the `FROM` clause. You can tweak the performance of your join queries, by listing the tables with the lowest update frequency first and the tables with the highest update frequency last. Make sure to specify tables in an order that does not yield a cross join (Cartesian product), which are not supported and would cause a query to fail. + +Regular Joins +------------- + +Regular joins are the most generic type of join in which any new record, or changes to either side of the join, are visible and affect the entirety of the join result. +For example, if there is a new record on the left side, it will be joined with all the previous and future records on the right side. 
+ +```sql +SELECT * FROM Orders +INNER JOIN Product +ON Orders.productId = Product.id +``` + +For streaming queries, the grammar of regular joins is the most flexible and allow for any kind of updating (insert, update, delete) input table. +However, this operation has important operational implications: it requires to keep both sides of the join input in Flink state forever. +Thus, the required state for computing the query result might grow infinitely depending on the number of distinct input rows of all input tables and intermediate join results. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details. + +{{< query_state_warning >}} + +### INNER Equi-JOIN + +Returns a simple Cartesian product restricted by the join condition. Currently, only equi-joins are supported, i.e., joins that have at least one conjunctive condition with an equality predicate. Arbitrary cross or theta joins are not supported. + +```sql +SELECT * +FROM Orders +INNER JOIN Product +ON Orders.product_id = Product.id +``` + +### OUTER Equi-JOIN + +Returns all rows in the qualified Cartesian product (i.e., all combined rows that pass its join condition), plus one copy of each row in an outer table for which the join condition did not match with any row of the other table. Flink supports LEFT, RIGHT, and FULL outer joins. Currently, only equi-joins are supported, i.e., joins with at least one conjunctive condition with an equality predicate. Arbitrary cross or theta joins are not supported. + +```sql +SELECT * +FROM Orders +LEFT JOIN Product +ON Orders.product_id = Product.id + +SELECT * +FROM Orders +RIGHT JOIN Product +ON Orders.product_id = Product.id + +SELECT * +FROM Orders +FULL OUTER JOIN Product +ON Orders.product_id = Product.id +``` + +Interval Joins +-------------- + +Returns a simple Cartesian product restricted by the join condition and a time constraint. An interval join requires at least one equi-join predicate and a join condition that bounds the time on both sides. Two appropriate range predicates can define such a condition (<, <=, >=, >), a BETWEEN predicate, or a single equality predicate that compares [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) of the same type (i.e., processing time or event time) of both input tables. + +For example, this query will join all orders with their corresponding shipments if the order was shipped four hours after the order was received. + +```sql +SELECT * +FROM Orders o, Shipments s +WHERE o.id = s.order_id +AND o.order_time BETWEEN s.ship_time - INTERVAL '4' HOUR AND s.ship_time +``` + +The following predicates are examples of valid interval join conditions: + +- `ltime = rtime` +- `ltime >= rtime AND ltime < rtime + INTERVAL '10' MINUTE` +- `ltime BETWEEN rtime - INTERVAL '10' SECOND AND rtime + INTERVAL '5' SECOND` + +For streaming queries, compared to the regular join, interval join only supports append-only tables with time attributes. +Since time attributes are quasi-monotonic increasing, Flink can remove old values from its state without affecting the correctness of the result. + +Temporal Joins +-------------- + +### Event Time Temporal Join + +Temporal joins allow joining against a [versioned table]({{< ref "docs/dev/table/concepts/versioned_tables" >}}). 
+This means a table can be enriched with changing metadata and retrieve its value at a certain point in time.
+
+Temporal joins take an arbitrary table (left input/probe side) and correlate each row to the corresponding row's relevant version in the versioned table (right input/build side).
+Flink uses the SQL syntax of `FOR SYSTEM_TIME AS OF` from the SQL:2011 standard to perform this operation.
+The syntax of a temporal join is as follows:
+
+```sql
+SELECT [column_list]
+FROM table1 [AS <alias1>]
+[LEFT] JOIN table2 FOR SYSTEM_TIME AS OF table1.{ proctime | rowtime } [AS <alias2>]
+ON table1.column-name1 = table2.column-name1
+```
+
+With an event-time attribute (i.e., a rowtime attribute), it is possible to retrieve the value of a key as it was at some point in the past.
+This allows for joining the two tables at a common point in time.
+The versioned table will store all versions - identified by time - since the last watermark.
+
+For example, suppose we have a table of orders, each with prices in different currencies.
+To properly normalize this table to a single currency, such as USD, each order needs to be joined with the proper currency conversion rate from the point-in-time when the order was placed.
+
+```sql
+-- Create a table of orders. This is a standard
+-- append-only dynamic table.
+CREATE TABLE orders (
+    order_id    STRING,
+    price       DECIMAL(32,2),
+    currency    STRING,
+    order_time  TIMESTAMP(3),
+    WATERMARK FOR order_time AS order_time
+) WITH (/* ... */);
+
+-- Define a versioned table of currency rates.
+-- This could be from a change-data-capture
+-- such as Debezium, a compacted Kafka topic, or any other
+-- way of defining a versioned table.
+CREATE TABLE currency_rates (
+    currency STRING,
+    conversion_rate DECIMAL(32, 2),
+    update_time TIMESTAMP(3) METADATA FROM `values.source.timestamp` VIRTUAL,
+    WATERMARK FOR update_time AS update_time
+) WITH (
+   'connector' = 'upsert-kafka',
+   /* ... */
+);
+
+SELECT
+     order_id,
+     price,
+     currency,
+     conversion_rate,
+     order_time
+FROM orders
+LEFT JOIN currency_rates FOR SYSTEM_TIME AS OF orders.order_time
+ON orders.currency = currency_rates.currency;
+
+-- The result looks like:
+
+order_id  price  currency  conversion_rate  order_time
+========  =====  ========  ===============  ==========
+o_001     11.11  EUR       1.14             12:00:00
+o_002     12.51  EUR       1.10             12:06:00
+
+```
+
+**Note:** The event-time temporal join is triggered by a watermark from the left and right sides; please ensure both sides of the join have set watermarks correctly.
+
+**Note:** The event-time temporal join requires the primary key to be contained in the equivalence condition of the temporal join, e.g., the primary key `P.product_id` of table `product_changelog` must be constrained in the condition `O.product_id = P.product_id`.
+
+In contrast to [regular joins](#regular-joins), the previous temporal table results will not be affected despite the changes on the build side.
+Compared to [interval joins](#interval-joins), temporal table joins do not define a time window within which the records will be joined.
+Records from the probe side are always joined with the build side's version at the time specified by the time attribute. Thus, rows on the build side might be arbitrarily old.
+As time passes, no longer needed versions of the record (for the given primary key) will be removed from the state.
+
+### Processing Time Temporal Join
+
+A processing time temporal table join uses a processing-time attribute to correlate rows to the latest version of a key in an external versioned table.
+ +By definition, with a processing-time attribute, the join will always return the most up-to-date value for a given key. One can think of a lookup table as a simple HashMap that stores all the records from the build side. +The power of this join is it allows Flink to work directly against external systems when it is not feasible to materialize the table as a dynamic table within Flink. + +The following processing-time temporal table join example shows an append-only table `orders` that should be joined with the table `LatestRates`. +`LatestRates` is a dimension table (e.g. HBase table) that is materialized with the latest rate. At time `10:15`, `10:30`, `10:52`, the content of `LatestRates` looks as follows: + +```sql +10:15> SELECT * FROM LatestRates; + +currency rate +======== ====== +US Dollar 102 +Euro 114 +Yen 1 + +10:30> SELECT * FROM LatestRates; + +currency rate +======== ====== +US Dollar 102 +Euro 114 +Yen 1 + +10:52> SELECT * FROM LatestRates; + +currency rate +======== ====== +US Dollar 102 +Euro 116 <==== changed from 114 to 116 +Yen 1 +``` + +The content of `LastestRates` at times `10:15` and `10:30` are equal. +The Euro rate has changed from 114 to 116 at `10:52`. + +`Orders` is an append-only table representing payments for the given `amount` and the given `currency`. +For example, at `10:15` there was an order for an amount of `2 Euro`. + +```sql +SELECT * FROM Orders; + +amount currency +====== ========= + 2 Euro <== arrived at time 10:15 + 1 US Dollar <== arrived at time 10:30 + 2 Euro <== arrived at time 10:52 +``` + +Given these tables, we would like to calculate all `Orders` converted to a common currency. + +```text +amount currency rate amount*rate +====== ========= ======= ============ + 2 Euro 114 228 <== arrived at time 10:15 + 1 US Dollar 102 102 <== arrived at time 10:30 + 2 Euro 116 232 <== arrived at time 10:52 +``` + + +With the help of temporal table join, we can express such a query in SQL as: + +```sql +SELECT + o.amount, o.currency, r.rate, o.amount * r.rate +FROM + Orders AS o + JOIN LatestRates FOR SYSTEM_TIME AS OF o.proctime AS r + ON r.currency = o.currency +``` + +Each record from the probe side will be joined with the current version of the build side table. +In our example, the query uses the processing-time notion, so a newly appended order would always be joined with the most recent version of `LatestRates` when executing the operation. + +The result is not deterministic for processing-time. +The processing-time temporal join is most often used to enrich the stream with an external table (i.e., dimension table). + +In contrast to [regular joins](#regular-joins), the previous temporal table results will not be affected despite the changes on the build side. +Compared to [interval joins](#interval-joins), temporal table joins do not define a time window within which the records join, i.e., old rows are not stored in state. + +Lookup Join +-------------- + +A lookup join is typically used to enrich a table with data that is queried from an external system. The join requires one table to have a processing time attribute and the other table to be backed by a lookup source connector. + +The lookup join uses the above [Processing Time Temporal Join](#processing-time-temporal-join) syntax with the right table to be backed by a lookup source connector. + +The following example shows the syntax to specify a lookup join. 
+ +```sql +-- Customers is backed by the JDBC connector and can be used for lookup joins +CREATE TEMPORARY TABLE Customers ( + id INT, + name STRING, + country STRING, + zip STRING +) WITH ( + 'connector' = 'jdbc', + 'url' = 'jdbc:mysql://mysqlhost:3306/customerdb', + 'table-name' = 'customers' +); + +-- enrich each order with customer information +SELECT o.order_id, o.total, c.country, c.zip +FROM Orders AS o + JOIN Customers FOR SYSTEM_TIME AS OF o.proc_time AS c + ON o.customer_id = c.id; +``` + +In the example above, the Orders table is enriched with data from the Customers table which resides in a MySQL database. The `FOR SYSTEM_TIME AS OF` clause with the subsequent processing time attribute ensures that each row of the `Orders` table is joined with those Customers rows that match the join predicate at the point in time when the `Orders` row is processed by the join operator. It also prevents that the join result is updated when a joined `Customer` row is updated in the future. The lookup join also requires a mandatory equality join predicate, in the example above `o.customer_id = c.id`. + +Array Expansion +-------------- + +Returns a new row for each element in the given array. Unnesting `WITH ORDINALITY` is not yet supported. + +```sql +SELECT order_id, tag +FROM Orders CROSS JOIN UNNEST(tags) AS t (tag) +``` + +Table Function +-------------- + +Joins a table with the results of a table function. Each row of the left (outer) table is joined with all rows produced by the corresponding call of the table function. [User-defined table functions]({{< ref "docs/dev/table/functions/udfs" >}}#table-functions) must be registered before use. + +### INNER JOIN + +The row of the left (outer) table is dropped, if its table function call returns an empty result. + +```sql +SELECT order_id, res +FROM Orders, +LATERAL TABLE(table_func(order_id)) t(res) +``` + +### LEFT OUTER JOIN + +If a table function call returns an empty result, the corresponding outer row is preserved, and the result padded with null values. Currently, a left outer join against a lateral table requires a TRUE literal in the ON clause. + +```sql +SELECT order_id, res +FROM Orders +LEFT OUTER JOIN LATERAL TABLE(table_func(order_id)) t(res) + ON TRUE +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/limit.md b/docs/content.zh/docs/dev/table/sql/queries/limit.md new file mode 100644 index 0000000000000..65fa92e3b01fc --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/limit.md @@ -0,0 +1,40 @@ +--- +title: "LIMIT 语句" +weight: 13 +type: docs +--- + + +# LIMIT 语句 + +{{< label Batch >}} + +`LIMIT` clause constrains the number of rows returned by the `SELECT` statement. In general, this clause is used in conjunction with ORDER BY to ensure that the results are deterministic. + +The following example selects the first 3 rows in `Orders` table. 
+ +```sql +SELECT * +FROM Orders +ORDER BY orderTime +LIMIT 3 +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/match_recognize.md b/docs/content.zh/docs/dev/table/sql/queries/match_recognize.md new file mode 100644 index 0000000000000..7008de47ecf45 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/match_recognize.md @@ -0,0 +1,1012 @@ +--- +title: "模式检测" +weight: 17 +type: docs +aliases: + - /zh/dev/table/streaming/match_recognize.html +--- + + +# 模式检测 + +{{< label Streaming >}} + +搜索一组事件模式(event pattern)是一种常见的用例,尤其是在数据流情景中。Flink 提供[复杂事件处理(CEP)库]({{< ref "docs/libs/cep" >}}),该库允许在事件流中进行模式检测。此外,Flink 的 SQL API 提供了一种关系式的查询表达方式,其中包含大量内置函数和基于规则的优化,可以开箱即用。 + +2016 年 12 月,国际标准化组织(ISO)发布了新版本的 SQL 标准,其中包括在 _SQL 中的行模式识别(Row Pattern Recognition in SQL)_([ISO/IEC TR 19075-5:2016](https://standards.iso.org/ittf/PubliclyAvailableStandards/c065143_ISO_IEC_TR_19075-5_2016.zip))。它允许 Flink 使用 `MATCH_RECOGNIZE` 子句融合 CEP 和 SQL API,以便在 SQL 中进行复杂事件处理。 + +`MATCH_RECOGNIZE` 子句启用以下任务: +* 使用 `PARTITION BY` 和 `ORDER BY` 子句对数据进行逻辑分区和排序。 +* 使用 `PATTERN` 子句定义要查找的行模式。这些模式使用类似于正则表达式的语法。 +* 在 `DEFINE` 子句中指定行模式变量的逻辑组合。 +* measures 是指在 `MEASURES` 子句中定义的表达式,这些表达式可用于 SQL 查询中的其他部分。 + +下面的示例演示了基本模式识别的语法: + +```sql +SELECT T.aid, T.bid, T.cid +FROM MyTable + MATCH_RECOGNIZE ( + PARTITION BY userid + ORDER BY proctime + MEASURES + A.id AS aid, + B.id AS bid, + C.id AS cid + PATTERN (A B C) + DEFINE + A AS name = 'a', + B AS name = 'b', + C AS name = 'c' + ) AS T +``` + +本页将更详细地解释每个关键字,并演示说明更复杂的示例。 + +{% info 注意 %} Flink 的 `MATCH_RECOGNIZE` 子句实现是一个完整标准子集。仅支持以下部分中记录的功能。基于社区反馈,可能会支持其他功能,请查看[已知的局限](#known-limitations)。 + + + + + +介绍和示例 +------------------------- + + + +### 安装指南 + +模式识别特性使用 Apache Flink 内部的 CEP 库。为了能够使用 `MATCH_RECOGNIZE` 子句,需要将库作为依赖项添加到 Maven 项目中。 + +```xml + + org.apache.flink + flink-cep{{ site.scala_version_suffix }} + {{ site.version }} + +``` + +或者,也可以将依赖项添加到集群的 classpath(查看 [dependency section]({{< ref "docs/dev/datastream/project-configuration" >}}) 获取更多相关依赖信息)。 + +如果你想在 [SQL Client]({{< ref "docs/dev/table/sqlClient" >}}) 中使用 `MATCH_RECOGNIZE` 子句,你无需执行任何操作,因为默认情况下包含所有依赖项。 + + + +### SQL 语义 + +每个 `MATCH_RECOGNIZE` 查询都包含以下子句: + +* [PARTITION BY](#partitioning) - 定义表的逻辑分区;类似于 `GROUP BY` 操作。 +* [ORDER BY](#order-of-events) - 指定传入行的排序方式;这是必须的,因为模式依赖于顺序。 +* [MEASURES](#define--measures) - 定义子句的输出;类似于 `SELECT` 子句。 +* [ONE ROW PER MATCH](#output-mode) - 输出方式,定义每个匹配项应产生多少行。 +* [AFTER MATCH SKIP](#after-match-strategy) - 指定下一个匹配的开始位置;这也是控制单个事件可以属于多少个不同匹配项的方法。 +* [PATTERN](#defining-a-pattern) - 允许使用类似于 _正则表达式_ 的语法构造搜索的模式。 +* [DEFINE](#define--measures) - 本部分定义了模式变量必须满足的条件。 + +注意 目前,`MATCH_RECOGNIZE` 子句只能应用于[追加表]({{< ref "docs/dev/table/concepts/dynamic_tables" >}}#update-and-append-queries)。此外,它也总是生成一个追加表。 + + + +### 示例 + +对于我们的示例,我们假设已经注册了一个表 `Ticker`。该表包含特定时间点的股票价格。 + +这张表的 schema 如下: + +```text +Ticker + |-- symbol: String # 股票的代号 + |-- price: Long # 股票的价格 + |-- tax: Long # 股票应纳税额 + |-- rowtime: TimeIndicatorTypeInfo(rowtime) # 更改这些值的时间点 +``` + +为了简化,我们只考虑单个股票 `ACME` 的传入数据。Ticker 可以类似于下表,其中的行是连续追加的。 + +```text +symbol rowtime price tax +====== ==================== ======= ======= +'ACME' '01-Apr-11 10:00:00' 12 1 +'ACME' '01-Apr-11 10:00:01' 17 2 +'ACME' '01-Apr-11 10:00:02' 19 1 +'ACME' '01-Apr-11 10:00:03' 21 3 +'ACME' '01-Apr-11 10:00:04' 25 2 +'ACME' '01-Apr-11 10:00:05' 18 1 +'ACME' '01-Apr-11 10:00:06' 15 1 +'ACME' '01-Apr-11 10:00:07' 14 2 +'ACME' '01-Apr-11 10:00:08' 24 2 +'ACME' '01-Apr-11 10:00:09' 25 2 +'ACME' '01-Apr-11 10:00:10' 19 1 +``` + 
+现在的任务是找出一个单一股票价格不断下降的时期。为此,可以编写如下查询: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE ( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + START_ROW.rowtime AS start_tstamp, + LAST(PRICE_DOWN.rowtime) AS bottom_tstamp, + LAST(PRICE_UP.rowtime) AS end_tstamp + ONE ROW PER MATCH + AFTER MATCH SKIP TO LAST PRICE_UP + PATTERN (START_ROW PRICE_DOWN+ PRICE_UP) + DEFINE + PRICE_DOWN AS + (LAST(PRICE_DOWN.price, 1) IS NULL AND PRICE_DOWN.price < START_ROW.price) OR + PRICE_DOWN.price < LAST(PRICE_DOWN.price, 1), + PRICE_UP AS + PRICE_UP.price > LAST(PRICE_DOWN.price, 1) + ) MR; +``` + +此查询将 `Ticker` 表按照 `symbol` 列进行分区并按照 `rowtime` 属性进行排序。 + +`PATTERN` 子句指定我们对以下模式感兴趣:该模式具有开始事件 `START_ROW`,然后是一个或多个 `PRICE_DOWN` 事件,并以 `PRICE_UP` 事件结束。如果可以找到这样的模式,如 `AFTER MATCH SKIP TO LAST` 子句所示,则从最后一个 `PRICE_UP` 事件开始寻找下一个模式匹配。 + +`DEFINE` 子句指定 `PRICE_DOWN` 和 `PRICE_UP` 事件需要满足的条件。尽管不存在 `START_ROW` 模式变量,但它具有一个始终被评估为 `TRUE` 隐式条件。 + +模式变量 `PRICE_DOWN` 定义为价格小于满足 `PRICE_DOWN` 条件的最后一行。对于初始情况或没有满足 `PRICE_DOWN` 条件的最后一行时,该行的价格应小于该模式中前一行(由 `START_ROW` 引用)的价格。 + +模式变量 `PRICE_UP` 定义为价格大于满足 `PRICE_DOWN` 条件的最后一行。 + +此查询为股票价格持续下跌的每个期间生成摘要行。 + +在查询的 `MEASURES` 子句部分定义确切的输出行信息。输出行数由 `ONE ROW PER MATCH` 输出方式定义。 + +```text + symbol start_tstamp bottom_tstamp end_tstamp +========= ================== ================== ================== +ACME 01-APR-11 10:00:04 01-APR-11 10:00:07 01-APR-11 10:00:08 +``` + +该行结果描述了从 `01-APR-11 10:00:04` 开始的价格下跌期,在 `01-APR-11 10:00:07` 达到最低价格,到 `01-APR-11 10:00:08` 再次上涨。 + + + +分区 +------------ + +可以在分区数据中寻找模式,例如单个股票行情或特定用户的趋势。这可以用 `PARTITION BY` 子句来表示。该子句类似于对 aggregation 使用 `GROUP BY`。 + +注意 强烈建议对传入的数据进行分区,否则 `MATCH_RECOGNIZE` 子句将被转换为非并行算子,以确保全局排序。 + + + +事件顺序 +--------------- + +Apache Flink 可以根据时间([处理时间或者事件时间]({{< ref "docs/dev/table/concepts/time_attributes" >}}))进行模式搜索。 + +如果是事件时间,则在将事件传递到内部模式状态机之前对其进行排序。所以,无论行添加到表的顺序如何,生成的输出都是正确的。而模式是按照每行中所包含的时间指定顺序计算的。 + +`MATCH_RECOGNIZE` 子句假定升序的 [时间属性]({{< ref "docs/dev/table/concepts/time_attributes" >}}) 是 `ORDER BY` 子句的第一个参数。 + +对于示例 `Ticker` 表,诸如 `ORDER BY rowtime ASC, price DESC` 的定义是有效的,但 `ORDER BY price, rowtime` 或者 `ORDER BY rowtime DESC, price ASC` 是无效的。 + +Define & Measures +----------------- + +`DEFINE` 和 `MEASURES` 关键字与简单 SQL 查询中的 `WHERE` 和 `SELECT` 子句具有相近的含义。 + +`MEASURES` 子句定义匹配模式的输出中要包含哪些内容。它可以投影列并定义表达式进行计算。产生的行数取决于[输出方式](#output-mode)设置。 + +`DEFINE` 子句指定行必须满足的条件才能被分类到相应的[模式变量](#defining-a-pattern)。如果没有为模式变量定义条件,则将对每一行使用计算结果为 `true` 的默认条件。 + +有关在这些子句中可使用的表达式的更详细的说明,请查看[事件流导航](#pattern-navigation)部分。 + +### Aggregations + +Aggregations 可以在 `DEFINE` 和 `MEASURES` 子句中使用。支持[内置函数]({{< ref "docs/dev/table/functions/systemFunctions" >}})和[用户自定义函数]({{< ref "docs/dev/table/functions/udfs" >}})。 + +对相应匹配项的行子集可以使用 Aggregate functions。请查看[事件流导航](#pattern-navigation)部分以了解如何计算这些子集。 + +下面这个示例的任务是找出股票平均价格没有低于某个阈值的最长时间段。它展示了 `MATCH_RECOGNIZE` 在 aggregation 中的可表达性。可以使用以下查询执行此任务: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE ( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + FIRST(A.rowtime) AS start_tstamp, + LAST(A.rowtime) AS end_tstamp, + AVG(A.price) AS avgPrice + ONE ROW PER MATCH + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A+ B) + DEFINE + A AS AVG(A.price) < 15 + ) MR; +``` + +给定此查询和以下输入值: + +```text +symbol rowtime price tax +====== ==================== ======= ======= +'ACME' '01-Apr-11 10:00:00' 12 1 +'ACME' '01-Apr-11 10:00:01' 17 2 +'ACME' '01-Apr-11 10:00:02' 13 1 +'ACME' '01-Apr-11 10:00:03' 16 3 +'ACME' '01-Apr-11 10:00:04' 25 2 +'ACME' '01-Apr-11 10:00:05' 2 1 +'ACME' '01-Apr-11 10:00:06' 4 1 +'ACME' '01-Apr-11 10:00:07' 10 2 +'ACME' 
'01-Apr-11 10:00:08' 15 2 +'ACME' '01-Apr-11 10:00:09' 25 2 +'ACME' '01-Apr-11 10:00:10' 25 1 +'ACME' '01-Apr-11 10:00:11' 30 1 +``` + +只要事件的平均价格不超过 `15`,查询就会将事件作为模式变量 `A` 的一部分进行累积。 +例如,这种限制发生在 `01-Apr-11 10:00:04`。接下来的时间段在 `01-Apr-11 10:00:11` 再次超过平均价格 `15`。因此,所述查询的结果将是: + +```text + symbol start_tstamp end_tstamp avgPrice +========= ================== ================== ============ +ACME 01-APR-11 10:00:00 01-APR-11 10:00:03 14.5 +ACME 01-APR-11 10:00:05 01-APR-11 10:00:10 13.5 +``` + +注意 Aggregation 可以应用于表达式,但前提是它们引用单个模式变量。因此,`SUM(A.price * A.tax)` 是有效的,而 `AVG(A.price * B.tax)` 则是无效的。 + +注意 不支持 `DISTINCT` aggregation。 + + + +定义模式 +------------------ + +`MATCH_RECOGNIZE` 子句允许用户在事件流中使用功能强大、表达力强的语法搜索模式,这种语法与广泛使用的正则表达式语法有些相似。 + +每个模式都是由基本的构建块构造的,称为 _模式变量_,可以应用算子(量词和其他修饰符)到这些模块中。整个模式必须用括号括起来。 + +示例模式如下所示: + +```sql +PATTERN (A B+ C* D) +``` + +可以使用以下算子: + +* _Concatenation_ - 像 `(A B)` 这样的模式意味着 `A` 和 `B` 之间的连接是严格的。因此,在它们之间不能存在没有映射到 `A` 或 `B` 的行。 +* _Quantifiers_ - 修改可以映射到模式变量的行数。 + * `*` — _0_ 或者多行 + * `+` — _1_ 或者多行 + * `?` — _0_ 或者 _1_ 行 + * `{ n }` — 严格 _n_ 行(_n > 0_) + * `{ n, }` — _n_ 或者更多行(_n ≥ 0_) + * `{ n, m }` — 在 _n_ 到 _m_(包含)行之间(_0 ≤ n ≤ m,0 < m_) + * `{ , m }` — 在 _0_ 到 _m_(包含)行之间(_m > 0_) + + +注意 不支持可能产生空匹配的模式。此类模式的示例如 `PATTERN (A*)`,`PATTERN (A? B*)`,`PATTERN (A{0,} B{0,} C*)` 等。 + + + +### 贪婪量词和勉强量词 + +每一个量词可以是 _贪婪_(默认行为)的或者 _勉强_ 的。贪婪的量词尝试匹配尽可能多的行,而勉强的量词则尝试匹配尽可能少的行。 + +为了说明区别,可以通过查询查看以下示例,其中贪婪量词应用于 `B` 变量: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + C.price AS lastPrice + ONE ROW PER MATCH + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B* C) + DEFINE + A AS A.price > 10, + B AS B.price < 15, + C AS C.price > 12 + ) +``` + +假设我们有以下输入: + +```text + symbol tax price rowtime +======= ===== ======== ===================== + XYZ 1 10 2018-09-17 10:00:02 + XYZ 2 11 2018-09-17 10:00:03 + XYZ 1 12 2018-09-17 10:00:04 + XYZ 2 13 2018-09-17 10:00:05 + XYZ 1 14 2018-09-17 10:00:06 + XYZ 2 16 2018-09-17 10:00:07 +``` + +上面的模式将产生以下输出: + +```text + symbol lastPrice +======== =========== + XYZ 16 +``` + +将 `B*` 修改为 `B*?` 的同一查询,这意味着 `B*` 应该是勉强的,将产生: + +```text + symbol lastPrice +======== =========== + XYZ 13 + XYZ 16 +``` + +模式变量 `B` 只匹配价格为 `12` 的行,而不是包含价格为 `12`、`13` 和 `14` 的行。 + +注意 模式的最后一个变量不能使用贪婪量词。因此,不允许使用类似 `(A B*)` 的模式。通过引入条件为 `B` 的人工状态(例如 `C`),可以轻松解决此问题。因此,你可以使用类似以下的查询: + +```sql +PATTERN (A B* C) +DEFINE + A AS condA(), + B AS condB(), + C AS NOT condB() +``` + +注意 目前不支持可选的勉强量词(`A??` 或者 `A{0,1}?`)。 + + + +### 时间约束 + +特别是对于流的使用场景,通常需要在给定的时间内完成模式。这要求限制住 Flink 在内部必须保持的状态总体大小(即已经过期的状态就不需要再维护了),即使在贪婪的量词的情况下也是如此。 + +因此,Flink SQL 支持附加的(非标准 SQL)`WITHIN` 子句来定义模式的时间约束。子句可以在 `PATTERN` 子句之后定义,并以毫秒为间隔进行解析。 + +如果潜在匹配的第一个和最后一个事件之间的时间长于给定值,则不会将这种匹配追加到结果表中。 + +注意 通常鼓励使用 `WITHIN` 子句,因为它有助于 Flink 进行有效的内存管理。一旦达到阈值,即可修剪基础状态。 + +注意 然而,`WITHIN` 子句不是 SQL 标准的一部分。时间约束处理的方法已被提议将来可能会改变。 + +下面的示例查询说明了 `WITHIN` 子句的用法: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + C.rowtime AS dropTime, + A.price - C.price AS dropDiff + ONE ROW PER MATCH + AFTER MATCH SKIP PAST LAST ROW + PATTERN (A B* C) WITHIN INTERVAL '1' HOUR + DEFINE + B AS B.price > A.price - 10 + C AS C.price < A.price - 10 + ) +``` + +该查询检测到在 1 小时的间隔内价格下降了 `10`。 + +假设该查询用于分析以下股票数据: + +```text +symbol rowtime price tax +====== ==================== ======= ======= +'ACME' '01-Apr-11 10:00:00' 20 1 +'ACME' '01-Apr-11 10:20:00' 17 2 +'ACME' '01-Apr-11 10:40:00' 18 1 +'ACME' '01-Apr-11 11:00:00' 11 3 +'ACME' '01-Apr-11 11:20:00' 14 2 +'ACME' 
'01-Apr-11 11:40:00' 9 1 +'ACME' '01-Apr-11 12:00:00' 15 1 +'ACME' '01-Apr-11 12:20:00' 14 2 +'ACME' '01-Apr-11 12:40:00' 24 2 +'ACME' '01-Apr-11 13:00:00' 1 2 +'ACME' '01-Apr-11 13:20:00' 19 1 +``` + +查询将生成以下结果: + +```text +symbol dropTime dropDiff +====== ==================== ============= +'ACME' '01-Apr-11 13:00:00' 14 +``` + +结果行代表价格从 `15`(在`01-Apr-11 12:00:00`)下降到 `1`(在`01-Apr-11 13:00:00`)。`dropDiff` 列包含价格差异。 + +请注意,即使价格也下降了较高的值,例如,下降了 `11`(在 `01-Apr-11 10:00:00` 和 `01-Apr-11 11:40:00` 之间),这两个事件之间的时间差大于 1 小时。因此,它们不会产生匹配。 + + + +输出方式 +----------- + +_输出方式_ 描述每个找到的匹配项应该输出多少行。SQL 标准描述了两种方式: +- `ALL ROWS PER MATCH` +- `ONE ROW PER MATCH` + +目前,唯一支持的输出方式是 `ONE ROW PER MATCH`,它将始终为每个找到的匹配项生成一个输出摘要行。 + +输出行的 schema 将是按特定顺序连接 `[partitioning columns] + [measures columns]`。 + +以下示例显示了所定义的查询的输出: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + FIRST(A.price) AS startPrice, + LAST(A.price) AS topPrice, + B.price AS lastPrice + ONE ROW PER MATCH + PATTERN (A+ B) + DEFINE + A AS LAST(A.price, 1) IS NULL OR A.price > LAST(A.price, 1), + B AS B.price < LAST(A.price) + ) +``` + +对于以下输入行: + +```text + symbol tax price rowtime +======== ===== ======== ===================== + XYZ 1 10 2018-09-17 10:00:02 + XYZ 2 12 2018-09-17 10:00:03 + XYZ 1 13 2018-09-17 10:00:04 + XYZ 2 11 2018-09-17 10:00:05 +``` + +该查询将生成以下输出: + +```text + symbol startPrice topPrice lastPrice +======== ============ ========== =========== + XYZ 10 13 11 +``` + +该模式识别由 `symbol` 列分区。即使在 `MEASURES` 子句中未明确提及,分区列仍会添加到结果的开头。 + + + +模式导航 +------------------ + +`DEFINE` 和 `MEASURES` 子句允许在(可能)匹配模式的行列表中进行导航。 + +本节讨论用于声明条件或产生输出结果的导航。 + + + +### 引用模式变量 + +_引用模式变量_ 允许引用一组映射到 `DEFINE` 或 `MEASURES` 子句中特定模式变量的行。 + +例如,如果我们尝试将当前行与 `A` 进行匹配,则表达式 `A.price` 描述了目前为止已映射到 `A` 的一组行加上当前行。如果 `DEFINE`/`MEASURES` 子句中的表达式需要一行(例如 `a.price` 或 `a.price > 10`),它将选择属于相应集合的最后一个值。 + +如果没有指定模式变量(例如 `SUM(price)`),则表达式引用默认模式变量 `*`,该变量引用模式中的所有变量。换句话说,它创建了一个列表,其中列出了迄今为止映射到任何变量的所有行以及当前行。 + + + +#### 示例 + +对于更全面的示例,可以查看以下模式和相应的条件: + +```sql +PATTERN (A B+) +DEFINE + A AS A.price >= 10, + B AS B.price > A.price AND SUM(price) < 100 AND SUM(B.price) < 80 +``` + +下表描述了如何为每个传入事件计算这些条件。 + +该表由以下列组成: + * `#` - 行标识符,用于唯一标识列表中的传入行 `[A.price]`/`[B.price]`/`[price]`。 + * `price` - 传入行的价格。 + * `[A.price]`/`[B.price]`/`[price]` - 描述 `DEFINE` 子句中用于计算条件的行列表。 + * `Classifier` - 当前行的分类器,指示该行映射到的模式变量。 + * `A.price`/`B.price`/`SUM(price)`/`SUM(B.price)` - 描述了这些表达式求值后的结果。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    #priceClassifier[A.price][B.price][price]A.priceB.priceSUM(price)SUM(B.price)
    #110-> A#1--10---
    #215-> B#1#2#1, #210152515
    #320-> B#1#2, #3#1, #2, #310204535
    #431-> B#1#2, #3, #4#1, #2, #3, #410317666
    #535#1#2, #3, #4, #5#1, #2, #3, #4, #51035111101
    + +从表中可以看出,第一行映射到模式变量 `A`,随后的行映射到模式变量 `B`。但是,最后一行不满足 `B` 条件,因为所有映射行 `SUM(price)` 的总和与 `B` 中所有行的总和都超过了指定的阈值。 + +### Logical Offsets + +_Logical offsets_ 在映射到指定模式变量的事件启用导航。这可以用两个相应的函数表示: + + + + + + + + + + + + + + + + + + +
    Offset functions描述
    +```text +LAST(variable.field, n) +``` + +

    返回映射到变量最后 n 个元素的事件中的字段值。计数从映射的最后一个元素开始。

    +
    +```text +FIRST(variable.field, n) +``` + +

    返回映射到变量的第 n 个元素的事件中的字段值。计数从映射的第一个元素开始。

    +
    + + + +#### 示例 + +对于更全面的示例,可以参考以下模式和相应的条件: + +```sql +PATTERN (A B+) +DEFINE + A AS A.price >= 10, + B AS (LAST(B.price, 1) IS NULL OR B.price > LAST(B.price, 1)) AND + (LAST(B.price, 2) IS NULL OR B.price > 2 * LAST(B.price, 2)) +``` + +下表描述了如何为每个传入事件计算这些条件。 + +该表包括以下列: + * `price` - 传入行的价格。 + * `Classifier` - 当前行的分类器,指示该行映射到的模式变量。 + * `LAST(B.price, 1)`/`LAST(B.price, 2)` - 描述对这些表达式求值后的结果。 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    priceClassifierLAST(B.price, 1)LAST(B.price, 2)Comment
    10-> A
    15-> Bnullnull注意 LAST(B.price, 1) 为空,因为仍然没有映射到 B
    20-> B15null
    31-> B2015
    353120因为 35 < 2 * 20 没有映射。
    + +将默认模式变量与 logical offsets 一起使用也可能很有意义。 + +在这种情况下,offset 会包含到目前为止映射的所有行: + +```sql +PATTERN (A B? C) +DEFINE + B AS B.price < 20, + C AS LAST(price, 1) < C.price +``` + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    priceClassifierLAST(price, 1)Comment
    10-> A
    15-> B
    20-> C15LAST(price, 1) 被计算为映射到 B 变量的行的价格。
    + +如果第二行没有映射到 `B` 变量,则会得到以下结果: + + + + + + + + + + + + + + + + + + + + + + + + +
    priceClassifierLAST(price, 1)Comment
    10-> A
    20-> C10LAST(price, 1) 被计算为映射到 A 变量的行的价格。
    + +也可以在 `FIRST/LAST` 函数的第一个参数中使用多个模式变量引用。这样,可以编写访问多个列的表达式。但是,它们都必须使用相同的模式变量。换句话说,必须在一行中计算 `LAST`/`FIRST` 函数的值。 + +因此,可以使用 `LAST(A.price * A.tax)`,但不允许使用类似 `LAST(A.price * B.tax)` 的表达式。 + + + +匹配后的策略 +-------------------- + +`AFTER MATCH SKIP` 子句指定在找到完全匹配后从何处开始新的匹配过程。 + +有四种不同的策略: +* `SKIP PAST LAST ROW` - 在当前匹配的最后一行之后的下一行继续模式匹配。 +* `SKIP TO NEXT ROW` - 继续从匹配项开始行后的下一行开始搜索新匹配项。 +* `SKIP TO LAST variable` - 恢复映射到指定模式变量的最后一行的模式匹配。 +* `SKIP TO FIRST variable` - 在映射到指定模式变量的第一行继续模式匹配。 + +这也是一种指定单个事件可以属于多少个匹配项的方法。例如,使用 `SKIP PAST LAST ROW` 策略,每个事件最多只能属于一个匹配项。 + + + +#### 示例 + +为了更好地理解这些策略之间的差异,我们可以看看下面的例子。 + +对于以下输入行: + +```text + symbol tax price rowtime +======== ===== ======= ===================== + XYZ 1 7 2018-09-17 10:00:01 + XYZ 2 9 2018-09-17 10:00:02 + XYZ 1 10 2018-09-17 10:00:03 + XYZ 2 5 2018-09-17 10:00:04 + XYZ 2 10 2018-09-17 10:00:05 + XYZ 2 7 2018-09-17 10:00:06 + XYZ 2 14 2018-09-17 10:00:07 +``` + +我们使用不同的策略评估以下查询: + +```sql +SELECT * +FROM Ticker + MATCH_RECOGNIZE( + PARTITION BY symbol + ORDER BY rowtime + MEASURES + SUM(A.price) AS sumPrice, + FIRST(rowtime) AS startTime, + LAST(rowtime) AS endTime + ONE ROW PER MATCH + [AFTER MATCH STRATEGY] + PATTERN (A+ C) + DEFINE + A AS SUM(A.price) < 30 + ) +``` + +该查询返回映射到 `A` 的总体匹配的第一个和最后一个时间戳所有行的价格之和。 + +查询将根据使用的 `AFTER MATCH` 策略产生不同的结果: + +##### `AFTER MATCH SKIP PAST LAST ROW` + +```text + symbol sumPrice startTime endTime +======== ========== ===================== ===================== + XYZ 26 2018-09-17 10:00:01 2018-09-17 10:00:04 + XYZ 17 2018-09-17 10:00:05 2018-09-17 10:00:07 +``` + +第一个结果与 #1,#2,#3,#4 行匹配。 + +第二个结果与 #5,#6, #7 行匹配。 + +##### `AFTER MATCH SKIP TO NEXT ROW` + +```text + symbol sumPrice startTime endTime +======== ========== ===================== ===================== + XYZ 26 2018-09-17 10:00:01 2018-09-17 10:00:04 + XYZ 24 2018-09-17 10:00:02 2018-09-17 10:00:05 + XYZ 25 2018-09-17 10:00:03 2018-09-17 10:00:06 + XYZ 22 2018-09-17 10:00:04 2018-09-17 10:00:07 + XYZ 17 2018-09-17 10:00:05 2018-09-17 10:00:07 +``` + +同样,第一个结果与 #1,#2,#3,#4 行匹配。 + +与上一个策略相比,下一个匹配再次包含 #2 行匹配。因此,第二个结果与 #2,#3,#4,#5 行匹配。 + +第三个结果与 #3,#4,#5, #6 行匹配。 + +第四个结果与 #4,#5,#6, #7 行匹配。 + +最后一个结果与 #5,#6, #7 行匹配。 + +##### `AFTER MATCH SKIP TO LAST A` + +```text + symbol sumPrice startTime endTime +======== ========== ===================== ===================== + XYZ 26 2018-09-17 10:00:01 2018-09-17 10:00:04 + XYZ 25 2018-09-17 10:00:03 2018-09-17 10:00:06 + XYZ 17 2018-09-17 10:00:05 2018-09-17 10:00:07 +``` + +同样,第一个结果与 #1,#2,#3,#4 行匹配。 + +与前一个策略相比,下一个匹配只包含 #3 行(对应 `A`)用于下一个匹配。因此,第二个结果与 #3,#4,#5, #6 行匹配。 + +最后一个结果与 #5,#6, #7 行匹配。 + +##### `AFTER MATCH SKIP TO FIRST A` + +这种组合将产生一个运行时异常,因为人们总是试图在上一个开始的地方开始一个新的匹配。这将产生一个无限循环,因此是禁止的。 + +必须记住,在 `SKIP TO FIRST/LAST variable` 策略的场景下,可能没有映射到该变量的行(例如,对于模式 `A*`)。在这种情况下,将抛出一个运行时异常,因为标准要求一个有效的行来继续匹配。 + + + +时间属性 +--------------- + +为了在 `MATCH_RECOGNIZE` 之上应用一些后续查询,可能需要使用[时间属性]({{< ref "docs/dev/table/concepts/time_attributes" >}})。有两个函数可供选择: + + + + + + + + + + + + + + + + + + + +
    FunctionDescription
    + MATCH_ROWTIME()
    +
    +

    返回映射到给定模式的最后一行的时间戳。

    +

      结果属性是行时间属性,可用于后续基于时间的操作,例如 interval joins 和 group window or over window aggregations。

    +
    + MATCH_PROCTIME()
    +
    +

      返回处理时间属性,该属性可用于随后的基于时间的操作,例如 interval joins 和 group window or over window aggregations。

    +
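+
+下面给出一个简化的示意(模式与列名仅用于演示,假设沿用上文示例中的 `Ticker` 表):在 `MEASURES` 中通过 `MATCH_ROWTIME()` 产生行时间属性列 `match_rowtime`,随后即可在外层查询中对其执行基于时间的窗口聚合。
+
+```sql
+SELECT
+  symbol,
+  TUMBLE_START(match_rowtime, INTERVAL '1' HOUR) AS window_start,
+  COUNT(*) AS drop_cnt
+FROM (
+  SELECT *
+  FROM Ticker
+    MATCH_RECOGNIZE (
+      PARTITION BY symbol
+      ORDER BY rowtime
+      MEASURES
+        A.price AS start_price,
+        B.price AS end_price,
+        MATCH_ROWTIME() AS match_rowtime  -- 行时间属性,可用于后续基于时间的操作
+      ONE ROW PER MATCH
+      AFTER MATCH SKIP PAST LAST ROW
+      PATTERN (A B)
+      DEFINE
+        A AS A.price > 10,
+        B AS B.price < A.price
+    )
+) AS T
+-- 对模式匹配的结果再做每小时的滚动窗口聚合
+GROUP BY symbol, TUMBLE(match_rowtime, INTERVAL '1' HOUR)
+```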
    + + + +控制内存消耗 +------------------------------ + +在编写 `MATCH_RECOGNIZE` 查询时,内存消耗是一个重要的考虑因素,因为潜在匹配的空间是以宽度优先的方式构建的。鉴于此,我们必须确保模式能够完成。最好使用映射到匹配项的合理数量的行,因为它们必须内存相适。 + +例如,该模式不能有没有接受每一行上限的量词。这种模式可以是这样的: + +```sql +PATTERN (A B+ C) +DEFINE + A as A.price > 10, + C as C.price > 20 +``` + +查询将每个传入行映射到 `B` 变量,因此永远不会完成。可以纠正此查询,例如,通过否定 `C` 的条件: + +```sql +PATTERN (A B+ C) +DEFINE + A as A.price > 10, + B as B.price <= 20, + C as C.price > 20 +``` + +或者使用 [reluctant quantifier](#greedy--reluctant-quantifiers): + +```sql +PATTERN (A B+? C) +DEFINE + A as A.price > 10, + C as C.price > 20 +``` + +注意 请注意,`MATCH_RECOGNIZE` 子句未使用配置的 [state retention time]({{< ref "docs/dev/table/config" >}}#idle-state-retention-time)。为此,可能需要使用 `WITHIN` [子句](#time-constraint)。 + + + +已知的局限 +----------------- + +Flink 对 `MATCH_RECOGNIZE` 子句实现是一项长期持续的工作,目前尚不支持 SQL 标准的某些功能。 + +不支持的功能包括: +* 模式表达式: + * Pattern groups - 这意味着量词不能应用于模式的子序列。因此,`(A (B C)+)` 不是有效的模式。 + * Alterations - 像 `PATTERN((A B | C D) E)`这样的模式,这意味着在寻找 `E` 行之前必须先找到子序列 `A B` 或者 `C D`。 + * `PERMUTE` operator - 这等同于它应用于所示的所有变量的排列 `PATTERN (PERMUTE (A, B, C))` = `PATTERN (A B C | A C B | B A C | B C A | C A B | C B A)`。 + * Anchors - `^, $`,表示分区的开始/结束,在流上下文中没有意义,将不被支持。 + * Exclusion - `PATTERN ({- A -} B)` 表示将查找 `A`,但是不会参与输出。这只适用于 `ALL ROWS PER MATCH` 方式。 + * Reluctant optional quantifier - `PATTERN A??` 只支持贪婪的可选量词。 +* `ALL ROWS PER MATCH` 输出方式 - 为参与创建匹配项的每一行产生一个输出行。这也意味着: + * `MEASURES` 子句唯一支持的语义是 `FINAL` + * `CLASSIFIER` 函数,尚不支持返回行映射到的模式变量。 +* `SUBSET` - 它允许创建模式变量的逻辑组,并在 `DEFINE` 和 `MEASURES` 子句中使用这些组。 +* Physical offsets - `PREV/NEXT`,它为所有可见事件建立索引,而不是仅将那些映射到模式变量的事件编入索引(如 [logical offsets](#logical-offsets) 的情况)。 +* 提取时间属性 - 目前无法为后续基于时间的操作提取时间属性。 +* `MATCH_RECOGNIZE` 仅 SQL 支持。Table API 中没有等效项。 +* Aggregations: + * 不支持 distinct aggregations。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/orderby.md b/docs/content.zh/docs/dev/table/sql/queries/orderby.md new file mode 100644 index 0000000000000..f904a63682735 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/orderby.md @@ -0,0 +1,39 @@ +--- +title: "ORDER BY 语句" +weight: 12 +type: docs +--- + + +# ORDER BY 语句 + +{{< label Batch >}} {{< label Streaming >}} + +The `ORDER BY` clause causes the result rows to be sorted according to the specified expression(s). If two rows are equal according to the leftmost expression, they are compared according to the next expression and so on. If they are equal according to all specified expressions, they are returned in an implementation-dependent order. + +When running in streaming mode, the primary sort order of a table must be ascending on a [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}). All subsequent orders can be freely chosen. But there is no this limitation in batch mode. + +```sql +SELECT * +FROM Orders +ORDER BY order_time, order_id +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/over-agg.md b/docs/content.zh/docs/dev/table/sql/queries/over-agg.md new file mode 100644 index 0000000000000..b815adcaa04bd --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/over-agg.md @@ -0,0 +1,101 @@ +--- +title: "Over聚合" +weight: 9 +type: docs +--- + + +# Over Aggregation +{{< label Batch >}} {{< label Streaming >}} + +`OVER` aggregates compute an aggregated value for every input row over a range of ordered rows. In contrast to `GROUP BY` aggregates, `OVER` aggregates do not reduce the number of result rows to a single row for every group. 
Instead, `OVER` aggregates produce an aggregated value for every input row.
+
+The following query computes, for every order, the sum of the amounts of all orders for the same product that were received within one hour before the current order.
+
+```sql
+SELECT order_id, order_time, amount,
+  SUM(amount) OVER (
+    PARTITION BY product
+    ORDER BY order_time
+    RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
+  ) AS one_hour_prod_amount_sum
+FROM Orders
+```
+
+The syntax for an `OVER` window is summarized below.
+
+```sql
+SELECT
+  agg_func(agg_col) OVER (
+    [PARTITION BY col1[, col2, ...]]
+    ORDER BY time_col
+    range_definition),
+  ...
+FROM ...
+```
+
+You can define multiple `OVER` window aggregates in a `SELECT` clause. However, for streaming queries, the `OVER` windows of all aggregates must be identical due to a current limitation.
+
+### ORDER BY
+
+`OVER` windows are defined on an ordered sequence of rows. Since tables do not have an inherent order, the `ORDER BY` clause is mandatory. For streaming queries, Flink currently only supports `OVER` windows that are defined with an ascending [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}) order. Additional orderings are not supported.
+
+### PARTITION BY
+
+`OVER` windows can be defined on a partitioned table. In the presence of a `PARTITION BY` clause, the aggregate is computed for each input row only over the rows of its partition.
+
+### Range Definitions
+
+The range definition specifies how many rows are included in the aggregate. The range is defined with a `BETWEEN` clause that specifies a lower and an upper boundary. All rows between these boundaries are included in the aggregate. Flink only supports `CURRENT ROW` as the upper boundary.
+
+There are two options to define the range: `ROWS` intervals and `RANGE` intervals.
+
+#### RANGE intervals
+
+A `RANGE` interval is defined on the values of the `ORDER BY` column, which in the case of Flink is always a time attribute. The following `RANGE` interval defines that all rows whose time attribute is at most 30 minutes less than that of the current row are included in the aggregate.
+
+```sql
+RANGE BETWEEN INTERVAL '30' MINUTE PRECEDING AND CURRENT ROW
+```
+
+#### ROWS intervals
+
+A `ROWS` interval is a count-based interval. It defines exactly how many rows are included in the aggregate. The following `ROWS` interval defines that the 10 rows preceding the current row and the current row (so 11 rows in total) are included in the aggregate.
+
+```sql
+ROWS BETWEEN 10 PRECEDING AND CURRENT ROW
+```
+
+### WINDOW
+
+The `WINDOW` clause can be used to define an `OVER` window outside of the `SELECT` clause. It can make queries more readable and also allows us to reuse the window definition for multiple aggregates.
+ +```sql +SELECT order_id, order_time, amount, + SUM(amount) OVER w AS sum_amount, + AVG(amount) OVER w AS avg_amount +FROM Orders +WINDOW w AS ( + PARTITION BY product + ORDER BY order_time + RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW) +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/overview.md b/docs/content.zh/docs/dev/table/sql/queries/overview.md new file mode 100644 index 0000000000000..6cc22eaac1145 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/overview.md @@ -0,0 +1,415 @@ +--- +title: "概览" +weight: 1 +type: docs +aliases: + - /dev/table/sql/queries.html + - /dev/table/queries/ + - /dev/table/sql.html +--- + + +# 查询 + + +`SELECT` statements and `VALUES` statements are specified with the `sqlQuery()` method of the `TableEnvironment`. +The method returns the result of the SELECT statement (or the VALUES statements) as a `Table`. +A `Table` can be used in [subsequent SQL and Table API queries]({{< ref "docs/dev/table/common" >}}#mixing-table-api-and-sql), be [converted into a DataStream]({{< ref "docs/dev/table/common" >}}#integration-with-datastream), or [written to a TableSink]({{< ref "docs/dev/table/common" >}}#emit-a-table). +SQL and Table API queries can be seamlessly mixed and are holistically optimized and translated into a single program. + +In order to access a table in a SQL query, it must be [registered in the TableEnvironment]({{< ref "docs/dev/table/common" >}}#register-tables-in-the-catalog). +A table can be registered from a [TableSource]({{< ref "docs/dev/table/common" >}}#register-a-tablesource), [Table]({{< ref "docs/dev/table/common" >}}#register-a-table), [CREATE TABLE statement](#create-table), [DataStream]({{< ref "docs/dev/table/common" >}}#register-a-datastream). +Alternatively, users can also [register catalogs in a TableEnvironment]({{< ref "docs/dev/table/catalogs" >}}) to specify the location of the data sources. + +For convenience, `Table.toString()` automatically registers the table under a unique name in its `TableEnvironment` and returns the name. +So, `Table` objects can be directly inlined into SQL queries as shown in the examples below. + +**Note:** Queries that include unsupported SQL features cause a `TableException`. +The supported features of SQL on batch and streaming tables are listed in the following sections. + +## Specifying a Query + +The following examples show how to specify a SQL queries on registered and inlined tables. 
+ +{{< tabs "f5adf0e8-aae8-4eb4-84a7-ceb156d173e9" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env); + +// ingest a DataStream from an external source +DataStream> ds = env.addSource(...); + +// SQL query with an inlined (unregistered) table +Table table = tableEnv.fromDataStream(ds, $("user"), $("product"), $("amount")); +Table result = tableEnv.sqlQuery( + "SELECT SUM(amount) FROM " + table + " WHERE product LIKE '%Rubber%'"); + +// SQL query with a registered table +// register the DataStream as view "Orders" +tableEnv.createTemporaryView("Orders", ds, $("user"), $("product"), $("amount")); +// run a SQL query on the Table and retrieve the result as a new Table +Table result2 = tableEnv.sqlQuery( + "SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); + +// create and register a TableSink +final Schema schema = new Schema() + .field("product", DataTypes.STRING()) + .field("amount", DataTypes.INT()); + +tableEnv.connect(new FileSystem().path("/path/to/file")) + .withFormat(...) + .withSchema(schema) + .createTemporaryTable("RubberOrders"); + +// run an INSERT SQL on the Table and emit the result to the TableSink +tableEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'"); +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment +val tableEnv = StreamTableEnvironment.create(env) + +// read a DataStream from an external source +val ds: DataStream[(Long, String, Integer)] = env.addSource(...) + +// SQL query with an inlined (unregistered) table +val table = ds.toTable(tableEnv, $"user", $"product", $"amount") +val result = tableEnv.sqlQuery( + s"SELECT SUM(amount) FROM $table WHERE product LIKE '%Rubber%'") + +// SQL query with a registered table +// register the DataStream under the name "Orders" +tableEnv.createTemporaryView("Orders", ds, $"user", $"product", $"amount") +// run a SQL query on the Table and retrieve the result as a new Table +val result2 = tableEnv.sqlQuery( + "SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") + +// create and register a TableSink +val schema = new Schema() + .field("product", DataTypes.STRING()) + .field("amount", DataTypes.INT()) + +tableEnv.connect(new FileSystem().path("/path/to/file")) + .withFormat(...) 
+ .withSchema(schema) + .createTemporaryTable("RubberOrders") + +// run an INSERT SQL on the Table and emit the result to the TableSink +tableEnv.executeSql( + "INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +env = StreamExecutionEnvironment.get_execution_environment() +table_env = StreamTableEnvironment.create(env) + +# SQL query with an inlined (unregistered) table +# elements data type: BIGINT, STRING, BIGINT +table = table_env.from_elements(..., ['user', 'product', 'amount']) +result = table_env \ + .sql_query("SELECT SUM(amount) FROM %s WHERE product LIKE '%%Rubber%%'" % table) + +# create and register a TableSink +t_env.connect(FileSystem().path("/path/to/file"))) + .with_format(Csv() + .field_delimiter(',') + .deriveSchema()) + .with_schema(Schema() + .field("product", DataTypes.STRING()) + .field("amount", DataTypes.BIGINT())) + .create_temporary_table("RubberOrders") + +# run an INSERT SQL on the Table and emit the result to the TableSink +table_env \ + .execute_sql("INSERT INTO RubberOrders SELECT product, amount FROM Orders WHERE product LIKE '%Rubber%'") +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## Execute a Query + + +A SELECT statement or a VALUES statement can be executed to collect the content to local through the `TableEnvironment.executeSql()` method. The method returns the result of the SELECT statement (or the VALUES statement) as a `TableResult`. Similar to a SELECT statement, a `Table` object can be executed using the `Table.execute()` method to collect the content of the query to the local client. +`TableResult.collect()` method returns a closeable row iterator. The select job will not be finished unless all result data has been collected. We should actively close the job to avoid resource leak through the `CloseableIterator#close()` method. +We can also print the select result to client console through the `TableResult.print()` method. The result data in `TableResult` can be accessed only once. Thus, `collect()` and `print()` must not be called after each other. + +`TableResult.collect()` and `TableResult.print()` have slightly different behaviors under different checkpointing settings (to enable checkpointing for a streaming job, see [checkpointing config]({{< ref "docs/deployment/config" >}}#checkpointing)). +* For batch jobs or streaming jobs without checkpointing, `TableResult.collect()` and `TableResult.print()` have neither exactly-once nor at-least-once guarantee. Query results are immediately accessible by the clients once they're produced, but exceptions will be thrown when the job fails and restarts. +* For streaming jobs with exactly-once checkpointing, `TableResult.collect()` and `TableResult.print()` guarantee an end-to-end exactly-once record delivery. A result will be accessible by clients only after its corresponding checkpoint completes. +* For streaming jobs with at-least-once checkpointing, `TableResult.collect()` and `TableResult.print()` guarantee an end-to-end at-least-once record delivery. Query results are immediately accessible by the clients once they're produced, but it is possible for the same result to be delivered multiple times. 
+ +{{< tabs "88a003e1-16ea-43cc-9d42-d43ef1351e53" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env, settings); + +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)"); + +// execute SELECT statement +TableResult tableResult1 = tableEnv.executeSql("SELECT * FROM Orders"); +// use try-with-resources statement to make sure the iterator will be closed automatically +try (CloseableIterator it = tableResult1.collect()) { + while(it.hasNext()) { + Row row = it.next(); + // handle row + } +} + +// execute Table +TableResult tableResult2 = tableEnv.sqlQuery("SELECT * FROM Orders").execute(); +tableResult2.print(); + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +val tableEnv = StreamTableEnvironment.create(env, settings) +// enable checkpointing +tableEnv.getConfig.getConfiguration.set( + ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE) +tableEnv.getConfig.getConfiguration.set( + ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(10)) + +tableEnv.executeSql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)") + +// execute SELECT statement +val tableResult1 = tableEnv.executeSql("SELECT * FROM Orders") +val it = tableResult1.collect() +try while (it.hasNext) { + val row = it.next + // handle row +} +finally it.close() // close the iterator to avoid resource leak + +// execute Table +val tableResult2 = tableEnv.sqlQuery("SELECT * FROM Orders").execute() +tableResult2.print() + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +env = StreamExecutionEnvironment.get_execution_environment() +table_env = StreamTableEnvironment.create(env, settings) +# enable checkpointing +table_env.get_config().get_configuration().set_string("execution.checkpointing.mode", "EXACTLY_ONCE") +table_env.get_config().get_configuration().set_string("execution.checkpointing.interval", "10s") + +table_env.execute_sql("CREATE TABLE Orders (`user` BIGINT, product STRING, amount INT) WITH (...)") + +# execute SELECT statement +table_result1 = table_env.execute_sql("SELECT * FROM Orders") +table_result1.print() + +# execute Table +table_result2 = table_env.sql_query("SELECT * FROM Orders").execute() +table_result2.print() + +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + + +## Syntax + +Flink parses SQL using [Apache Calcite](https://calcite.apache.org/docs/reference.html), which supports standard ANSI SQL. + +The following BNF-grammar describes the superset of supported SQL features in batch and streaming queries. The [Operations](#operations) section shows examples for the supported features and indicates which features are only supported for batch or streaming queries. 
+ +{{< expand Grammar >}} +```sql +query: + values + | { + select + | selectWithoutFrom + | query UNION [ ALL ] query + | query EXCEPT query + | query INTERSECT query + } + [ ORDER BY orderItem [, orderItem ]* ] + [ LIMIT { count | ALL } ] + [ OFFSET start { ROW | ROWS } ] + [ FETCH { FIRST | NEXT } [ count ] { ROW | ROWS } ONLY] + +orderItem: + expression [ ASC | DESC ] + +select: + SELECT [ ALL | DISTINCT ] + { * | projectItem [, projectItem ]* } + FROM tableExpression + [ WHERE booleanExpression ] + [ GROUP BY { groupItem [, groupItem ]* } ] + [ HAVING booleanExpression ] + [ WINDOW windowName AS windowSpec [, windowName AS windowSpec ]* ] + +selectWithoutFrom: + SELECT [ ALL | DISTINCT ] + { * | projectItem [, projectItem ]* } + +projectItem: + expression [ [ AS ] columnAlias ] + | tableAlias . * + +tableExpression: + tableReference [, tableReference ]* + | tableExpression [ NATURAL ] [ LEFT | RIGHT | FULL ] JOIN tableExpression [ joinCondition ] + +joinCondition: + ON booleanExpression + | USING '(' column [, column ]* ')' + +tableReference: + tablePrimary + [ matchRecognize ] + [ [ AS ] alias [ '(' columnAlias [, columnAlias ]* ')' ] ] + +tablePrimary: + [ TABLE ] tablePath [ dynamicTableOptions ] [systemTimePeriod] [[AS] correlationName] + | LATERAL TABLE '(' functionName '(' expression [, expression ]* ')' ')' + | UNNEST '(' expression ')' + +tablePath: + [ [ catalogName . ] schemaName . ] tableName + +systemTimePeriod: + FOR SYSTEM_TIME AS OF dateTimeExpression + +dynamicTableOptions: + /*+ OPTIONS(key=val [, key=val]*) */ + +key: + stringLiteral + +val: + stringLiteral + +values: + VALUES expression [, expression ]* + +groupItem: + expression + | '(' ')' + | '(' expression [, expression ]* ')' + | CUBE '(' expression [, expression ]* ')' + | ROLLUP '(' expression [, expression ]* ')' + | GROUPING SETS '(' groupItem [, groupItem ]* ')' + +windowRef: + windowName + | windowSpec + +windowSpec: + [ windowName ] + '(' + [ ORDER BY orderItem [, orderItem ]* ] + [ PARTITION BY expression [, expression ]* ] + [ + RANGE numericOrIntervalExpression {PRECEDING} + | ROWS numericExpression {PRECEDING} + ] + ')' + +matchRecognize: + MATCH_RECOGNIZE '(' + [ PARTITION BY expression [, expression ]* ] + [ ORDER BY orderItem [, orderItem ]* ] + [ MEASURES measureColumn [, measureColumn ]* ] + [ ONE ROW PER MATCH ] + [ AFTER MATCH + ( SKIP TO NEXT ROW + | SKIP PAST LAST ROW + | SKIP TO FIRST variable + | SKIP TO LAST variable + | SKIP TO variable ) + ] + PATTERN '(' pattern ')' + [ WITHIN intervalLiteral ] + DEFINE variable AS condition [, variable AS condition ]* + ')' + +measureColumn: + expression AS alias + +pattern: + patternTerm [ '|' patternTerm ]* + +patternTerm: + patternFactor [ patternFactor ]* + +patternFactor: + variable [ patternQuantifier ] + +patternQuantifier: + '*' + | '*?' + | '+' + | '+?' + | '?' + | '??' + | '{' { [ minRepeat ], [ maxRepeat ] } '}' ['?'] + | '{' repeat '}' + +``` +{{< /expand >}} + +Flink SQL uses a lexical policy for identifier (table, attribute, function names) similar to Java: + +- The case of identifiers is preserved whether or not they are quoted. +- After which, identifiers are matched case-sensitively. +- Unlike Java, back-ticks allow identifiers to contain non-alphanumeric characters (e.g. "SELECT a AS `my field` FROM t"). + +String literals must be enclosed in single quotes (e.g., `SELECT 'Hello World'`). Duplicate a single quote for escaping (e.g., `SELECT 'It''s me.'`). Unicode characters are supported in string literals. 
If explicit unicode code points are required, use the following syntax: + +- Use the backslash (`\`) as escaping character (default): `SELECT U&'\263A'` +- Use a custom escaping character: `SELECT U&'#263A' UESCAPE '#'` + +{{< top >}} + +## Operations + +- [WITH clause]({{< ref "docs/dev/table/sql/queries/with" >}}) +- [SELECT & WHERE]({{< ref "docs/dev/table/sql/queries/select" >}}) +- [SELECT DISTINCT]({{< ref "docs/dev/table/sql/queries/select-distinct" >}}) +- [Windowing TVF]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}) +- [Window Aggregation]({{< ref "docs/dev/table/sql/queries/window-agg" >}}) +- [Group Aggregation]({{< ref "docs/dev/table/sql/queries/group-agg" >}}) +- [Over Aggregation]({{< ref "docs/dev/table/sql/queries/over-agg" >}}) +- [Joins]({{< ref "docs/dev/table/sql/queries/joins" >}}) +- [Set Operations]({{< ref "docs/dev/table/sql/queries/set-ops" >}}) +- [ORDER BY clause]({{< ref "docs/dev/table/sql/queries/orderby" >}}) +- [LIMIT clause]({{< ref "docs/dev/table/sql/queries/limit" >}}) +- [Top-N]({{< ref "docs/dev/table/sql/queries/topn" >}}) +- [Window Top-N]({{< ref "docs/dev/table/sql/queries/window-topn" >}}) +- [Deduplication]({{< ref "docs/dev/table/sql/queries/deduplication" >}}) +- [Pattern Recognition]({{< ref "docs/dev/table/sql/queries/match_recognize" >}}) + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/select-distinct.md b/docs/content.zh/docs/dev/table/sql/queries/select-distinct.md new file mode 100644 index 0000000000000..737c1216a9501 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/select-distinct.md @@ -0,0 +1,38 @@ +--- +title: "SELECT DISTINCT" +weight: 5 +type: docs +--- + + +# SELECT DISTINCT + +{{< label Batch >}} {{< label Streaming >}} + +If `SELECT DISTINCT` is specified, all duplicate rows are removed from the result set (one row is kept from each group of duplicates). + +```sql +SELECT DISTINCT id FROM Orders +``` + +For streaming queries, the required state for computing the query result might grow infinitely. State size depends on number of distinct rows. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details + + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/select.md b/docs/content.zh/docs/dev/table/sql/queries/select.md new file mode 100644 index 0000000000000..4b29d6e4de694 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/select.md @@ -0,0 +1,65 @@ +--- +title: "SELECT & WHERE" +weight: 4 +type: docs +--- + + +# SELECT & WHERE clause + +{{< label Batch >}} {{< label Streaming >}} + +The general syntax of the `SELECT` statement is: + +```sql +SELECT select_list FROM table_expression [ WHERE boolean_expression ] +``` + +The `table_expression` refers to any source of data. It could be an existing table, view, or `VALUES` clause, the joined results of multiple existing tables, or a subquery. Assuming that the table is available in the catalog, the following would read all rows from `Orders`. + +```sql +SELECT * FROM Orders +``` + +The `select_list` specification `*` means the query will resolve all columns. However, usage of `*` is discouraged in production because it makes queries less robust to catalog changes. Instead, a `select_list` can specify a subset of available columns or make calculations using said columns. 
For example, if `Orders` has columns named `order_id`, `price`, and `tax` you could write the following query: + +```sql +SELECT order_id, price + tax FROM Orders +``` + +Queries can also consume from inline data using the `VALUES` clause. Each tuple corresponds to one row and an alias may be provided to assign names to each column. + +```sql +SELECT order_id, price FROM (VALUES (1, 2.0), (2, 3.1)) AS t (order_id, price) +``` + +Rows can be filtered based on a `WHERE` clause. + +```sql +SELECT price + tax FROM Orders WHERE id = 10 +``` + +Additionally, built-in and [user-defined scalar functions]({{< ref "docs/dev/table/functions/udfs" >}}) can be invoked on the columns of a single row. User-defined functions must be registered in a catalog before use. + +```sql +SELECT PRETTY_PRINT(order_id) FROM Orders +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/set-ops.md b/docs/content.zh/docs/dev/table/sql/queries/set-ops.md new file mode 100644 index 0000000000000..3c58da147e9a1 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/set-ops.md @@ -0,0 +1,140 @@ +--- +title: "集合操作" +weight: 11 +type: docs +--- + + +# Set Operations +{{< label Batch >}} {{< label Streaming >}} + +## UNION + +`UNION` and `UNION ALL` return the rows that are found in either table. +`UNION` takes only distinct rows while `UNION ALL` does not remove duplicates from the result rows. + +```sql +Flink SQL> create view t1(s) as values ('c'), ('a'), ('b'), ('b'), ('c'); +Flink SQL> create view t2(s) as values ('d'), ('e'), ('a'), ('b'), ('b'); + +Flink SQL> (SELECT s FROM t1) UNION (SELECT s FROM t2); ++---+ +| s| ++---+ +| c| +| a| +| b| +| d| +| e| ++---+ + +Flink SQL> (SELECT s FROM t1) UNION ALL (SELECT s FROM t2); ++---+ +| c| ++---+ +| c| +| a| +| b| +| b| +| c| +| d| +| e| +| a| +| b| +| b| ++---+ +``` + +## INTERSECT + +`INTERSECT` and `INTERSECT ALL` return the rows that are found in both tables. +`INTERSECT` takes only distinct rows while `INTERSECT ALL` does not remove duplicates from the result rows. + +```sql +Flink SQL> (SELECT s FROM t1) INTERSECT (SELECT s FROM t2); ++---+ +| s| ++---+ +| a| +| b| ++---+ + +Flink SQL> (SELECT s FROM t1) INTERSECT ALL (SELECT s FROM t2); ++---+ +| s| ++---+ +| a| +| b| +| b| ++---+ +``` + +## EXCEPT + +`EXCEPT` and `EXCEPT ALL` return the rows that are found in one table but not the other. +`EXCEPT` takes only distinct rows while `EXCEPT ALL` does not remove duplicates from the result rows. + +```sql +Flink SQL> (SELECT s FROM t1) EXCEPT (SELECT s FROM t2); ++---+ +| s | ++---+ +| c | ++---+ + +Flink SQL> (SELECT s FROM t1) EXCEPT ALL (SELECT s FROM t2); ++---+ +| s | ++---+ +| c | +| c | ++---+ +``` + +## IN + +Returns true if an expression exists in a given table sub-query. The sub-query table must +consist of one column. This column must have the same data type as the expression. + +```sql +SELECT user, amount +FROM Orders +WHERE product IN ( + SELECT product FROM NewProducts +) +``` + +The optimizer rewrites the IN condition into a join and group operation. For streaming queries, the required state for computing the query result might grow infinitely depending on the number of distinct input rows. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details. 
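+
+As a rough illustration of this rewrite (a sketch of the idea only, not the exact plan the optimizer produces), the `IN` query above behaves like a join against the de-duplicated sub-query:
+
+```sql
+SELECT user, amount
+FROM Orders
+JOIN (
+  -- the "group" part: reduce the sub-query to its distinct products
+  SELECT DISTINCT product
+  FROM NewProducts
+) p ON Orders.product = p.product
+```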
+
+## EXISTS
+
+```sql
+SELECT user, amount
+FROM Orders
+WHERE EXISTS (
+    SELECT product FROM NewProducts
+    WHERE NewProducts.product = Orders.product
+)
+```
+
+Returns true if the sub-query returns at least one row. Only supported if the operation can be rewritten into a join and group operation.
+
+The optimizer rewrites the `EXISTS` operation into a join and group operation. For streaming queries, the required state for computing the query result might grow infinitely depending on the number of distinct input rows. You can provide a query configuration with an appropriate state time-to-live (TTL) to prevent excessive state size. Note that this might affect the correctness of the query result. See [query configuration]({{< ref "docs/dev/table/config" >}}#table-exec-state-ttl) for details.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/dev/table/sql/queries/topn.md b/docs/content.zh/docs/dev/table/sql/queries/topn.md
new file mode 100644
index 0000000000000..d4652b7a1f779
--- /dev/null
+++ b/docs/content.zh/docs/dev/table/sql/queries/topn.md
@@ -0,0 +1,110 @@
+---
+title: "Top-N"
+weight: 14
+type: docs
+---
+
+
+# Top-N
+{{< label Batch >}} {{< label Streaming >}}
+
+Top-N queries ask for the N smallest or largest values ordered by columns. Both smallest and largest value sets are considered Top-N queries. Top-N queries are useful in cases where you need to display only the N bottom-most or the N top-most records of a batch/streaming table by some condition. This result set can be used for further analysis.
+
+Flink uses the combination of an OVER window clause and a filter condition to express a Top-N query. With the `PARTITION BY` clause of the OVER window, Flink also supports per-group Top-N, for example, the top five products per category that have the maximum sales in real time. Top-N queries are supported for SQL on batch and streaming tables.
+
+The following shows the syntax of the Top-N statement:
+
+```sql
+SELECT [column_list]
+FROM (
+   SELECT [column_list],
+     ROW_NUMBER() OVER ([PARTITION BY col1[, col2...]]
+       ORDER BY col1 [asc|desc][, col2 [asc|desc]...]) AS rownum
+   FROM table_name)
+WHERE rownum <= N [AND conditions]
+```
+
+**Parameter Specification:**
+
+- `ROW_NUMBER()`: Assigns a unique, sequential number to each row, starting with one, according to the ordering of rows within the partition. Currently, we only support `ROW_NUMBER` as the over window function. In the future, we will support `RANK()` and `DENSE_RANK()`.
+- `PARTITION BY col1[, col2...]`: Specifies the partition columns. Each partition will have a Top-N result.
+- `ORDER BY col1 [asc|desc][, col2 [asc|desc]...]`: Specifies the ordering columns. The ordering directions can be different on different columns.
+- `WHERE rownum <= N`: The `rownum <= N` is required for Flink to recognize that this query is a Top-N query. N represents the number of smallest or largest records that will be retained.
+- `[AND conditions]`: Other conditions may be freely added to the WHERE clause, but they can only be combined with `rownum <= N` using the `AND` conjunction.
+
+{{< hint info >}}
+Note: the above pattern must be followed exactly, otherwise the optimizer won't be able to translate the query.
+{{< /hint >}}
+
+{{< hint info >}}
+The Top-N query is result-updating. Flink SQL will sort the input data stream according to the order key, so if the top N records change, the changed records will be sent downstream as retraction/update records.
+It is recommended to use a storage system that supports updating as the sink of a Top-N query.
In addition, if the top N records need to be stored in external storage, the result table should have the same unique key with the Top-N query. +{{< /hint >}} + +The unique keys of Top-N query is the combination of partition columns and rownum column. Top-N query can also derive the unique key of upstream. Take following job as an example, say `product_id` is the unique key of the `ShopSales`, then the unique keys of the Top-N query are [`category`, `rownum`] and [`product_id`]. + +The following examples show how to specify SQL queries with Top-N on streaming tables. This is an example to get "the top five products per category that have the maximum sales in realtime" we mentioned above. + +```sql +CREATE TABLE ShopSales ( + product_id STRING, + category STRING, + product_name STRING, + sales BIGINT +) WITH (...); + +SELECT * +FROM ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS row_num + FROM ShopSales) +WHERE row_num <= 5 +``` + +#### No Ranking Output Optimization + +As described above, the `rownum` field will be written into the result table as one field of the unique key, which may lead to a lot of records being written to the result table. For example, when the record (say `product-1001`) of ranking 9 is updated and its rank is upgraded to 1, all the records from ranking 1 ~ 9 will be output to the result table as update messages. If the result table receives too many data, it will become the bottleneck of the SQL job. + +The optimization way is omitting rownum field in the outer SELECT clause of the Top-N query. This is reasonable because the number of the top N records is usually not large, thus the consumers can sort the records themselves quickly. Without rownum field, in the example above, only the changed record (`product-1001`) needs to be sent to downstream, which can reduce much IO to the result table. + +The following example shows how to optimize the above Top-N example in this way: + +```sql +CREATE TABLE ShopSales ( + product_id STRING, + category STRING, + product_name STRING, + sales BIGINT +) WITH (...); + +-- omit row_num field from the output +SELECT product_id, category, product_name, sales +FROM ( + SELECT *, + ROW_NUMBER() OVER (PARTITION BY category ORDER BY sales DESC) AS row_num + FROM ShopSales) +WHERE row_num <= 5 +``` + +Attention in Streaming Mode In order to output the above query to an external storage and have a correct result, the external storage must have the same unique key with the Top-N query. In the above example query, if the `product_id` is the unique key of the query, then the external table should also has `product_id` as the unique key. + + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/window-agg.md b/docs/content.zh/docs/dev/table/sql/queries/window-agg.md new file mode 100644 index 0000000000000..cf6d2de25080e --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/window-agg.md @@ -0,0 +1,332 @@ +--- +title: "窗口聚合" +weight: 7 +type: docs +--- + + +# Window Aggregation + +## Window TVF Aggregation + +{{< label Streaming >}} + +Window aggregations are defined in the `GROUP BY` clause contains "window_start" and "window_end" columns of the relation applied [Windowing TVF]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}). Just like queries with regular `GROUP BY` clauses, queries with a group by window aggregation will compute a single result row per group. + +```sql +SELECT ... +FROM -- relation applied windowing TVF +GROUP BY window_start, window_end, ... 
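+-- window_start and window_end are additional columns produced by the windowing TVF;
+-- grouping by both of them is what makes the query a window aggregation,
+-- and further grouping keys (regular columns) may be listed as well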
+``` + +Unlike other aggregations on continuous tables, window aggregation do not emit intermediate results but only a final result, the total aggregation at the end of the window. Moreover, window aggregations purge all intermediate state when no longer needed. + +### Windowing TVFs + +Flink supports `TUMBLE`, `HOP` and `CUMULATE` types of window aggregations, which can be defined on either [event or processing time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}). See [Windowing TVF]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}) for more windowing functions information. + +Here are some examples for `TUMBLE`, `HOP` and `CUMULATE` window aggregations. + +```sql +-- tables must have time attribute, e.g. `bidtime` in this table +Flink SQL> desc Bid; ++-------------+------------------------+------+-----+--------+---------------------------------+ +| name | type | null | key | extras | watermark | ++-------------+------------------------+------+-----+--------+---------------------------------+ +| bidtime | TIMESTAMP(3) *ROWTIME* | true | | | `bidtime` - INTERVAL '1' SECOND | +| price | DECIMAL(10, 2) | true | | | | +| item | STRING | true | | | | +| supplier_id | STRING | true | | | | ++-------------+------------------------+------+-----+--------+---------------------------------+ + +Flink SQL> SELECT * FROM Bid; ++------------------+-------+------+-------------+ +| bidtime | price | item | supplier_id | ++------------------+-------+------+-------------+ +| 2020-04-15 08:05 | 4.00 | C | supplier1 | +| 2020-04-15 08:07 | 2.00 | A | supplier1 | +| 2020-04-15 08:09 | 5.00 | D | supplier2 | +| 2020-04-15 08:11 | 3.00 | B | supplier2 | +| 2020-04-15 08:13 | 1.00 | E | supplier1 | +| 2020-04-15 08:17 | 6.00 | F | supplier2 | ++------------------+-------+------+-------------+ + +-- tumbling window aggregation +Flink SQL> SELECT window_start, window_end, SUM(price) + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | ++------------------+------------------+-------+ + +-- hopping window aggregation +Flink SQL> SELECT window_start, window_end, SUM(price) + FROM TABLE( + HOP(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES, INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:05 | 2020-04-15 08:15 | 15.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | +| 2020-04-15 08:15 | 2020-04-15 08:25 | 6.00 | ++------------------+------------------+-------+ + +-- cumulative window aggregation +Flink SQL> SELECT window_start, window_end, SUM(price) + FROM TABLE( + CUMULATE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '2' MINUTES, INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:06 | 4.00 | +| 2020-04-15 08:00 | 2020-04-15 08:08 | 6.00 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:10 | 2020-04-15 08:12 | 3.00 | +| 2020-04-15 08:10 | 2020-04-15 08:14 | 4.00 | +| 2020-04-15 08:10 | 
2020-04-15 08:16 | 4.00 | +| 2020-04-15 08:10 | 2020-04-15 08:18 | 10.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | ++------------------+------------------+-------+ +``` + +*Note: in order to better understand the behavior of windowing, we simplify the displaying of timestamp values to not show the trailing zeros, e.g. `2020-04-15 08:05` should be displayed as `2020-04-15 08:05:00.000` in Flink SQL Client if the type is `TIMESTAMP(3)`.* + +### GROUPING SETS + +Window aggregations also support `GROUPING SETS` syntax. Grouping sets allow for more complex grouping operations than those describable by a standard `GROUP BY`. Rows are grouped separately by each specified grouping set and aggregates are computed for each group just as for simple `GROUP BY` clauses. + +Window aggregations with `GROUPING SETS` require both the `window_start` and `window_end` columns have to be in the `GROUP BY` clause, but not in the `GROUPING SETS` clause. + +```sql +Flink SQL> SELECT window_start, window_end, supplier_id, SUM(price) as price + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end, GROUPING SETS ((supplier_id), ()); ++------------------+------------------+-------------+-------+ +| window_start | window_end | supplier_id | price | ++------------------+------------------+-------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | (NULL) | 11.00 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | supplier2 | 5.00 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | supplier1 | 6.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | (NULL) | 10.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | supplier2 | 9.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | supplier1 | 1.00 | ++------------------+------------------+-------------+-------+ +``` + +Each sublist of `GROUPING SETS` may specify zero or more columns or expressions and is interpreted the same way as though used directly in the `GROUP BY` clause. An empty grouping set means that all rows are aggregated down to a single group, which is output even if no input rows were present. + +References to the grouping columns or expressions are replaced by null values in result rows for grouping sets in which those columns do not appear. + +#### ROLLUP + +`ROLLUP` is a shorthand notation for specifying a common type of grouping set. It represents the given list of expressions and all prefixes of the list, including the empty list. + +Window aggregations with `ROLLUP` requires both the `window_start` and `window_end` columns have to be in the `GROUP BY` clause, but not in the `ROLLUP` clause. + +For example, the following query is equivalent to the one above. + +```sql +SELECT window_start, window_end, supplier_id, SUM(price) as price +FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) +GROUP BY window_start, window_end, ROLLUP (supplier_id); +``` + +#### CUBE + +`CUBE` is a shorthand notation for specifying a common type of grouping set. It represents the given list and all of its possible subsets - the power set. + +Window aggregations with `CUBE` requires both the `window_start` and `window_end` columns have to be in the `GROUP BY` clause, but not in the `CUBE` clause. + +For example, the following two queries are equivalent. 
+ +```sql +SELECT window_start, window_end, item, supplier_id, SUM(price) as price + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end, CUBE (supplier_id, item); + +SELECT window_start, window_end, item, supplier_id, SUM(price) as price + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end, GROUPING SETS ( + (supplier_id, item), + (supplier_id ), + ( item), + ( ) +) +``` + +### Selecting Group Window Start and End Timestamps + +The start and end timestamps of group windows can be selected with the grouped `window_start` and `window_end` columns. + +### Cascading Window Aggregation + +The `window_start` and `window_end` columns are regular timestamp columns, not time attributes. Thus they can't be used as time attributes in subsequent time-based operations. +In order to propagate time attributes, you need to additionally add `window_time` column into `GROUP BY` clause. The `window_time` is the third column produced by [Windowing TVFs]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}#window-functions) which is a time attribute of the assigned window. +Adding `window_time` into `GROUP BY` clause makes `window_time` also to be group key that can be selected. Then following queries can use this column for subsequent time-based operations, such as cascading window aggregations and [Window TopN]({{< ref "docs/dev/table/sql/queries/window-topn">}}). + +The following shows a cascading window aggregation where the first window aggregation propagates the time attribute for the second window aggregation. + +```sql +-- tumbling 5 minutes for each supplier_id +CREATE VIEW window1 AS +SELECT window_start, window_end, window_time as rowtime, SUM(price) as partial_price + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES)) + GROUP BY supplier_id, window_start, window_end, window_time; + +-- tumbling 10 minutes on the first window +SELECT window_start, window_end, SUM(partial_price) as total_price + FROM TABLE( + TUMBLE(TABLE window1, DESCRIPTOR(rowtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; +``` + +## Group Window Aggregation + +{{< label Batch >}} {{< label Streaming >}} + +{{< hint warning >}} +Warning: Group Window Aggregation is deprecated. It's encouraged to use Window TVF Aggregation which is more powerful and effective. + +Compared to Group Window Aggregation, Window TVF Aggregation have many advantages, including: +- Have all performance optimizations mentioned in [Performance Tuning]({{< ref "docs/dev/table/tuning" >}}). +- Support standard `GROUPING SETS` syntax. +- Can apply [Window TopN]({{< ref "docs/dev/table/sql/queries/window-topn">}}) after window aggregation result. +- and so on. +{{< /hint >}} + +Group Window Aggregations are defined in the `GROUP BY` clause of a SQL query. Just like queries with regular `GROUP BY` clauses, queries with a `GROUP BY` clause that includes a group window function compute a single result row per group. The following group windows functions are supported for SQL on batch and streaming tables. + +### Group Window Functions + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table table-bordered">
+  <thead>
+    <tr>
+      <th class="text-left">Group Window Function</th>
+      <th class="text-left">Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td><code>TUMBLE(time_attr, interval)</code></td>
+      <td>Defines a tumbling time window. A tumbling time window assigns rows to non-overlapping, continuous windows with a fixed duration (<code>interval</code>). For example, a tumbling window of 5 minutes groups rows in 5 minutes intervals. Tumbling windows can be defined on event-time (stream + batch) or processing-time (stream).</td>
+    </tr>
+    <tr>
+      <td><code>HOP(time_attr, interval, interval)</code></td>
+      <td>Defines a hopping time window (called sliding window in the Table API). A hopping time window has a fixed duration (second <code>interval</code> parameter) and hops by a specified hop interval (first <code>interval</code> parameter). If the hop interval is smaller than the window size, hopping windows are overlapping. Thus, rows can be assigned to multiple windows. For example, a hopping window of 15 minutes size and 5 minute hop interval assigns each row to 3 different windows of 15 minute size, which are evaluated in an interval of 5 minutes. Hopping windows can be defined on event-time (stream + batch) or processing-time (stream).</td>
+    </tr>
+    <tr>
+      <td><code>SESSION(time_attr, interval)</code></td>
+      <td>Defines a session time window. Session time windows do not have a fixed duration but their bounds are defined by a time interval of inactivity, i.e., a session window is closed if no event appears for a defined gap period. For example a session window with a 30 minute gap starts when a row is observed after 30 minutes inactivity (otherwise the row would be added to an existing window) and is closed if no row is added within 30 minutes. Session windows can work on event-time (stream + batch) or processing-time (stream).</td>
+    </tr>
+  </tbody>
+</table>
+
+### Time Attributes
+
+For SQL queries on streaming tables, the `time_attr` argument of the group window function must refer to a valid time attribute that specifies the processing time or event time of rows. See the [documentation of time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) to learn how to define time attributes.
+
+For SQL on batch tables, the `time_attr` argument of the group window function must be an attribute of type `TIMESTAMP`.
+
+### Selecting Group Window Start and End Timestamps
+
+The start and end timestamps of group windows as well as time attributes can be selected with the following auxiliary functions:
+
+<table class="table table-bordered">
+  <thead>
+    <tr>
+      <th class="text-left">Auxiliary Function</th>
+      <th class="text-left">Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        <code>TUMBLE_START(time_attr, interval)</code><br/>
+        <code>HOP_START(time_attr, interval, interval)</code><br/>
+        <code>SESSION_START(time_attr, interval)</code>
+      </td>
+      <td>
+        <p>Returns the timestamp of the inclusive lower bound of the corresponding tumbling, hopping, or session window.</p>
+      </td>
+    </tr>
+    <tr>
+      <td>
+        <code>TUMBLE_END(time_attr, interval)</code><br/>
+        <code>HOP_END(time_attr, interval, interval)</code><br/>
+        <code>SESSION_END(time_attr, interval)</code>
+      </td>
+      <td>
+        <p>Returns the timestamp of the exclusive upper bound of the corresponding tumbling, hopping, or session window.</p>
+        <p>Note: The exclusive upper bound timestamp cannot be used as a <a href="{{< ref "docs/dev/table/concepts/time_attributes" >}}">rowtime attribute</a> in subsequent time-based operations, such as <a href="{{< ref "docs/dev/table/sql/queries/joins" >}}#interval-joins">interval joins</a> and <a href="{{< ref "docs/dev/table/sql/queries/window-agg" >}}">group window</a> or <a href="{{< ref "docs/dev/table/sql/queries/over-agg" >}}">over window aggregations</a>.</p>
+      </td>
+    </tr>
+    <tr>
+      <td>
+        <code>TUMBLE_ROWTIME(time_attr, interval)</code><br/>
+        <code>HOP_ROWTIME(time_attr, interval, interval)</code><br/>
+        <code>SESSION_ROWTIME(time_attr, interval)</code>
+      </td>
+      <td>
+        <p>Returns the timestamp of the inclusive upper bound of the corresponding tumbling, hopping, or session window.</p>
+        <p>The resulting attribute is a <a href="{{< ref "docs/dev/table/concepts/time_attributes" >}}">rowtime attribute</a> that can be used in subsequent time-based operations such as <a href="{{< ref "docs/dev/table/sql/queries/joins" >}}#interval-joins">interval joins</a> and <a href="{{< ref "docs/dev/table/sql/queries/window-agg" >}}">group window</a> or <a href="{{< ref "docs/dev/table/sql/queries/over-agg" >}}">over window aggregations</a>.</p>
+      </td>
+    </tr>
+    <tr>
+      <td>
+        <code>TUMBLE_PROCTIME(time_attr, interval)</code><br/>
+        <code>HOP_PROCTIME(time_attr, interval, interval)</code><br/>
+        <code>SESSION_PROCTIME(time_attr, interval)</code>
+      </td>
+      <td>
+        <p>Returns a <a href="{{< ref "docs/dev/table/concepts/time_attributes" >}}#processing-time">proctime attribute</a> that can be used in subsequent time-based operations such as <a href="{{< ref "docs/dev/table/sql/queries/joins" >}}#interval-joins">interval joins</a> and <a href="{{< ref "docs/dev/table/sql/queries/window-agg" >}}">group window</a> or <a href="{{< ref "docs/dev/table/sql/queries/over-agg" >}}">over window aggregations</a>.</p>
+      </td>
+    </tr>
+  </tbody>
+</table>
    + +*Note:* Auxiliary functions must be called with exactly same arguments as the group window function in the `GROUP BY` clause. + +The following examples show how to specify SQL queries with group windows on streaming tables. + +```sql +CREATE TABLE Orders ( + user BIGINT, + product STIRNG, + amount INT, + order_time TIMESTAMP(3), + WATERMARK FOR order_time AS order_time - INTERVAL '1' MINUTE +) WITH (...); + +SELECT + user, + TUMBLE_START(order_time, INTERVAL '1' DAY) AS wStart, + SUM(amount) FROM Orders +GROUP BY + TUMBLE(order_time, INTERVAL '1' DAY), + user +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/window-topn.md b/docs/content.zh/docs/dev/table/sql/queries/window-topn.md new file mode 100644 index 0000000000000..9507524fc207c --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/window-topn.md @@ -0,0 +1,110 @@ +--- +title: "窗口 Top-N" +weight: 15 +type: docs +--- + + +# Window Top-N +{{< label Streaming >}} + +Window Top-N is a special [Top-N]({{< ref "docs/dev/table/sql/queries/topn" >}}) which returns the N smallest or largest values for each window and other partitioned keys. + +For streaming queries, unlike regular Top-N on continuous tables, window Top-N does not emit intermediate results but only a final result, the total top N records at the end of the window. Moreover, window Top-N purges all intermediate state when no longer needed. +Therefore, window Top-N queries have better performance if users don't need results updated per record. Usually, Window Top-N is used with [Window Aggregation]({{< ref "docs/dev/table/sql/queries/window-agg" >}}) together. + +Window Top-N can be defined in the same syntax as regular Top-N, see [Top-N documentation]({{< ref "docs/dev/table/sql/queries/topn" >}}) for more information. +Besides that, Window Top-N requires the `PARTITION BY` clause contains `window_start` and `window_end` columns of the relation applied [Windowing TVF]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}) or [Window Aggregation]({{< ref "docs/dev/table/sql/queries/window-agg" >}}). +Otherwise, the optimizer won’t be able to translate the query. + + +The following shows the syntax of the Window Top-N statement: + +```sql +SELECT [column_list] +FROM ( + SELECT [column_list], + ROW_NUMBER() OVER (PARTITION BY window_start, window_end [, col_key1...] + ORDER BY col1 [asc|desc][, col2 [asc|desc]...]) AS rownum + FROM table_name) -- relation applied windowing TVF +WHERE rownum <= N [AND conditions] +``` + +## Example + +The following example shows how to calculate Top 3 suppliers who have the highest sales for every tumbling 10 minutes window. + +```sql +-- tables must have time attribute, e.g. 
`bidtime` in this table +Flink SQL> desc Bid; ++-------------+------------------------+------+-----+--------+---------------------------------+ +| name | type | null | key | extras | watermark | ++-------------+------------------------+------+-----+--------+---------------------------------+ +| bidtime | TIMESTAMP(3) *ROWTIME* | true | | | `bidtime` - INTERVAL '1' SECOND | +| price | DECIMAL(10, 2) | true | | | | +| item | STRING | true | | | | +| supplier_id | STRING | true | | | | ++-------------+------------------------+------+-----+--------+---------------------------------+ + +Flink SQL> SELECT * FROM Bid; ++------------------+-------+------+-------------+ +| bidtime | price | item | supplier_id | ++------------------+-------+------+-------------+ +| 2020-04-15 08:05 | 4.00 | A | supplier1 | +| 2020-04-15 08:06 | 4.00 | C | supplier2 | +| 2020-04-15 08:07 | 2.00 | G | supplier1 | +| 2020-04-15 08:08 | 2.00 | B | supplier3 | +| 2020-04-15 08:09 | 5.00 | D | supplier4 | +| 2020-04-15 08:11 | 2.00 | B | supplier3 | +| 2020-04-15 08:13 | 1.00 | E | supplier1 | +| 2020-04-15 08:15 | 3.00 | H | supplier2 | +| 2020-04-15 08:17 | 6.00 | F | supplier5 | ++------------------+-------+------+-------------+ + +Flink SQL> SELECT * + FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY window_start, window_end ORDER BY price DESC) as rownum + FROM ( + SELECT window_start, window_end, supplier_id, SUM(price) as price, COUNT(*) as cnt + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end, supplier_id + ) + ) WHERE rownum <= 3; ++------------------+------------------+-------------+-------+-----+--------+ +| window_start | window_end | supplier_id | price | cnt | rownum | ++------------------+------------------+-------------+-------+-----+--------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | supplier1 | 6.00 | 2 | 1 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | supplier4 | 5.00 | 1 | 2 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | supplier2 | 4.00 | 1 | 3 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | supplier5 | 6.00 | 1 | 1 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | supplier2 | 3.00 | 1 | 2 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | supplier3 | 2.00 | 1 | 3 | ++------------------+------------------+-------------+-------+-----+--------+ +``` + +*Note: in order to better understand the behavior of windowing, we simplify the displaying of timestamp values to not show the trailing zeros, e.g. `2020-04-15 08:05` should be displayed as `2020-04-15 08:05:00.000` in Flink SQL Client if the type is `TIMESTAMP(3)`.* + +## Limitation + +Currently, Flink only supports Window Top-N which follows after [Window Aggregation]({{< ref "docs/dev/table/sql/queries/window-agg" >}}). Window Top-N after [Windowing TVF]({{< ref "docs/dev/table/sql/queries/window-tvf" >}}) will be support in the near future. + + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/window-tvf.md b/docs/content.zh/docs/dev/table/sql/queries/window-tvf.md new file mode 100644 index 0000000000000..3738e58f1d31a --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/window-tvf.md @@ -0,0 +1,285 @@ +--- +title: "窗口函数" +weight: 6 +type: docs +--- + + +# Windowing table-valued functions (Windowing TVFs) + +{{< label Streaming >}} + +Windows are at the heart of processing infinite streams. Windows split the stream into “buckets” of finite size, over which we can apply computations. 
This document focuses on how windowing is performed in Flink SQL and how the programmer can benefit to the maximum from its offered functionality. + +Apache Flink provides several window table-valued functions (TVF) to divide the elements of your table into windows, including: + +- [Tumble Windows](#tumble) +- [Hop Windows](#hop) +- [Cumulate Windows](#cumulate) +- Session Windows (will be supported soon) + +Note that each element can logically belong to more than one window, depending on the windowing table-valued function you use. For example, HOP windowing creates overlapping windows wherein a single element can be assigned to multiple windows. + +Windowing TVFs are Flink defined Polymorphic Table Functions (abbreviated PTF). PTF is part of the SQL 2016 standard, a special table-function, but can have a table as a parameter. PTF is a powerful feature to change the shape of a table. Because PTFs are used semantically like tables, their invocation occurs in a `FROM` clause of a `SELECT` statement. + +Windowing TVFs is a replacement of legacy [Grouped Window Functions]({{< ref "docs/dev/table/sql/queries/window-agg" >}}#group-window-aggregation-deprecated). Windowing TVFs is more SQL standard compliant and more powerful to support complex window-based computations, e.g. Window TopN, Window Join. However, [Grouped Window Functions]({{< ref "docs/dev/table/sql/queries/window-agg" >}}#group-window-aggregation) can only support Window Aggregation. + +See more how to apply further computations based on windowing TVF: +- [Window Aggregation]({{< ref "docs/dev/table/sql/queries/window-agg" >}}) +- [Window TopN]({{< ref "docs/dev/table/sql/queries/window-topn">}}) +- Window Join (will be supported soon) + +## Window Functions + +Apache Flink provides 3 built-in windowing TVFs: TUMBLE, `HOP` and `CUMULATE`. The return value of windowing TVF is a new relation that includes all columns of original relation as well as additional 3 columns named "window_start", "window_end", "window_time" to indicate the assigned window. The "window_time" field is a [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) of the window after windowing TVF which can be used in subsequent time-based operations, e.g. another windowing TVF, or }}#interval-joins">interval joins, }}">over aggregations. The value of `window_time` always equal to `window_end - 1ms`. + +### TUMBLE + +The `TUMBLE` function assigns each element to a window of specified window size. Tumbling windows have a fixed size and do not overlap. For example, suppose you specify a tumbling window with a size of 5 minutes. In that case, Flink will evaluate the current window, and a new window started every five minutes, as illustrated by the following figure. + +{{< img src="/fig/tumbling-windows.svg" alt="Tumbling Windows" width="70%">}} + +The `TUMBLE` function assigns a window for each row of a relation based on a [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column. The return value of `TUMBLE` is a new relation that includes all columns of original relation as well as additional 3 columns named "window_start", "window_end", "window_time" to indicate the assigned window. The original time attribute "timecol" will be a regular timestamp column after window TVF. + +`TUMBLE` function takes three required parameters: + +```sql +TUMBLE(TABLE data, DESCRIPTOR(timecol), size) +``` + +- `data`: is a table parameter that can be any relation with a time attribute column. 
+- `timecol`: is a column descriptor indicating which [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column of data should be mapped to tumbling windows. +- `size`: is a duration specifying the width of the tumbling windows. + +Here is an example invocation on the `Bid` table: + +```sql +-- tables must have time attribute, e.g. `bidtime` in this table +Flink SQL> desc Bid; ++-------------+------------------------+------+-----+--------+---------------------------------+ +| name | type | null | key | extras | watermark | ++-------------+------------------------+------+-----+--------+---------------------------------+ +| bidtime | TIMESTAMP(3) *ROWTIME* | true | | | `bidtime` - INTERVAL '1' SECOND | +| price | DECIMAL(10, 2) | true | | | | +| item | STRING | true | | | | ++-------------+------------------------+------+-----+--------+---------------------------------+ + +Flink SQL> SELECT * FROM Bid; ++------------------+-------+------+ +| bidtime | price | item | ++------------------+-------+------+ +| 2020-04-15 08:05 | 4.00 | C | +| 2020-04-15 08:07 | 2.00 | A | +| 2020-04-15 08:09 | 5.00 | D | +| 2020-04-15 08:11 | 3.00 | B | +| 2020-04-15 08:13 | 1.00 | E | +| 2020-04-15 08:17 | 6.00 | F | ++------------------+-------+------+ + +-- NOTE: Currently Flink doesn't support evaluating individual window table-valued function, +-- window table-valued function should be used with aggregate operation, +-- this example is just used for explaining the syntax and the data produced by table-valued function. +Flink SQL> SELECT * FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)); +-- or with the named params +-- note: the DATA param must be the first +Flink SQL> SELECT * FROM TABLE( + TUMBLE( + DATA => TABLE Bid, + TIMECOL => DESCRIPTOR(bidtime), + SIZE => INTERVAL '10' MINUTES)); ++------------------+-------+------+------------------+------------------+-------------------------+ +| bidtime | price | item | window_start | window_end | window_time | ++------------------+-------+------+------------------+------------------+-------------------------+ +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:07 | 2.00 | A | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:09 | 5.00 | D | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:17 | 6.00 | F | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | ++------------------+-------+------+------------------+------------------+-------------------------+ + +-- apply aggregation on the tumbling windowed table +Flink SQL> SELECT window_start, window_end, SUM(price) + FROM TABLE( + TUMBLE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | ++------------------+------------------+-------+ +``` + +*Note: in order to better understand the behavior of windowing, we simplify the displaying of timestamp values to not show the trailing zeros, e.g. 
`2020-04-15 08:05` should be displayed as `2020-04-15 08:05:00.000` in Flink SQL Client if the type is `TIMESTAMP(3)`.* + + +### HOP + +The `HOP` function assigns elements to windows of fixed length. Like a `TUMBLE` windowing function, the size of the windows is configured by the window size parameter. An additional window slide parameter controls how frequently a hopping window is started. Hence, hopping windows can be overlapping if the slide is smaller than the window size. In this case, elements are assigned to multiple windows. Hopping windows are also known as "sliding windows". + +For example, you could have windows of size 10 minutes that slides by 5 minutes. With this, you get every 5 minutes a window that contains the events that arrived during the last 10 minutes, as depicted by the following figure. + +{{< img src="/fig/sliding-windows.svg" alt="Hopping windows" width="70%">}} + +The `HOP` function assigns windows that cover rows within the interval of size and shifting every slide based on a [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column. The return value of `HOP` is a new relation that includes all columns of original relation as well as additional 3 columns named "window_start", "window_end", "window_time" to indicate the assigned window. The original time attribute "timecol" will be a regular timestamp column after windowing TVF. + +`HOP` takes three required parameters. + +```sql +HOP(TABLE data, DESCRIPTOR(timecol), slide, size [, offset ]) +``` + +- `data`: is a table parameter that can be any relation with an time attribute column. +- `timecol`: is a column descriptor indicating which [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column of data should be mapped to hopping windows. +- `slide`: is a duration specifying the duration between the start of sequential hopping windows +- `size`: is a duration specifying the width of the hopping windows. + +Here is an example invocation on the `Bid` table: + +```sql +-- NOTE: Currently Flink doesn't support evaluating individual window table-valued function, +-- window table-valued function should be used with aggregate operation, +-- this example is just used for explaining the syntax and the data produced by table-valued function. 
+> SELECT * FROM TABLE( + HOP(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES, INTERVAL '10' MINUTES)); +-- or with the named params +-- note: the DATA param must be the first +> SELECT * FROM TABLE( + HOP( + DATA => TABLE Bid, + TIMECOL => DESCRIPTOR(bidtime), + SLIDE => INTERVAL '5' MINUTES, + SIZE => INTERVAL '10' MINUTES)); ++------------------+-------+------+------------------+------------------+-------------------------+ +| bidtime | price | item | window_start | window_end | window_time | ++------------------+-------+------+------------------+------------------+-------------------------+ +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:05 | 2020-04-15 08:15 | 2020-04-15 08:14:59.999 | +| 2020-04-15 08:07 | 2.00 | A | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:07 | 2.00 | A | 2020-04-15 08:05 | 2020-04-15 08:15 | 2020-04-15 08:14:59.999 | +| 2020-04-15 08:09 | 5.00 | D | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:09 | 5.00 | D | 2020-04-15 08:05 | 2020-04-15 08:15 | 2020-04-15 08:14:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:05 | 2020-04-15 08:15 | 2020-04-15 08:14:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:05 | 2020-04-15 08:15 | 2020-04-15 08:14:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:17 | 6.00 | F | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:17 | 6.00 | F | 2020-04-15 08:15 | 2020-04-15 08:25 | 2020-04-15 08:24:59.999 | ++------------------+-------+------+------------------+------------------+-------------------------+ + +-- apply aggregation on the hopping windowed table +> SELECT window_start, window_end, SUM(price) + FROM TABLE( + HOP(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '5' MINUTES, INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:05 | 2020-04-15 08:15 | 15.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | +| 2020-04-15 08:15 | 2020-04-15 08:25 | 6.00 | ++------------------+------------------+-------+ +``` + +### CUMULATE + +Cumulating windows are very useful in some scenarios, such as tumbling windows with early firing in a fixed window interval. For example, a daily dashboard draws cumulative UVs from 00:00 to every minute, the UV at 10:00 represents the total number of UV from 00:00 to 10:00. This can be easily and efficiently implemented by CUMULATE windowing. + +The `CUMULATE` function assigns elements to windows that cover rows within an initial interval of step size and expand to one more step size (keep window start fixed) every step until the max window size. +You can think `CUMULATE` function as applying `TUMBLE` windowing with max window size first, and split each tumbling windows into several windows with same window start and window ends of step-size difference. So cumulating windows do overlap and don't have a fixed size. + +For example, you could have a cumulating window for 1 hour step and 1 day max size, and you will get windows: `[00:00, 01:00)`, `[00:00, 02:00)`, `[00:00, 03:00)`, ..., `[00:00, 24:00)` for every day. 
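+
+As a rough sketch of the daily dashboard scenario above, the following query computes the cumulative number of distinct users per day in one-hour steps. The `UserClicks` table with its `user_id` column and `click_time` time attribute is a hypothetical source used only for illustration.
+
+```sql
+-- hypothetical source table: UserClicks(user_id STRING, click_time TIMESTAMP(3) with a watermark)
+SELECT window_start, window_end, COUNT(DISTINCT user_id) AS uv
+  FROM TABLE(
+    CUMULATE(TABLE UserClicks, DESCRIPTOR(click_time), INTERVAL '1' HOUR, INTERVAL '1' DAY))
+  GROUP BY window_start, window_end;
+```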
+ +{{< img src="/fig/cumulating-windows.png" alt="Cumulating Windows" width="70%">}} + +The `CUMULATE` functions assigns windows based on a [time attribute]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column. The return value of `CUMULATE` is a new relation that includes all columns of original relation as well as additional 3 columns named "window_start", "window_end", "window_time" to indicate the assigned window. The original time attribute "timecol" will be a regular timestamp column after window TVF. + +`CUMULATE` takes three required parameters. + +```sql +CUMULATE(TABLE data, DESCRIPTOR(timecol), step, size) +``` + +- `data`: is a table parameter that can be any relation with an time attribute column. +- `timecol`: is a column descriptor indicating which [time attributes]({{< ref "docs/dev/table/concepts/time_attributes" >}}) column of data should be mapped to tumbling windows. +- `step`: is a duration specifying the increased window size between the end of sequential cumulating windows. +- `size`: is a duration specifying the max width of the cumulating windows. size must be an integral multiple of step . + +Here is an example invocation on the Bid table: + +```sql +-- NOTE: Currently Flink doesn't support evaluating individual window table-valued function, +-- window table-valued function should be used with aggregate operation, +-- this example is just used for explaining the syntax and the data produced by table-valued function. +> SELECT * FROM TABLE( + CUMULATE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '2' MINUTES, INTERVAL '10' MINUTES)); +-- or with the named params +-- note: the DATA param must be the first +> SELECT * FROM TABLE( + CUMULATE( + DATA => TABLE Bid, + TIMECOL => DESCRIPTOR(bidtime), + STEP => INTERVAL '2' MINUTES, + SIZE => INTERVAL '10' MINUTES)); ++------------------+-------+------+------------------+------------------+-------------------------+ +| bidtime | price | item | window_start | window_end | window_time | ++------------------+-------+------+------------------+------------------+-------------------------+ +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:00 | 2020-04-15 08:06 | 2020-04-15 08:05:59.999 | +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:00 | 2020-04-15 08:08 | 2020-04-15 08:07:59.999 | +| 2020-04-15 08:05 | 4.00 | C | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:07 | 2.00 | A | 2020-04-15 08:00 | 2020-04-15 08:08 | 2020-04-15 08:07:59.999 | +| 2020-04-15 08:07 | 2.00 | A | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:09 | 5.00 | D | 2020-04-15 08:00 | 2020-04-15 08:10 | 2020-04-15 08:09:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:12 | 2020-04-15 08:11:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:14 | 2020-04-15 08:13:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:16 | 2020-04-15 08:15:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:18 | 2020-04-15 08:17:59.999 | +| 2020-04-15 08:11 | 3.00 | B | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:14 | 2020-04-15 08:13:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:16 | 2020-04-15 08:15:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:18 | 2020-04-15 08:17:59.999 | +| 2020-04-15 08:13 | 1.00 | E | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | +| 
2020-04-15 08:17 | 6.00 | F | 2020-04-15 08:10 | 2020-04-15 08:18 | 2020-04-15 08:17:59.999 | +| 2020-04-15 08:17 | 6.00 | F | 2020-04-15 08:10 | 2020-04-15 08:20 | 2020-04-15 08:19:59.999 | ++------------------+-------+------+------------------+------------------+-------------------------+ + +-- apply aggregation on the cumulating windowed table +> SELECT window_start, window_end, SUM(price) + FROM TABLE( + CUMULATE(TABLE Bid, DESCRIPTOR(bidtime), INTERVAL '2' MINUTES, INTERVAL '10' MINUTES)) + GROUP BY window_start, window_end; ++------------------+------------------+-------+ +| window_start | window_end | price | ++------------------+------------------+-------+ +| 2020-04-15 08:00 | 2020-04-15 08:06 | 4.00 | +| 2020-04-15 08:00 | 2020-04-15 08:08 | 6.00 | +| 2020-04-15 08:00 | 2020-04-15 08:10 | 11.00 | +| 2020-04-15 08:10 | 2020-04-15 08:12 | 3.00 | +| 2020-04-15 08:10 | 2020-04-15 08:14 | 4.00 | +| 2020-04-15 08:10 | 2020-04-15 08:16 | 4.00 | +| 2020-04-15 08:10 | 2020-04-15 08:18 | 10.00 | +| 2020-04-15 08:10 | 2020-04-15 08:20 | 10.00 | ++------------------+------------------+-------+ +``` + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/queries/with.md b/docs/content.zh/docs/dev/table/sql/queries/with.md new file mode 100644 index 0000000000000..5e3a6a3f69720 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/queries/with.md @@ -0,0 +1,53 @@ +--- +title: "WITH 语句" +weight: 3 +type: docs +--- + + +# WITH 语句 +{{< label Batch >}} {{< label Streaming >}} + +`WITH` provides a way to write auxiliary statements for use in a larger query. These statements, which are often referred to as Common Table Expression (CTE), can be thought of as defining temporary views that exist just for one query. + +The syntax of `WITH` statement is: + +```sql +WITH [ , ... ] +SELECT ... FROM ...; + +: + with_item_name (column_name[, ...n]) AS ( ) +``` + +The following example defines a common table expression `orders_with_total` and use it in a `GROUP BY` query. + +```sql +WITH orders_with_total AS ( + SELECT order_id, price + tax AS total + FROM Orders +) +SELECT order_id, SUM(total) +FROM orders_with_total +GROUP BY order_id; +``` + + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/reset.md b/docs/content.zh/docs/dev/table/sql/reset.md new file mode 100644 index 0000000000000..24c856f93477f --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/reset.md @@ -0,0 +1,63 @@ +--- +title: "RESET 语句" +weight: 15 +type: docs +aliases: + - /dev/table/sql/reset.html +--- + + +# RESET Statements + +`RESET` statements are used to reset the configuration to the default. + +## Run a RESET statement + +{{< tabs "reset statement" >}} +{{< tab "SQL CLI" >}} + +`RESET` statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run a `RESET` statement in SQL CLI. + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "reset" >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> RESET table.planner; +[INFO] Session property has been reset. + +Flink SQL> RESET; +[INFO] All session properties have been set to their default values. +``` +{{< /tab >}} +{{< /tabs >}} + +## Syntax + +```sql +RESET (key)? +``` + +If no key is specified, it reset all the properties to the default. Otherwise, reset the specified key to the default. 
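+
+For example, assuming the `sql-client.execution.result-mode` property was changed earlier in the session, the following sketch first resets that single property and then resets the whole session:
+
+```sql
+-- reset one property to its default value (assumes it was previously changed with SET)
+RESET sql-client.execution.result-mode;
+
+-- reset all session properties to their default values
+RESET;
+```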
+ +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/set.md b/docs/content.zh/docs/dev/table/sql/set.md new file mode 100644 index 0000000000000..83c79e9d93321 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/set.md @@ -0,0 +1,63 @@ +--- +title: "SET 语句" +weight: 14 +type: docs +aliases: + - /dev/table/sql/set.html +--- + + +# SET Statements + +`SET` statements are used to modify the configuration or list the configuration. + +## Run a SET statement + +{{< tabs "set statement" >}} +{{< tab "SQL CLI" >}} + +`SET` statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run a `SET` statement in SQL CLI. + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "set" >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> SET table.planner = blink; +[INFO] Session property has been set. + +Flink SQL> SET; +table.planner=blink; +``` +{{< /tab >}} +{{< /tabs >}} + +## Syntax + +```sql +SET (key = value)? +``` + +If no key and value are specified, it just print all the properties. Otherwise, set the key with specified value. + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/show.md b/docs/content.zh/docs/dev/table/sql/show.md new file mode 100644 index 0000000000000..0297cdd0b0162 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/show.md @@ -0,0 +1,502 @@ +--- +title: "SHOW 语句" +weight: 11 +type: docs +aliases: + - /zh/dev/table/sql/show.html +--- + + +# SHOW 语句 + + + +SHOW 语句用于列出所有的 catalog,或者列出当前 catalog 中所有的 database,或者列出当前 catalog 和当前 database 的所有表或视图,或者列出当前正在使用的 catalog 和 database, 或者列出创建指定表的语句,或者列出当前 catalog 和当前 database 中所有的 function,包括:系统 function 和用户定义的 function,或者仅仅列出当前 catalog 和当前 database 中用户定义的 function,或者列出当前环境所有激活的 module,或者列出当前环境所有加载的 module 及激活状态。 + +目前 Flink SQL 支持下列 SHOW 语句: +- SHOW CATALOGS +- SHOW CURRENT CATALOG +- SHOW DATABASES +- SHOW CURRENT DATABASE +- SHOW TABLES +- SHOW CREATE TABLE +- SHOW VIEWS +- SHOW FUNCTIONS +- SHOW MODULES +- SHOW FULL MODULES + + +## 执行 SHOW 语句 + +{{< tabs "execute" >}} +{{< tab "Java" >}} +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 SHOW 语句。 若 SHOW 操作执行成功,`executeSql()` 方法返回所有对象,否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 SHOW 语句。 + +{{< /tab >}} +{{< tab "Scala" >}} +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 SHOW 语句。 若 SHOW 操作执行成功,`executeSql()` 方法返回所有对象,否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 SHOW 语句。 +{{< /tab >}} +{{< tab "Python" >}} + +可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行 SHOW 语句。 若 SHOW 操作执行成功,`execute_sql()` 方法返回所有对象,否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 SHOW 语句。 + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +可以在 [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}) 中执行 SHOW 语句。 + +以下的例子展示了如何在 SQL CLI 中执行一个 SHOW 语句。 + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "bc804bed-4550-4f60-a8c0-17e6d741e08d" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + +// show catalogs +tEnv.executeSql("SHOW CATALOGS").print(); +// +-----------------+ +// | catalog name | +// +-----------------+ +// | default_catalog | +// +-----------------+ + +// show current catalog +tEnv.executeSql("SHOW CURRENT CATALOG").print(); +// +----------------------+ +// | current catalog name | +// +----------------------+ +// | default_catalog | +// +----------------------+ + +// show databases +tEnv.executeSql("SHOW DATABASES").print(); +// +------------------+ +// | database name | +// +------------------+ +// | default_database | 
+// +------------------+ + +// show current database +tEnv.executeSql("SHOW CURRENT DATABASE").print(); +// +-----------------------+ +// | current database name | +// +-----------------------+ +// | default_database | +// +-----------------------+ + +// create a table +tEnv.executeSql("CREATE TABLE my_table (...) WITH (...)"); +// show tables +tEnv.executeSql("SHOW TABLES").print(); +// +------------+ +// | table name | +// +------------+ +// | my_table | +// +------------+ + +// show create table +tEnv.executeSql("SHOW CREATE TABLE my_table").print(); +// CREATE TABLE `default_catalog`.`default_db`.`my_table` ( +// ... +// ) WITH ( +// ... +// ) + + +// create a view +tEnv.executeSql("CREATE VIEW my_view AS ..."); +// show views +tEnv.executeSql("SHOW VIEWS").print(); +// +-----------+ +// | view name | +// +-----------+ +// | my_view | +// +-----------+ + +// show functions +tEnv.executeSql("SHOW FUNCTIONS").print(); +// +---------------+ +// | function name | +// +---------------+ +// | mod | +// | sha256 | +// | ... | +// +---------------+ + +// create a user defined function +tEnv.executeSql("CREATE FUNCTION f1 AS ..."); +// show user defined functions +tEnv.executeSql("SHOW USER FUNCTIONS").print(); +// +---------------+ +// | function name | +// +---------------+ +// | f1 | +// | ... | +// +---------------+ + +// show modules +tEnv.executeSql("SHOW MODULES").print(); +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ + +// show full modules +tEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | core | true | +// | hive | false | +// +-------------+-------+ + +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +val tEnv = StreamTableEnvironment.create(env) + +// show catalogs +tEnv.executeSql("SHOW CATALOGS").print() +// +-----------------+ +// | catalog name | +// +-----------------+ +// | default_catalog | +// +-----------------+ + +// show databases +tEnv.executeSql("SHOW DATABASES").print() +// +------------------+ +// | database name | +// +------------------+ +// | default_database | +// +------------------+ + +// create a table +tEnv.executeSql("CREATE TABLE my_table (...) WITH (...)") +// show tables +tEnv.executeSql("SHOW TABLES").print() +// +------------+ +// | table name | +// +------------+ +// | my_table | +// +------------+ + +// show create table +tEnv.executeSql("SHOW CREATE TABLE my_table").print() +// CREATE TABLE `default_catalog`.`default_db`.`my_table` ( +// ... +// ) WITH ( +// ... +// ) +// create a view +tEnv.executeSql("CREATE VIEW my_view AS ...") +// show views +tEnv.executeSql("SHOW VIEWS").print() +// +-----------+ +// | view name | +// +-----------+ +// | my_view | +// +-----------+ + +// show functions +tEnv.executeSql("SHOW FUNCTIONS").print() +// +---------------+ +// | function name | +// +---------------+ +// | mod | +// | sha256 | +// | ... | +// +---------------+ + +// create a user defined function +tEnv.executeSql("CREATE FUNCTION f1 AS ...") +// show user defined functions +tEnv.executeSql("SHOW USER FUNCTIONS").print() +// +---------------+ +// | function name | +// +---------------+ +// | f1 | +// | ... 
| +// +---------------+ + +// show modules +tEnv.executeSql("SHOW MODULES").print() +// +-------------+ +// | module name | +// +-------------+ +// | core | +// +-------------+ + +// show full modules +tEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | core | true | +// | hive | false | +// +-------------+-------+ + +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = StreamTableEnvironment.create(env, settings) + +# show catalogs +table_env.execute_sql("SHOW CATALOGS").print() +# +-----------------+ +# | catalog name | +# +-----------------+ +# | default_catalog | +# +-----------------+ + +# show databases +table_env.execute_sql("SHOW DATABASES").print() +# +------------------+ +# | database name | +# +------------------+ +# | default_database | +# +------------------+ + +# create a table +table_env.execute_sql("CREATE TABLE my_table (...) WITH (...)") +# show tables +table_env.execute_sql("SHOW TABLES").print() +# +------------+ +# | table name | +# +------------+ +# | my_table | +# +------------+ +# show create table +table_env.executeSql("SHOW CREATE TABLE my_table").print() +# CREATE TABLE `default_catalog`.`default_db`.`my_table` ( +# ... +# ) WITH ( +# ... +# ) + +# create a view +table_env.execute_sql("CREATE VIEW my_view AS ...") +# show views +table_env.execute_sql("SHOW VIEWS").print() +# +-----------+ +# | view name | +# +-----------+ +# | my_view | +# +-----------+ + +# show functions +table_env.execute_sql("SHOW FUNCTIONS").print() +# +---------------+ +# | function name | +# +---------------+ +# | mod | +# | sha256 | +# | ... | +# +---------------+ + +# create a user defined function +table_env.execute_sql("CREATE FUNCTION f1 AS ...") +# show user defined functions +table_env.execute_sql("SHOW USER FUNCTIONS").print() +# +---------------+ +# | function name | +# +---------------+ +# | f1 | +# | ... | +# +---------------+ + +# show modules +table_env.execute_sql("SHOW MODULES").print() +# +-------------+ +# | module name | +# +-------------+ +# | core | +# +-------------+ + +# show full modules +table_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | core | true | +# | hive | false | +# +-------------+-------+ +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql + +Flink SQL> SHOW CATALOGS; +default_catalog + +Flink SQL> SHOW DATABASES; +default_database + +Flink SQL> CREATE TABLE my_table (...) WITH (...); +[INFO] Table has been created. + +Flink SQL> SHOW TABLES; +my_table + +Flink SQL> SHOW CREATE TABLE my_table; +CREATE TABLE `default_catalog`.`default_db`.`my_table` ( + ... +) WITH ( + ... +) + +Flink SQL> CREATE VIEW my_view AS ...; +[INFO] View has been created. + +Flink SQL> SHOW VIEWS; +my_view + +Flink SQL> SHOW FUNCTIONS; +mod +sha256 +... + +Flink SQL> CREATE FUNCTION f1 AS ...; +[INFO] Function has been created. + +Flink SQL> SHOW USER FUNCTIONS; +f1 +... 
+ +Flink SQL> SHOW MODULES; ++-------------+ +| module name | ++-------------+ +| core | ++-------------+ +1 row in set + + +Flink SQL> SHOW FULL MODULES; ++-------------+------+ +| module name | used | ++-------------+------+ +| core | true | ++-------------+------+ +1 row in set + + +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## SHOW CATALOGS + +```sql +SHOW CATALOGS +``` + +展示所有的 catalog。 + +## SHOW CURRENT CATALOG + +```sql +SHOW CURRENT CATALOG +``` + +显示当前正在使用的 catalog。 + +## SHOW DATABASES + +```sql +SHOW DATABASES +``` + +展示当前 catalog 中所有的 database。 + +## SHOW CURRENT DATABASE + +```sql +SHOW CURRENT DATABASE +``` + +显示当前正在使用的 database。 + +## SHOW TABLES + +```sql +SHOW TABLES +``` + +展示当前 catalog 和当前 database 中所有的表。 + +## SHOW CREATE TABLE + +```sql +SHOW CREATE TABLE [catalog_name.][db_name.]table_name +``` + +展示创建指定表的 create 语句。 + +Attention 目前 `SHOW CREATE TABLE` 只支持通过 Flink SQL DDL 创建的表。 + +## SHOW VIEWS + +```sql +SHOW VIEWS +``` + +展示当前 catalog 和当前 database 中所有的视图。 + +## SHOW FUNCTIONS + +```sql +SHOW [USER] FUNCTIONS +``` + +展示当前 catalog 和当前 database 中所有的 function,包括:系统 function 和用户定义的 function。 + +**USER** +仅仅展示当前 catalog 和当前 database 中用户定义的 function。 + +## SHOW MODULES + +```sql +SHOW [FULL] MODULES +``` + +展示当前环境激活的所有 module。 + +**FULL** +展示当前环境加载的所有 module 及激活状态。 + +{{< top >}} diff --git a/docs/content.zh/docs/dev/table/sql/unload.md b/docs/content.zh/docs/dev/table/sql/unload.md new file mode 100644 index 0000000000000..650049608c0a6 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/unload.md @@ -0,0 +1,115 @@ +--- +title: "UNLOAD 语句" +weight: 13 +type: docs +aliases: + - /zh/dev/table/sql/unload.html +--- + + +# UNLOAD Statements + +UNLOAD statements are used to unload a built-in or user-defined module. + +## Run a UNLOAD statement + +{{< tabs "unload statement" >}} +{{< tab "Java" >}} + +UNLOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise it will throw an exception. + +The following examples show how to run a UNLOAD statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "Scala" >}} + +UNLOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise it will throw an exception. + +The following examples show how to run a UNLOAD statement in `TableEnvironment`. +{{< /tab >}} +{{< tab "Python" >}} + +UNLOAD statements can be executed with the `executeSql()` method of the `TableEnvironment`. The `executeSql()` method returns 'OK' for a successful LOAD operation; otherwise it will throw an exception. + +The following examples show how to run a UNLOAD statement in `TableEnvironment`. + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +UNLOAD statements can be executed in [SQL CLI]({{< ref "docs/dev/table/sqlClient" >}}). + +The following examples show how to run a UNLOAD statement in SQL CLI. 
+ +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "unload modules" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + +// unload a core module +tEnv.executeSql("UNLOAD MODULE core"); +tEnv.executeSql("SHOW MODULES").print(); +// Empty set +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +val tEnv = StreamTableEnvironment.create(env) + +// unload a core module +tEnv.executeSql("UNLOAD MODULE core") +tEnv.executeSql("SHOW MODULES").print() +// Empty set +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = StreamTableEnvironment.create(env, settings) + +# unload a core module +table_env.execute_sql("UNLOAD MODULE core") +table_env.execute_sql("SHOW MODULES").print() +# Empty set +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> UNLOAD MODULE core; +[INFO] Unload module succeeded! + +Flink SQL> SHOW MODULES; +Empty set +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## UNLOAD MODULE + +The following grammar gives an overview of the available syntax: +```sql +UNLOAD MODULE module_name +``` diff --git a/docs/content.zh/docs/dev/table/sql/use.md b/docs/content.zh/docs/dev/table/sql/use.md new file mode 100644 index 0000000000000..648eb341e543c --- /dev/null +++ b/docs/content.zh/docs/dev/table/sql/use.md @@ -0,0 +1,267 @@ +--- +title: "USE 语句" +weight: 10 +type: docs +aliases: + - /zh/dev/table/sql/use.html +--- + + +# USE 语句 + + + +USE 语句用来设置当前的 catalog 或者 database。 + +## 运行一个 USE 语句 + +{{< tabs "explain" >}} +{{< tab "Java" >}} + +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 USE 语句。 若 USE 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 USE 语句。 + +{{< /tab >}} +{{< tab "Scala" >}} + +可以使用 `TableEnvironment` 中的 `executeSql()` 方法执行 USE 语句。 若 USE 操作执行成功,`executeSql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 USE 语句。 +{{< /tab >}} +{{< tab "Python" >}} + +可以使用 `TableEnvironment` 中的 `execute_sql()` 方法执行 USE 语句。 若 USE 操作执行成功,`execute_sql()` 方法返回 'OK',否则会抛出异常。 + +以下的例子展示了如何在 `TableEnvironment` 中执行一个 USE 语句。 + +{{< /tab >}} +{{< tab "SQL CLI" >}} + +可以在 SQL CLI 中执行 USE 语句。 + +以下的例子展示了如何在 SQL CLI 中执行一个 USE 语句。 + +{{< /tab >}} +{{< /tabs >}} + +{{< tabs "9c2050ce-b261-4692-9447-b9b6772c5b38" >}} +{{< tab "Java" >}} +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); +StreamTableEnvironment tEnv = StreamTableEnvironment.create(env); + +// create a catalog +tEnv.executeSql("CREATE CATALOG cat1 WITH (...)"); +tEnv.executeSql("SHOW CATALOGS").print(); +// +-----------------+ +// | catalog name | +// +-----------------+ +// | default_catalog | +// | cat1 | +// +-----------------+ + +// change default catalog +tEnv.executeSql("USE CATALOG cat1"); + +tEnv.executeSql("SHOW DATABASES").print(); +// databases are empty +// +---------------+ +// | database name | +// +---------------+ +// +---------------+ + +// create a database +tEnv.executeSql("CREATE DATABASE db1 WITH (...)"); +tEnv.executeSql("SHOW DATABASES").print(); +// +---------------+ +// | database name | +// +---------------+ +// | db1 | +// +---------------+ + +// change default database +tEnv.executeSql("USE db1"); + +// change module resolution order and enabled status +tEnv.executeSql("USE MODULES hive"); +tEnv.executeSql("SHOW FULL MODULES").print(); +// +-------------+-------+ +// 
| module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Scala" >}} +```scala +val env = StreamExecutionEnvironment.getExecutionEnvironment() +val tEnv = StreamTableEnvironment.create(env) + +// create a catalog +tEnv.executeSql("CREATE CATALOG cat1 WITH (...)") +tEnv.executeSql("SHOW CATALOGS").print() +// +-----------------+ +// | catalog name | +// +-----------------+ +// | default_catalog | +// | cat1 | +// +-----------------+ + +// change default catalog +tEnv.executeSql("USE CATALOG cat1") + +tEnv.executeSql("SHOW DATABASES").print() +// databases are empty +// +---------------+ +// | database name | +// +---------------+ +// +---------------+ + +// create a database +tEnv.executeSql("CREATE DATABASE db1 WITH (...)") +tEnv.executeSql("SHOW DATABASES").print() +// +---------------+ +// | database name | +// +---------------+ +// | db1 | +// +---------------+ + +// change default database +tEnv.executeSql("USE db1") + +// change module resolution order and enabled status +tEnv.executeSql("USE MODULES hive") +tEnv.executeSql("SHOW FULL MODULES").print() +// +-------------+-------+ +// | module name | used | +// +-------------+-------+ +// | hive | true | +// | core | false | +// +-------------+-------+ +``` +{{< /tab >}} +{{< tab "Python" >}} +```python +settings = EnvironmentSettings.new_instance()... +table_env = StreamTableEnvironment.create(env, settings) + +# create a catalog +table_env.execute_sql("CREATE CATALOG cat1 WITH (...)") +table_env.execute_sql("SHOW CATALOGS").print() +# +-----------------+ +# | catalog name | +# +-----------------+ +# | default_catalog | +# | cat1 | +# +-----------------+ + +# change default catalog +table_env.execute_sql("USE CATALOG cat1") + +table_env.execute_sql("SHOW DATABASES").print() +# databases are empty +# +---------------+ +# | database name | +# +---------------+ +# +---------------+ + +# create a database +table_env.execute_sql("CREATE DATABASE db1 WITH (...)") +table_env.execute_sql("SHOW DATABASES").print() +# +---------------+ +# | database name | +# +---------------+ +# | db1 | +# +---------------+ + +# change default database +table_env.execute_sql("USE db1") + +# change module resolution order and enabled status +table_env.execute_sql("USE MODULES hive") +table_env.execute_sql("SHOW FULL MODULES").print() +# +-------------+-------+ +# | module name | used | +# +-------------+-------+ +# | hive | true | +# | core | false | +# +-------------+-------+ +``` +{{< /tab >}} +{{< tab "SQL CLI" >}} +```sql +Flink SQL> CREATE CATALOG cat1 WITH (...); +[INFO] Catalog has been created. + +Flink SQL> SHOW CATALOGS; +default_catalog +cat1 + +Flink SQL> USE CATALOG cat1; + +Flink SQL> SHOW DATABASES; + +Flink SQL> CREATE DATABASE db1 WITH (...); +[INFO] Database has been created. + +Flink SQL> SHOW DATABASES; +db1 + +Flink SQL> USE db1; + +Flink SQL> USE MODULES hive; +[INFO] Use modules succeeded! +Flink SQL> SHOW FULL MODULES; ++-------------+-------+ +| module name | used | ++-------------+-------+ +| hive | true | +| core | false | ++-------------+-------+ +2 rows in set +``` +{{< /tab >}} +{{< /tabs >}} + +{{< top >}} + +## USE CATLOAG + +```sql +USE CATALOG catalog_name +``` + +设置当前的 catalog。所有后续命令未显式指定 catalog 的将使用此 catalog。如果指定的的 catalog 不存在,则抛出异常。默认的当前 catalog 是 `default_catalog`。 + +## USE MODULES +```sql +USE MODULES module_name1[, module_name2, ...] +``` +Set the enabled modules with declared order. 
All subsequent commands will resolve metadata(functions/user-defined types/rules, *etc.*) within enabled modules and follow resolution order. A module is used by default when it is loaded. Loaded modules will become disabled if not used by `USE MODULES` statement. The default loaded and enabled module is `core`. + +## USE + +```sql +USE [catalog_name.]database_name +``` + +设置当前的 database。所有后续命令未显式指定 database 的将使用此 database。如果指定的的 database 不存在,则抛出异常。默认的当前 database 是 `default_database`。 diff --git a/docs/content.zh/docs/dev/table/sqlClient.md b/docs/content.zh/docs/dev/table/sqlClient.md new file mode 100644 index 0000000000000..8b57c32c230a0 --- /dev/null +++ b/docs/content.zh/docs/dev/table/sqlClient.md @@ -0,0 +1,708 @@ +--- +title: "SQL 客户端" +weight: 91 +type: docs +aliases: + - /zh/dev/table/sqlClient.html +--- + + +# SQL 客户端 + +Flink 的 Table & SQL API 可以处理 SQL 语言编写的查询语句,但是这些查询需要嵌入用 Java 或 Scala 编写的表程序中。此外,这些程序在提交到集群前需要用构建工具打包。这或多或少限制了 Java/Scala 程序员对 Flink 的使用。 + +*SQL 客户端* 的目的是提供一种简单的方式来编写、调试和提交表程序到 Flink 集群上,而无需写一行 Java 或 Scala 代码。*SQL 客户端命令行界面(CLI)* 能够在命令行中检索和可视化分布式应用中实时产生的结果。 + +{{< img width="80%" src="/fig/sql_client_demo.gif" alt="Animated demo of the Flink SQL Client CLI running table programs on a cluster" >}} + + + +入门 +--------------- + +本节介绍如何在命令行里启动(setup)和运行你的第一个 Flink SQL 程序。 + +SQL 客户端捆绑在常规 Flink 发行版中,因此可以直接运行。它仅需要一个正在运行的 Flink 集群就可以在其中执行表程序。有关设置 Flink 群集的更多信息,请参见[集群和部署]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})部分。如果仅想试用 SQL 客户端,也可以使用以下命令启动本地集群: + +```bash +./bin/start-cluster.sh +``` + +### 启动 SQL 客户端命令行界面 + +SQL Client 脚本也位于 Flink 的 bin 目录中。[将来](#局限与未来),用户可以通过启动嵌入式 standalone 进程或通过连接到远程 SQL 客户端网关来启动 SQL 客户端命令行界面。目前仅支持 `embedded`,模式默认值`embedded`。可以通过以下方式启动 CLI: + +```bash +./bin/sql-client.sh +``` + +或者显式使用 `embedded` 模式: + +```bash +./bin/sql-client.sh embedded +``` + +### 执行 SQL 查询 + +命令行界面启动后,你可以使用 `HELP` 命令列出所有可用的 SQL 语句。输入第一条 SQL 查询语句并按 `Enter` 键执行,可以验证你的设置及集群连接是否正确: + +```sql +SELECT 'Hello World'; +``` + +该查询不需要 table source,并且只产生一行结果。CLI 将从集群中检索结果并将其可视化。按 `Q` 键退出结果视图。 + +CLI 为维护和可视化结果提供**三种模式**。 + +**表格模式**(table mode)在内存中实体化结果,并将结果用规则的分页表格可视化展示出来。执行如下命令启用: + +```text +SET sql-client.execution.result-mode=table; +``` + +**变更日志模式**(changelog mode)不会实体化和可视化结果,而是由插入(`+`)和撤销(`-`)组成的持续查询产生结果流。 + +```text +SET sql-client.execution.result-mode=changelog; +``` + +**Tableau模式**(tableau mode)更接近传统的数据库,会将执行的结果以制表的形式直接打在屏幕之上。具体显示的内容会取决于作业 +执行模式的不同(`execution.type`): + +```text +SET sql-client.execution.result-mode=tableau; +``` + +注意当你使用这个模式运行一个流式查询的时候,Flink 会将结果持续的打印在当前的屏幕之上。如果这个流式查询的输入是有限的数据集, +那么Flink在处理完所有的数据之后,会自动的停止作业,同时屏幕上的打印也会相应的停止。如果你想提前结束这个查询,那么可以直接使用 +`CTRL-C` 按键,这个会停掉作业同时停止屏幕上的打印。 + +你可以用如下查询来查看三种结果模式的运行情况: + +```sql +SELECT name, COUNT(*) AS cnt FROM (VALUES ('Bob'), ('Alice'), ('Greg'), ('Bob')) AS NameTable(name) GROUP BY name; +``` + +此查询执行一个有限字数示例: + +*变更日志模式* 下,看到的结果应该类似于: + +```text ++ Bob, 1 ++ Alice, 1 ++ Greg, 1 +- Bob, 1 ++ Bob, 2 +``` + +*表格模式* 下,可视化结果表将不断更新,直到表程序以如下内容结束: + +```text +Bob, 2 +Alice, 1 +Greg, 1 +``` + +*Tableau模式* 下,如果这个查询以流的方式执行,那么将显示以下内容: +```text ++-----+----------------------+----------------------+ +| +/- | name | cnt | ++-----+----------------------+----------------------+ +| + | Bob | 1 | +| + | Alice | 1 | +| + | Greg | 1 | +| - | Bob | 1 | +| + | Bob | 2 | ++-----+----------------------+----------------------+ +Received a total of 5 rows +``` + +如果这个查询以批的方式执行,显示的内容如下: +```text ++-------+-----+ +| name | cnt | ++-------+-----+ +| Alice | 1 | +| Bob | 2 | +| Greg | 1 | ++-------+-----+ +3 rows in set 
+
+All of these result modes are useful during the prototyping of SQL queries. In all of them, results are stored in the Java heap memory of the SQL Client. To keep the CLI responsive, changelog mode only shows the latest 1000 changes. Table mode allows for navigating through bigger results, which are only limited by the available main memory and the configured [maximum number of rows](#sql-client-execution-max-table-result-rows) (`sql-client.execution.max-table-result.rows`).
+
+Attention: Queries executed in a batch environment can only be retrieved in table or tableau mode.
+
+After a query is defined, it can be submitted to the cluster as a long-running, detached Flink job. The [configuration section](#configuration) explains how to declare table sources for reading data, how to declare table sinks for writing data, and how to configure other table program properties.
+
+{{< top >}}
+
+
+
+Configuration
+-------------
+
+### SQL Client startup options
+
+The SQL Client can be started with the following optional CLI commands. They are discussed in detail in the subsequent paragraphs.
+
+```text
+./bin/sql-client.sh --help
+
+Mode "embedded" (default) submits Flink jobs from the local machine.
+
+  Syntax: [embedded] [OPTIONS]
+  "embedded" mode options:
+     -d,--defaults                       Deprecated feature: the environment
+                                         properties with which every new
+                                         session is initialized. Properties
+                                         might be overwritten by session
+                                         properties.
+     -e,--environment                    Deprecated feature: the environment
+                                         properties to be imported into the
+                                         session. It might overwrite default
+                                         environment properties.
+     -f,--file
+
+
+
+
+{{ define "menu" }}
+  {{ partial "docs/menu" . }}
+{{ end }}
+
+{{ define "header" }}
+  {{ partial "docs/header" . }}
+
+  {{ if default true (default .Site.Params.BookToC .Params.BookToC) }}
+
+  {{ end }}
+{{ end }}
+
+{{ define "footer" }}
+  {{ partial "docs/footer" . }}
+{{ end }}
+
+{{ define "comments" }}
+  {{ if and .Content (default true (default .Site.Params.BookComments .Params.BookComments)) }}
+
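+    {{/* Comments render only for pages with content; a page-level .Params.BookComments
+         overrides the site-level .Site.Params.BookComments, defaulting to enabled. */}}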
    + {{- partial "docs/comments" . -}} +
    + {{ end }} +{{ end }} + +{{ define "main" }} +
    + {{- .Content -}} +
    +{{ end }} + +{{ define "toc" }} + {{ partial "docs/toc" . }} +{{ end }} diff --git a/docs/layouts/partials/docs/footer.html b/docs/layouts/partials/docs/footer.html new file mode 100644 index 0000000000000..45aa36fd525eb --- /dev/null +++ b/docs/layouts/partials/docs/footer.html @@ -0,0 +1,28 @@ + + +{{ if .IsPage }} +{{ $folder := "content" }} +{{ if eq "/zh" .Site.LanguagePrefix }} + {{ $folder = "content.zh" }} +{{ end }} +Edit This Page +{{ end }} diff --git a/docs/layouts/partials/docs/inject/content-before.html b/docs/layouts/partials/docs/inject/content-before.html new file mode 100644 index 0000000000000..35b65bdc7922d --- /dev/null +++ b/docs/layouts/partials/docs/inject/content-before.html @@ -0,0 +1,36 @@ + + +{{ if $.Site.Params.ShowOutDatedWarning }} +
    +
    + {{ markdownify "This documentation is for an out-of-date version of Apache Flink. We recommend you use the latest [stable version](https://ci.apache.org/projects/flink/flink-docs-stable/)."}} +
    +
    +{{ end }} +{{ if (not $.Site.Params.IsStable) }} +
    +
    + {{ markdownify "This documentation is for an unreleased version of Apache Flink. We recommend you use the latest [stable version](https://ci.apache.org/projects/flink/flink-docs-stable/)."}} +
    +
    +{{ end }} + diff --git a/docs/layouts/partials/docs/inject/head.html b/docs/layouts/partials/docs/inject/head.html new file mode 100644 index 0000000000000..f456c36bdc29f --- /dev/null +++ b/docs/layouts/partials/docs/inject/head.html @@ -0,0 +1,25 @@ + + + + + + diff --git a/docs/layouts/partials/docs/inject/menu-after.html b/docs/layouts/partials/docs/inject/menu-after.html new file mode 100644 index 0000000000000..df19fb428e1d3 --- /dev/null +++ b/docs/layouts/partials/docs/inject/menu-after.html @@ -0,0 +1,76 @@ + + +
    + +{{ range $links := .Site.Params.MenuLinks }} + {{ index $links 0 }} +
    +{{ end }} + +
  • +
    + + + +
  • + + +{{ $translations := dict }} +{{ range .Site.Home.AllTranslations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} +{{ range .Translations }} + {{ $translations = merge $translations (dict .Language.Lang .) }} +{{ end }} + + +{{ range .Site.Languages }}{{ with index $translations .Lang }} +{{ if (ne $.Site.Language .Language) }} + +    + {{ .Language.LanguageName }} + +{{ end }}{{ end }}{{ end }} diff --git a/docs/layouts/partials/docs/inject/menu-before.html b/docs/layouts/partials/docs/inject/menu-before.html new file mode 100644 index 0000000000000..5b688b7084a3b --- /dev/null +++ b/docs/layouts/partials/docs/inject/menu-before.html @@ -0,0 +1,25 @@ + + + +

    v{{ $.Site.Params.Version }}

    \ No newline at end of file diff --git a/docs/layouts/partials/docs/interpolate.html b/docs/layouts/partials/docs/interpolate.html new file mode 100644 index 0000000000000..6b9702aaa4f4f --- /dev/null +++ b/docs/layouts/partials/docs/interpolate.html @@ -0,0 +1,24 @@ + + +{{ $str := replace . "$scala_version" site.Params.ScalaVersion }} +{{ $str = replace $str "$version" site.Params.Version }} +{{ return $str }} \ No newline at end of file diff --git a/docs/layouts/partials/docs/menu-filetree.html b/docs/layouts/partials/docs/menu-filetree.html new file mode 100644 index 0000000000000..b583347dd0364 --- /dev/null +++ b/docs/layouts/partials/docs/menu-filetree.html @@ -0,0 +1,68 @@ + + +{{ $bookSection := default "docs" .Site.Params.BookSection }} +{{ if eq $bookSection "*" }} + {{ $bookSection = "/" }}{{/* Backward compatibility */}} +{{ end }} + +{{ with .Site.GetPage $bookSection }} + {{ template "book-section-children" (dict "Section" . "CurrentPage" $) }} +{{ end }} + +{{ define "book-section-children" }}{{/* (dict "Section" .Section "CurrentPage" .CurrentPage) */}} +
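+  {{/* Renders one level of the documentation tree: pages not marked bookhidden become
+       entries, and sub-sections recurse into this same template. */}}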
      + {{ range (where .Section.Pages "Params.bookhidden" "ne" true) }} + {{ if .IsSection }} +
    • + {{ template "book-page-link" (dict "Page" . "CurrentPage" $.CurrentPage) }} + {{ template "book-section-children" (dict "Section" . "CurrentPage" $.CurrentPage) }} +
    • + {{ else if and .IsPage .Content }} +
    • + {{ template "book-page-link" (dict "Page" . "CurrentPage" $.CurrentPage) }} +
    • + {{ end }} + {{ end }} +
    +{{ end }} + +{{ define "book-page-link" }}{{/* (dict "Page" .Page "CurrentPage" .CurrentPage) */}} + {{ $current := eq .CurrentPage .Page }} + {{ $ancestor := .Page.IsAncestor .CurrentPage }} + + {{ if .Page.Params.sectionBreak }} +
    + {{ end }} + {{ if .Page.Params.bookCollapseSection }} + + + {{ else if .Page.Content }} + + {{- partial "docs/title" .Page -}} + + {{ else }} + {{- partial "docs/title" .Page -}} + {{ end }} +{{ end }} diff --git a/docs/layouts/partials/docs/menu.html b/docs/layouts/partials/docs/menu.html new file mode 100644 index 0000000000000..77d0301599e2b --- /dev/null +++ b/docs/layouts/partials/docs/menu.html @@ -0,0 +1,42 @@ + + + + + +{{ $script := resources.Get "menu-reset.js" | resources.Minify }} +{{ with $script.Content }} + +{{ end }} diff --git a/docs/layouts/partials/docs/simple-title.html b/docs/layouts/partials/docs/simple-title.html new file mode 100644 index 0000000000000..b324d481841c8 --- /dev/null +++ b/docs/layouts/partials/docs/simple-title.html @@ -0,0 +1,33 @@ + + +{{ $title := "" }} + +{{ if .Title }} + {{ $title = .Title }} +{{ else if and .IsSection .File }} + {{ $title = path.Base .File.Dir | humanize | title }} +{{ else if and .IsPage .File }} + {{ $title = .File.BaseFileName | humanize | title }} +{{ end }} + +{{ return $title }} \ No newline at end of file diff --git a/docs/layouts/partials/docs/title.html b/docs/layouts/partials/docs/title.html new file mode 100644 index 0000000000000..f9c96daf70fc7 --- /dev/null +++ b/docs/layouts/partials/docs/title.html @@ -0,0 +1,42 @@ + + +{{ $title := "" }} + +{{ if .Title }} + {{ $title = .Title }} +{{ else if and .IsSection .File }} + {{ $title = path.Base .File.Dir | humanize | title }} +{{ else if and .IsPage .File }} + {{ $title = .File.BaseFileName | humanize | title }} +{{ end }} + +{{ if .Params.icon }} + {{ $title = printf "%s  %s" .Params.icon $title }} +{{ end }} + +{{ if .Params.bold }} + {{ $title = printf `
    %s
    ` $title }} +{{ end }} + +{{ return ($title | safeHTML) }} + diff --git a/docs/layouts/partials/docs/toc.html b/docs/layouts/partials/docs/toc.html new file mode 100644 index 0000000000000..863eec29bc8cc --- /dev/null +++ b/docs/layouts/partials/docs/toc.html @@ -0,0 +1,23 @@ + +{{/* + Generates the pages table of contents. Unfortunately, hugo does not give us a lot of flexibility + around how the TOC is generated so we have to fall back to a regex to add the header. +*/}} +{{ .TableOfContents | replaceRE "