diff --git a/.gitignore b/.gitignore
index e4673a2851d13..a3578a69d20fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,9 +23,12 @@ flink-runtime-web/web-dashboard/node/
flink-runtime-web/web-dashboard/node_modules/
flink-runtime-web/web-dashboard/web/
flink-python/dist/
+flink-python/apache-flink-libraries/dist/
flink-python/build/
+flink-python/apache-flink-libraries/build
flink-python/pyflink.egg-info/
flink-python/apache_flink.egg-info/
+flink-python/apache-flink-libraries/apache_flink_libraries.egg-info/
flink-python/docs/_build
flink-python/.tox/
flink-python/dev/download
@@ -34,12 +37,12 @@ flink-python/dev/log/
flink-python/dev/.stage.txt
flink-python/.eggs/
flink-python/apache-flink-*.dev0/
+flink-python/apache-flink-libraries/apache_flink_libraries-*.dev0/
flink-python/**/*.c
flink-python/**/*.so
atlassian-ide-plugin.xml
out/
/docs/api
-/docs/content
/docs/.bundle
/docs/.rubydeps
/docs/ruby2/.bundle
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000..6cb7e5ce65305
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "docs/themes/book"]
+ path = docs/themes/book
+ url = https://github.com/alex-shpak/hugo-book
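The theme registered here is a plain git submodule, so it is absent after a fresh clone until it is initialized. A minimal sketch of the local workflow, using only the commands the updated docs/README.md later in this patch prescribes (the `hugo -b "" serve` invocation is taken verbatim from that README; everything else is an assumption about where you run it):

```sh
# One-time after cloning: fetch the hugo-book theme registered in .gitmodules.
git submodule update --init --recursive   # populates docs/themes/book

# Preview the documentation; requires the *extended* Hugo build (Sass/SCSS support).
cd docs
hugo -b "" serve                          # served at http://localhost:1313/
```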
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 6d74a8567fcc5..cbe3b11650938 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -38,7 +38,7 @@ resources:
containers:
# Container with Maven 3.2.5, SSL to have the same environment everywhere.
- container: flink-build-container
- image: rmetzger/flink-ci:ubuntu-amd64-f009d96
+ image: rmetzger/flink-ci:ubuntu-amd64-7ac4e28
# On AZP provided machines, set this flag to allow writing coredumps in docker
options: --privileged
@@ -49,6 +49,7 @@ resources:
variables:
MAVEN_CACHE_FOLDER: $(Pipeline.Workspace)/.m2/repository
E2E_CACHE_FOLDER: $(Pipeline.Workspace)/e2e_cache
+ E2E_TARBALL_CACHE: $(Pipeline.Workspace)/e2e_artifact_cache
MAVEN_OPTS: '-Dmaven.repo.local=$(MAVEN_CACHE_FOLDER)'
CACHE_KEY: maven | $(Agent.OS) | **/pom.xml, !**/target/**
CACHE_FALLBACK_KEY: maven | $(Agent.OS)
@@ -56,6 +57,8 @@ variables:
SECRET_S3_BUCKET: $[variables.IT_CASE_S3_BUCKET]
SECRET_S3_ACCESS_KEY: $[variables.IT_CASE_S3_ACCESS_KEY]
SECRET_S3_SECRET_KEY: $[variables.IT_CASE_S3_SECRET_KEY]
+ SECRET_GLUE_SCHEMA_ACCESS_KEY: $[variables.IT_CASE_GLUE_SCHEMA_ACCESS_KEY]
+ SECRET_GLUE_SCHEMA_SECRET_KEY: $[variables.IT_CASE_GLUE_SCHEMA_SECRET_KEY]
stages:
@@ -79,16 +82,7 @@ stages:
pool:
vmImage: 'ubuntu-16.04'
steps:
- - task: UseRubyVersion@0
- inputs:
- versionSpec: '= 2.4'
- addToPath: true
- script: ./tools/ci/docs.sh
- # upload spider.log for debugging
- - task: PublishPipelineArtifact@1
- inputs:
- targetPath: ./docs/spider.log
- artifact: spider.log
# CI / Special stage for release, e.g. building PyFlink wheel packages, etc:
- stage: ci_release
displayName: "CI build (release)"
diff --git a/docs/.gitignore b/docs/.gitignore
index 3d6212de50701..270dd7f731332 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -2,7 +2,7 @@
.jekyll-metadata
.jekyll-cache/
.rubydeps/
-content/
-content_*/
ruby2/.bundle/
ruby2/.rubydeps/
+public/
+resources/
diff --git a/docs/404.md b/docs/404.md
deleted file mode 100644
index 5a25bad7426f0..0000000000000
--- a/docs/404.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-title: "404"
-permalink: /404.html
-layout: 404_base
----
-
-
-The page you are looking for has been moved. This could be because of a recent reorganization of the
-documentation. Redirecting to [Documentation Home Page](/) in 5 seconds.
diff --git a/docs/404.zh.md b/docs/404.zh.md
deleted file mode 100644
index 5a25bad7426f0..0000000000000
--- a/docs/404.zh.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-title: "404"
-permalink: /404.html
-layout: 404_base
----
-
-
-The page you are looking for has been moved. This could be because of a recent reorganization of the
-documentation. Redirecting to [Documentation Home Page](/) in 5 seconds.
diff --git a/docs/Gemfile b/docs/Gemfile
deleted file mode 100644
index 7dfc2ae0e3a4c..0000000000000
--- a/docs/Gemfile
+++ /dev/null
@@ -1,35 +0,0 @@
-################################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-source 'https://rubygems.org'
-
-ruby '>= 2.4.0'
-
-gem 'jekyll', '4.0.1'
-gem 'addressable', '2.7.0'
-gem 'octokit', '4.14.0'
-gem 'therubyracer', '0.12.3'
-gem 'json', '2.2.0'
-gem 'jekyll-multiple-languages', '2.0.3'
-gem 'jekyll-paginate', '1.1.0'
-gem 'liquid-c', '4.0.0' # speed-up site generation
-gem 'sassc', '2.2.1' # speed-up site generation
-
-# group :jekyll_plugins do
-# gem 'hawkins'
-# end
diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock
deleted file mode 100644
index 7d55d5cc6d63c..0000000000000
--- a/docs/Gemfile.lock
+++ /dev/null
@@ -1,96 +0,0 @@
-GEM
- remote: https://rubygems.org/
- specs:
- addressable (2.7.0)
- public_suffix (>= 2.0.2, < 5.0)
- colorator (1.1.0)
- concurrent-ruby (1.1.6)
- em-websocket (0.5.1)
- eventmachine (>= 0.12.9)
- http_parser.rb (~> 0.6.0)
- eventmachine (1.2.7)
- faraday (0.17.0)
- multipart-post (>= 1.2, < 3)
- ffi (1.11.2)
- forwardable-extended (2.6.0)
- http_parser.rb (0.6.0)
- i18n (1.8.3)
- concurrent-ruby (~> 1.0)
- jekyll (4.0.1)
- addressable (~> 2.4)
- colorator (~> 1.0)
- em-websocket (~> 0.5)
- i18n (>= 0.9.5, < 2)
- jekyll-sass-converter (~> 2.0)
- jekyll-watch (~> 2.0)
- kramdown (~> 2.1)
- kramdown-parser-gfm (~> 1.0)
- liquid (~> 4.0)
- mercenary (~> 0.3.3)
- pathutil (~> 0.9)
- rouge (~> 3.0)
- safe_yaml (~> 1.0)
- terminal-table (~> 1.8)
- jekyll-multiple-languages (2.0.3)
- jekyll-paginate (1.1.0)
- jekyll-sass-converter (2.1.0)
- sassc (> 2.0.1, < 3.0)
- jekyll-watch (2.2.1)
- listen (~> 3.0)
- json (2.2.0)
- kramdown (2.2.1)
- rexml
- kramdown-parser-gfm (1.1.0)
- kramdown (~> 2.0)
- libv8 (3.16.14.19)
- liquid (4.0.3)
- liquid-c (4.0.0)
- liquid (>= 3.0.0)
- listen (3.2.1)
- rb-fsevent (~> 0.10, >= 0.10.3)
- rb-inotify (~> 0.9, >= 0.9.10)
- mercenary (0.3.6)
- multipart-post (2.1.1)
- octokit (4.14.0)
- sawyer (~> 0.8.0, >= 0.5.3)
- pathutil (0.16.2)
- forwardable-extended (~> 2.6)
- public_suffix (4.0.1)
- rb-fsevent (0.10.4)
- rb-inotify (0.10.1)
- ffi (~> 1.0)
- ref (2.0.0)
- rexml (3.2.4)
- rouge (3.20.0)
- safe_yaml (1.0.5)
- sassc (2.2.1)
- ffi (~> 1.9)
- sawyer (0.8.2)
- addressable (>= 2.3.5)
- faraday (> 0.8, < 2.0)
- terminal-table (1.8.0)
- unicode-display_width (~> 1.1, >= 1.1.1)
- therubyracer (0.12.3)
- libv8 (~> 3.16.14.15)
- ref
- unicode-display_width (1.7.0)
-
-PLATFORMS
- ruby
-
-DEPENDENCIES
- addressable (= 2.7.0)
- jekyll (= 4.0.1)
- jekyll-multiple-languages (= 2.0.3)
- jekyll-paginate (= 1.1.0)
- json (= 2.2.0)
- liquid-c (= 4.0.0)
- octokit (= 4.14.0)
- sassc (= 2.2.1)
- therubyracer (= 0.12.3)
-
-RUBY VERSION
- ruby 2.6.3p62
-
-BUNDLED WITH
- 1.17.2
diff --git a/docs/README.md b/docs/README.md
index 509ba6db825ef..7aab7b52b8452 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -6,55 +6,23 @@ https://flink.apache.org/ is also generated from the files found here.
# Requirements
-The dependencies are declared in the Gemfile in this directory. We use Markdown
-to write and Jekyll to translate the documentation to static HTML. All required
-dependencies are installed locally when you build the documentation through the
-`build_docs.sh` script. If you want to install the software manually, use Ruby's
-Bundler Gem to install all dependencies:
+### Build the site locally
- gem install bundler -v 1.16.1
- bundle install
+Make sure you have installed [Hugo](https://gohugo.io/getting-started/installing/) on your
+system. To build the Flink docs, you need the *extended version* of Hugo with Sass/SCSS support.
-Note that in Ubuntu based systems, it may be necessary to install the following
-packages: `rubygems ruby-dev libssl-dev build-essential`.
+From this directory:
-# Using Dockerized Jekyll
+ * Fetch the theme submodule
+ ```sh
+ git submodule update --init --recursive
+ ```
+ * Start local server
+ ```sh
+ hugo -b "" serve
+ ```
-We dockerized the jekyll environment above. If you have [docker](https://docs.docker.com/),
-you can run the following command to start the container.
-
-```
-cd flink/docs/docker
-./run.sh
-```
-
-It takes a few moments to build the image for the first time but will be a second from the second time.
-The run.sh command brings you in a bash session where you run the `./build_docs.sh` script mentioned above.
-
-
-# Build
-
-The `docs/build_docs.sh` script installs dependencies locally, calls Jekyll, and
-generates the documentation in `docs/content`. You can then point your browser
-to `docs/content/index.html` and start reading.
-
-If you call the script with the preview flag `build_docs.sh -p`, Jekyll will
-start a web server at `localhost:4000` and watch the docs directory for
-updates. Use this mode to preview changes locally.
-
-You can call the script with the incremental flag `build_docs.sh -i`.
-Jekyll will then serve a live preview at `localhost:4000`,
-and it will be much faster because it will only rebuild the pages corresponding
-to files that are modified. Note that if you are making changes that affect
-the sidebar navigation, you'll have to build the entire site to see
-those changes reflected on every page.
-
-| Flag | Action |
-| -----| -------|
-| -p | Run interactive preview |
-| -i | Incremental builds |
-| -e | Build only English docs |
-| -z | Build only Chinese docs |
+The site can be viewed at http://localhost:1313/
## Generate configuration tables
@@ -64,11 +32,11 @@ Configuration descriptions are auto generated from code. To trigger the generati
mvn -Pgenerate-config-docs install
```
-The resulting html files will be written to `_includes/generated`. Tables are regenerated each time the command is invoked.
+The resulting HTML files will be written to `layouts/shortcodes/generated`. Tables are regenerated each time the command is invoked.
These tables can be directly included into the documentation:
```
-{% include generated/file_name.html %}
+{{< generated/file_name >}}
```
# Contribute
@@ -85,11 +53,13 @@ In addition to Markdown, every page contains a Jekyll front matter, which specif
title: "Title of the Page"
---
-Furthermore, you can access the variables found in `docs/_config.yml` as follows:
-
- {{ site.NAME }}
-
-This will be replaced with the value of the variable called `NAME` when generating the docs.
+ ---
+ title: "Title of the Page" <-- Title rendered in the side nav
+ weight: 1 <-- Weight controls the ordering of pages in the side nav.
+ type: docs <-- required
+ aliases: <-- Aliases set up redirects from removed pages to this one
+ - /alias/to/removed/page.html
+ ---
## Structure
@@ -100,8 +70,8 @@ This will be replaced with the value of the variable called `NAME` when generati
All documents are structured with headings. From these headings, you can automatically generate a page table of contents (see below).
```
-# Level-1 Heading <- Used for the title of the page (don't use this)
-## Level-2 Heading <- Start with this one
+# Level-1 Heading <- Used for the title of the page
+## Level-2 Heading <- Start with this one for content
### Level-3 heading
#### Level-4 heading
##### Level-5 heading
@@ -111,47 +81,149 @@ Please stick to the "logical order" when using the headlines, e.g. start with le
#### Table of Contents
- * This will be replaced by the TOC
- {:toc}
+A table of contents is added automatically to every page, based on heading levels 2 to 4.
+The ToC can be omitted by adding the following to the front matter of the page:
+
+ ---
+ bookToc: false
+ ---
+
+### ShortCodes
+
+Flink uses [shortcodes](https://gohugo.io/content-management/shortcodes/) to add custom functionality
+to its documentation markdown. The following are available for use:
+#### Flink Artifact
-Add this markup (both lines) to the document in order to generate a table of contents for the page. Headings until level 3 headings are included.
+ {{< artifact flink-streaming-java withScalaVersion >}}
-You can exclude a heading from the table of contents:
+This will be replaced by the Maven artifact for flink-streaming-java that users should copy into their pom.xml file. It will render out to:
- # Excluded heading
- {:.no_toc}
+```xml
+<dependency>
+    <groupId>org.apache.flink</groupId>
+    <artifactId>flink-streaming-java_2.11</artifactId>
+    <version><!-- current flink version --></version>
+</dependency>
+```
+
+It includes a number of optional flags:
+
+* withScalaVersion: Appends the Scala version suffix to the artifact id
+* withTestScope: Adds the `test` scope to the dependency. Useful for marking test dependencies.
+* withTestClassifier: Adds the `tests` classifier. Useful when users should be pulling in Flink's test dependencies. This is mostly for the test harnesses and probably not what you want.
#### Back to Top
- {% top %}
+ {{< top >}}
This will be replaced by a back to top link. It is recommended to use these links at least at the end of each level-2 section.
-#### Labels
+#### Info Hints
+
+ {{< hint info >}}
+ Some interesting information
+ {{< /hint >}}
+
+The hint will be rendered in a blue box. This hint is useful when providing
+additional information for the user that does not fit into the flow of the documentation.
+
+#### Info Warning
+
+ {{< hint warning >}}
+ Something to watch out for.
+ {{< /hint >}}
+
+The hint will be rendered in a yellow box. This hint is useful when highlighting
+information users should watch out for to prevent errors.
+
+#### Info Danger
+
+ {{< hint danger >}}
+ Something to avoid
+ {{< /hint >}}
+
+The hint will be rendered in a red box. This hint is useful when highlighting
+information users need to know to avoid data loss or to point out broken
+functionality.
+
+#### Label
+
+ {{< label "My Label" >}}
+
+The label will be rendered in an inline blue box. This is useful for labeling functionality,
+such as whether a SQL feature works only for batch or only for streaming execution.
+
+#### Flink version
+
+ {{< version >}}
+
+Interpolates the current Flink version.
+
+#### Scala Version
+
+ {{< scala_version >}}
+
+Interpolates the default Scala version.
+
+#### Stable
+
+ {{< stable >}}
+ Some content
+ {{< /stable >}}
+
+This shortcode will only render its content if the site is marked as stable.
+
+#### Unstable
- {% info %}
- {% warn %}
+ {{< unstable >}}
+ Some content
+ {{< /unstable >}}
+
+This shortcode will only render its content if the site is marked as unstable.
-These will be replaced by an info or warning label. You can change the text of the label by providing an argument:
+#### Query State Warning
- {% info Recommendation %}
+ {{< query_state_warning >}}
+
+Renders a warning that the current SQL feature may have unbounded state requirements.
-### Documentation
+#### tab
-#### Navigation
+ {{< tabs "sometab" >}}
+ {{< tab "Java" >}}
+ ```java
+ System.out.println("Hello World!");
+ ```
+ {{< /tab >}}
+ {{< tab "Scala" >}}
+ ```scala
+ println("Hello World!");
+ ```
+ {{< /tab >}}
+ {{< /tabs >}}
+
+Prints the content in tabs. IMPORTANT: The label in the outermost "tabs" shortcode must
+be unique for the page.
-The navigation on the left side of the docs is automatically generated when building the docs. You can modify the markup in `_include/sidenav.html`.
+#### Github Repo
-The structure of the navigation is determined by the front matter of all pages. The fields used to determine the structure are:
+ {{< github_repo >}}
+
+Renders a link to the Apache Flink repository.
-- `nav-id` => ID of this page. Other pages can use this ID as their parent ID.
-- `nav-parent_id` => ID of the parent. This page will be listed under the page with id `nav-parent_id`.
+#### Github Link
-Level 0 is made up of all pages, which have nav-parent_id set to `root`. There is no limitation on how many levels you can nest.
+ {{< gh_link file="/some/file.java" name="Some file" >}}
+
+Renders a link to a file in the Apache Flink repo with a given name.
+
+#### JavaDocs Link
+ {{< javadoc file="some/file" name="Some file" >}}
-The `title` of the page is used as the default link text. You can override this via `nav-title`. The relative position per navigational level is determined by `nav-pos`.
+Renders a link to a file in the Apache Flink Java Documentation.
-If you have a page with sub pages, the link target will be used to expand the sub level navigation. If you want to actually add a link to the page as well, you can add the `nav-show_overview: true` field to the front matter. This will then add an `Overview` sub page to the expanded list.
+#### PythonDocs Link
+ {{< pythondoc file="some/file" name="Some file" >}}
-The nesting is also used for the breadcrumbs like `Application Development > Libraries > Machine Learning > Optimization`.
+Renders a link to a file in the Apache Flink Python Documentation.
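To tie the README's generation notes together, a rough sketch of the round trip from generated configuration tables to a rendered page. The `docs/` prefix on the output path is an assumption (the README states the path relative to this directory); the command and shortcode syntax are taken from the README above:

```sh
# Regenerate the configuration tables from the option classes in the code base.
# Output lands in docs/layouts/shortcodes/generated/ and overwrites existing tables.
mvn -Pgenerate-config-docs install

# A documentation page then embeds a table with the Hugo shortcode
#   {{< generated/file_name >}}
# which replaces the former Jekyll include
#   {% include generated/file_name.html %}
```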
diff --git a/docs/_config.yml b/docs/_config.yml
deleted file mode 100644
index 019b0345f6d0d..0000000000000
--- a/docs/_config.yml
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#------------------------------------------------------------------------------
-# VARIABLES
-#------------------------------------------------------------------------------
-# Variables specified in this file can be used in the documentation via:
-# {{ site.CONFIG_KEY }}
-#------------------------------------------------------------------------------
-
-# This are the version referenced in the docs. Please only use these variables
-# to reference a specific Flink version, because this is the only place where
-# we change the version for the complete docs when forking of a release branch
-# etc.
-# The full version string as referenced in Maven (e.g. 1.2.1)
-version: "1.13-SNAPSHOT"
-# For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot
-# release this should be the same as the regular version
-version_title: "1.13-SNAPSHOT"
-# Branch on Github for this version
-github_branch: "master"
-
-# Plain Scala version is needed for e.g. the Gradle quickstart.
-scala_version: "2.11"
-# This suffix is appended to the Scala-dependent Maven artifact names
-scala_version_suffix: "_2.11"
-
-# Some commonly linked pages (this was more important to have as a variable
-# during incubator; by now it should also be fine to hardcode these.)
-website_url: "https://flink.apache.org"
-jira_url: "https://issues.apache.org/jira/browse/FLINK"
-github_url: "https://github.com/apache/flink"
-download_url: "https://flink.apache.org/downloads.html"
-zh_download_url: "https://flink.apache.org/zh/downloads.html"
-
-# please use a protocol relative URL here
-baseurl: //ci.apache.org/projects/flink/flink-docs-master
-stable_baseurl: //ci.apache.org/projects/flink/flink-docs-stable
-
-javadocs_baseurl: //ci.apache.org/projects/flink/flink-docs-master
-pythondocs_baseurl: //ci.apache.org/projects/flink/flink-docs-master
-
-statefundocs_baseurl: //ci.apache.org/projects/flink/flink-statefun-docs-master
-statefundocs_stable_baseurl: //ci.apache.org/projects/flink/flink-statefun-docs-stable
-
-# Flag whether this is a stable version or not. Used for the quickstart page.
-is_stable: false
-
-# Flag to indicate whether an outdated warning should be shown.
-show_outdated_warning: false
-
-previous_docs:
- '1.12': http://ci.apache.org/projects/flink/flink-docs-release-1.12
- '1.11': http://ci.apache.org/projects/flink/flink-docs-release-1.11
- '1.10': http://ci.apache.org/projects/flink/flink-docs-release-1.10
- '1.9': http://ci.apache.org/projects/flink/flink-docs-release-1.9
- '1.8': http://ci.apache.org/projects/flink/flink-docs-release-1.8
- '1.7': http://ci.apache.org/projects/flink/flink-docs-release-1.7
- '1.6': http://ci.apache.org/projects/flink/flink-docs-release-1.6
- '1.5': http://ci.apache.org/projects/flink/flink-docs-release-1.5
- '1.4': http://ci.apache.org/projects/flink/flink-docs-release-1.4
- '1.3': http://ci.apache.org/projects/flink/flink-docs-release-1.3
- '1.2': http://ci.apache.org/projects/flink/flink-docs-release-1.2
- '1.1': http://ci.apache.org/projects/flink/flink-docs-release-1.1
- '1.0': http://ci.apache.org/projects/flink/flink-docs-release-1.0
-
-#------------------------------------------------------------------------------
-# BUILD CONFIG
-#------------------------------------------------------------------------------
-# These variables configure the jekyll build (./build_docs.sh). You don't need
-# to change anything here.
-#------------------------------------------------------------------------------
-
-exclude:
- - "build_docs.sh"
- - "check_links.sh"
- - "spider.log"
-
-# Used in some documents to initialize arrays. Don't delete.
-array: []
-
-defaults:
- -
- scope:
- path: ""
- values:
- layout: plain
- nav-pos: 99999 # Move to end if no pos specified
-
-host: 0.0.0.0
-
-kramdown:
- toc_levels: 1..3 # Include h1-h3 for ToC
-
-# The all languages used
-languages: ['en', 'zh']
-
-plugins: ['jekyll-paginate', 'jekyll-multiple-languages']
diff --git a/docs/_config_dev_en.yml b/docs/_config_dev_en.yml
deleted file mode 100644
index eb11fbf39f319..0000000000000
--- a/docs/_config_dev_en.yml
+++ /dev/null
@@ -1,24 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-exclude:
- - "*.zh.md"
- - "build_docs.sh"
- - "check_links.sh"
- - "content"
- - "content_en"
- - "content_zh"
diff --git a/docs/_config_dev_zh.yml b/docs/_config_dev_zh.yml
deleted file mode 100644
index 8b9ddeb88caaf..0000000000000
--- a/docs/_config_dev_zh.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-exclude:
- - "*.md"
- - "build_docs.sh"
- - "check_links.sh"
- - "content"
- - "content_en"
- - "content_zh"
-
-include:
- - "*.zh.md"
diff --git a/docs/_data/sql-connectors.yml b/docs/_data/sql-connectors.yml
deleted file mode 100644
index 46a94256a6482..0000000000000
--- a/docs/_data/sql-connectors.yml
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License
-
-# INSTRUCTIONS:
-#
-# In order to add a new connector/format add a new entry to this file.
-# You need specify a name that will be used in e.g. the description of the connector/format and
-# a category (either "format" or "connector"). The category determines which table will the entry
-# end up in on the Download page. The "maven" parameter describes the name of the maven module. The
-# three parameters are required.
-#
-# If you specify "built-in=true" the corresponding table on the connector/format will not contain
-# a link, but just a "Built-in" entry. If the built-in is set to true you do not need to provide the
-# sql-url.
-#
-# If a connector comes with different versions for the external system, you can put those under a
-# "versions" property. Each entry in the "versions" section should have a "version", which
-# determines name for the version and "maven" and "sql-url" entries for that particular version.
-# If you use the "versions" property, "maven" and "sql-url" should not be present in the top level
-# section of the connector. (Multiple versions are supported only for the connector for now. If you
-# need multiple versions support for formats, please update downloads.md)
-#
-# NOTE: You can use liquid variables in "sql-url" and "maven" properties.
-
-avro:
- name: Avro
- maven: flink-sql-avro
- category: format
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro/{{site.version}}/flink-sql-avro-{{site.version}}.jar
-
-avro-confluent:
- name: Avro Schema Registry
- maven: flink-avro-confluent-registry
- category: format
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro-confluent-registry/{{site.version}}/flink-sql-avro-confluent-registry-{{site.version}}.jar
-
-orc:
- name: ORC
- maven: flink-orc{{site.scala_version_suffix}}
- category: format
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-orc{{site.scala_version_suffix}}/{{site.version}}/flink-sql-orc{{site.scala_version_suffix}}-{{site.version}}.jar
-
-parquet:
- name: Parquet
- maven: flink-parquet{{site.scala_version_suffix}}
- category: format
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-parquet{{site.scala_version_suffix}}/{{site.version}}/flink-sql-parquet{{site.scala_version_suffix}}-{{site.version}}.jar
-
-debezium-avro-confluent:
- name: Debezium
- maven: flink-avro-confluent-registry
- category: format
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-avro-confluent-registry/{{site.version}}/flink-sql-avro-confluent-registry-{{site.version}}.jar
-
-debezium-json:
- name: Debezium
- maven: flink-json
- category: format
- built-in: true
-
-canal:
- name: Canal
- maven: flink-json
- category: format
- built-in: true
-
-maxwell:
- name: Maxwell
- maven: flink-json
- category: format
- built-in: true
-
-csv:
- name: CSV
- maven: flink-csv
- category: format
- built-in: true
-
-json:
- name: Json
- maven: flink-json
- category: format
- built-in: true
-
-raw:
- name: RAW
- maven:
- category: format
- built-in: true
-
-elastic:
- name: Elasticsearch
- category: connector
- versions:
- - version: 6.x
- maven: flink-connector-elasticsearch6{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch6{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-elasticsearch6{{site.scala_version_suffix}}-{{site.version}}.jar
- - version: 7.x and later versions
- maven: flink-connector-elasticsearch7{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-elasticsearch7{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-elasticsearch7{{site.scala_version_suffix}}-{{site.version}}.jar
-
-hbase:
- name: HBase
- category: connector
- versions:
- - version: 1.4.x
- maven: flink-connector-hbase-1.4{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hbase-1.4{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-hbase-1.4{{site.scala_version_suffix}}-{{site.version}}.jar
- - version: 2.2.x
- maven: flink-connector-hbase-2.2{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hbase-2.2{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-hbase-2.2{{site.scala_version_suffix}}-{{site.version}}.jar
-
-jdbc:
- name: JDBC
- category: connector
- maven: flink-connector-jdbc{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc{{site.scala_version_suffix}}/{{site.version}}/flink-connector-jdbc{{site.scala_version_suffix}}-{{site.version}}.jar
-
-kafka:
- name: Kafka
- category: connector
- versions:
- - version: universal
- maven: flink-connector-kafka{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kafka{{site.scala_version_suffix}}-{{site.version}}.jar
-
-upsert-kafka:
- name: Upsert Kafka
- category: connector
- versions:
- - version: universal
- maven: flink-connector-kafka{{site.scala_version_suffix}}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kafka{{site.scala_version_suffix}}-{{site.version}}.jar
-
-kinesis:
- name: Kinesis
- category: connector
- maven: flink-connector-kinesis{{ site.scala_version_suffix }}
- sql-url: https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kinesis{{site.scala_version_suffix}}/{{site.version}}/flink-sql-connector-kinesis{{site.scala_version_suffix}}-{{site.version}}.jar
-
diff --git a/docs/_includes/generated/all_jobmanager_section.html b/docs/_includes/generated/all_jobmanager_section.html
deleted file mode 100644
index 3cf1e6116c14f..0000000000000
--- a/docs/_includes/generated/all_jobmanager_section.html
+++ /dev/null
@@ -1,66 +0,0 @@
-
-
-
-
Key
-
Default
-
Type
-
Description
-
-
-
-
-
jobmanager.archive.fs.dir
-
(none)
-
String
-
Dictionary for JobManager to store the archives of completed jobs.
-
-
-
jobmanager.execution.attempts-history-size
-
16
-
Integer
-
The maximum number of prior execution attempts kept in history.
-
-
-
jobmanager.execution.failover-strategy
-
"region"
-
String
-
This option specifies how the job computation recovers from task failures. Accepted values are:
'full': Restarts all tasks to recover the job.
'region': Restarts all tasks that could be affected by the task failure. More details can be found here.
-
-
-
jobmanager.retrieve-taskmanager-hostname
-
true
-
Boolean
-
Flag indicating whether JobManager would retrieve canonical host name of TaskManager during registration. If the option is set to "false", TaskManager registration with JobManager could be faster, since no reverse DNS lookup is performed. However, local input split assignment (such as for HDFS files) may be impacted.
-
-
-
jobmanager.rpc.address
-
(none)
-
String
-
The config parameter defining the network address to connect to for communication with the job manager. This value is only interpreted in setups where a single JobManager with static name or address exists (simple standalone setups, or container setups with dynamic service name resolution). It is not used in many high-availability setups, when a leader-election service (like ZooKeeper) is used to elect and discover the JobManager leader from potentially multiple standby JobManagers.
-
-
-
jobmanager.rpc.port
-
6123
-
Integer
-
The config parameter defining the network port to connect to for communication with the job manager. Like jobmanager.rpc.address, this value is only interpreted in setups where a single JobManager with static name/address and port exists (simple standalone setups, or container setups with dynamic service name resolution). This config option is not used in many high-availability setups, when a leader-election service (like ZooKeeper) is used to elect and discover the JobManager leader from potentially multiple standby JobManagers.
-
-
-
jobstore.cache-size
-
52428800
-
Long
-
The job store cache size in bytes which is used to keep completed jobs in memory.
-
-
-
jobstore.expiration-time
-
3600
-
Long
-
The time in seconds after which a completed job expires and is purged from the job store.
-
-
-
jobstore.max-capacity
-
2147483647
-
Integer
-
The max number of completed jobs that can be kept in the job store.
The state backend to be used to store and checkpoint state.
-
-
-
state.checkpoints.dir
-
(none)
-
String
-
The default directory used for storing the data files and meta data of checkpoints in a Flink supported filesystem. The storage path must be accessible from all participating processes/nodes(i.e. all TaskManagers and JobManagers).
-
-
-
state.savepoints.dir
-
(none)
-
String
-
The default directory for savepoints. Used by the state backends that write savepoints to file systems (MemoryStateBackend, FsStateBackend, RocksDBStateBackend).
-
-
-
state.backend.incremental
-
false
-
Boolean
-
Option whether the state backend should create incremental checkpoints, if possible. For an incremental checkpoint, only a diff from the previous checkpoint is stored, rather than the complete checkpoint state. Once enabled, the state size shown in web UI or fetched from rest API only represents the delta checkpoint size instead of full checkpoint size. Some state backends may not support incremental checkpoints and ignore this option.
-
-
-
state.backend.local-recovery
-
false
-
Boolean
-
This option configures local recovery for this state backend. By default, local recovery is deactivated. Local recovery currently only covers keyed state backends. Currently, MemoryStateBackend does not support local recovery and ignore this option.
-
-
-
state.checkpoints.num-retained
-
1
-
Integer
-
The maximum number of completed checkpoints to retain.
-
-
-
taskmanager.state.local.root-dirs
-
(none)
-
String
-
The config parameter defining the root directories for storing file-based state for local recovery. Local recovery currently only covers keyed state backends. Currently, MemoryStateBackend does not support local recovery and ignore this option
Only relevant if `execution.checkpointing.unaligned` is enabled.
If timeout is 0, checkpoints will always start unaligned.
If timeout has a positive value, checkpoints will start aligned. If during checkpointing, checkpoint start delay exceeds this timeout, alignment will timeout and checkpoint barrier will start working as unaligned checkpoint.
Possible values: [DELETE_ON_CANCELLATION, RETAIN_ON_CANCELLATION]
-
Externalized checkpoints write their meta data out to persistent storage and are not automatically cleaned up when the owning job fails or is suspended (terminating with job status `JobStatus#FAILED` or `JobStatus#SUSPENDED`. In this case, you have to manually clean up the checkpoint state, both the meta data and actual program state.
The mode defines how an externalized checkpoint should be cleaned up on job cancellation. If you choose to retain externalized checkpoints on cancellation you have to handle checkpoint clean up manually when you cancel the job as well (terminating with job status `JobStatus#CANCELED`).
The target directory for externalized checkpoints is configured via `state.checkpoints.dir`.
-
-
-
execution.checkpointing.interval
-
(none)
-
Duration
-
Gets the interval in which checkpoints are periodically scheduled.
This setting defines the base interval. Checkpoint triggering may be delayed by the settings `execution.checkpointing.max-concurrent-checkpoints` and `execution.checkpointing.min-pause`
The maximum number of checkpoint attempts that may be in progress at the same time. If this value is n, then no checkpoints will be triggered while n checkpoint attempts are currently in flight. For the next checkpoint to be triggered, one checkpoint attempt would need to finish or expire.
-
-
-
execution.checkpointing.min-pause
-
0 ms
-
Duration
-
The minimal pause between checkpointing attempts. This setting defines how soon thecheckpoint coordinator may trigger another checkpoint after it becomes possible to triggeranother checkpoint with respect to the maximum number of concurrent checkpoints(see `execution.checkpointing.max-concurrent-checkpoints`).
If the maximum number of concurrent checkpoints is set to one, this setting makes effectively sure that a minimum amount of time passes where no checkpoint is in progress at all.
-
-
-
execution.checkpointing.mode
-
EXACTLY_ONCE
-
Enum
Possible values: [EXACTLY_ONCE, AT_LEAST_ONCE]
-
The checkpointing mode (exactly-once vs. at-least-once).
The tolerable checkpoint failure number. If set to 0, that means we do not tolerance any checkpoint failure.
-
-
-
execution.checkpointing.unaligned
-
false
-
Boolean
-
Enables unaligned checkpoints, which greatly reduce checkpointing times under backpressure.
Unaligned checkpoints contain data stored in buffers as part of the checkpoint state, which allows checkpoint barriers to overtake these buffers. Thus, the checkpoint duration becomes independent of the current throughput as checkpoint barriers are effectively not embedded into the stream of data anymore.
Unaligned checkpoints can only be enabled if `execution.checkpointing.mode` is `EXACTLY_ONCE` and if `execution.checkpointing.max-concurrent-checkpoints` is 1
Defines the ACL (open|creator) to be configured on ZK node. The configuration value can be set to “creator” if the ZooKeeper server configuration has the “authProvider” property mapped to use SASLAuthenticationProvider and the cluster is configured to run in secure mode (Kerberos).
The time in ms that the client waits for the leader address, e.g., Dispatcher or WebMonitorEndpoint
-
-
-
rest.client.max-content-length
-
104857600
-
Integer
-
The maximum content length in bytes that the client will handle.
-
-
-
rest.connection-timeout
-
15000
-
Long
-
The maximum time in ms for the client to establish a TCP connection.
-
-
-
rest.idleness-timeout
-
300000
-
Long
-
The maximum time in ms for a connection to stay idle before failing.
-
-
-
rest.retry.delay
-
3000
-
Long
-
The time in ms that the client waits between retries (See also `rest.retry.max-attempts`).
-
-
-
rest.retry.max-attempts
-
20
-
Integer
-
The number of retries the client will attempt if a retryable operations fails.
-
-
-
rest.server.max-content-length
-
104857600
-
Integer
-
The maximum content length in bytes that the server will handle.
-
-
-
rest.server.numThreads
-
4
-
Integer
-
The number of threads for the asynchronous processing of requests.
-
-
-
rest.server.thread-priority
-
5
-
Integer
-
Thread priority of the REST server's executor for processing asynchronous requests. Lowering the thread priority will give Flink's main components more CPU time whereas increasing will allocate more time for the REST server's processing.
Enable the slot spread out allocation strategy. This strategy tries to spread out the slots evenly across all available `TaskExecutors`.
-
-
-
slot.idle.timeout
-
50000
-
Long
-
The timeout in milliseconds for a idle slot in Slot Pool.
-
-
-
slot.request.timeout
-
300000
-
Long
-
The timeout in milliseconds for requesting a slot from Slot Pool.
-
-
-
slotmanager.number-of-slots.max
-
2147483647
-
Integer
-
Defines the maximum number of slots that the Flink cluster allocates. This configuration option is meant for limiting the resource consumption for batch workloads. It is not recommended to configure this option for streaming workloads, which may fail if there are not enough slots. Note that this configuration option does not take effect for standalone clusters, where how many slots are allocated is not controlled by Flink.
The timeout (in ms) for flushing the `close_notify` that was triggered by closing a channel. If the `close_notify` was not flushed in the given timeout the channel will be closed forcibly. (-1 = use system default)
-
-
-
security.ssl.internal.handshake-timeout
-
-1
-
Integer
-
The timeout (in ms) during SSL handshake. (-1 = use system default)
-
-
-
security.ssl.internal.session-cache-size
-
-1
-
Integer
-
The size of the cache used for storing SSL session objects. According to here, you should always set this to an appropriate number to not run into a bug with stalling IO threads during garbage collection. (-1 = use system default).
-
-
-
security.ssl.internal.session-timeout
-
-1
-
Integer
-
The timeout (in ms) for the cached SSL session objects. (-1 = use system default)
-
-
-
security.ssl.provider
-
"JDK"
-
String
-
The SSL engine provider to use for the ssl transport:
`JDK`: default Java-based SSL engine
`OPENSSL`: openSSL-based SSL engine using system libraries
`OPENSSL` is based on netty-tcnative and comes in two flavours:
dynamically linked: This will use your system's openSSL libraries (if compatible) and requires `opt/flink-shaded-netty-tcnative-dynamic-*.jar` to be copied to `lib/`
statically linked: Due to potential licensing issues with openSSL (see LEGAL-393), we cannot ship pre-built libraries. However, you can build the required library yourself and put it into `lib/`: `git clone https://github.com/apache/flink-shaded.git && cd flink-shaded && mvn clean package -Pinclude-netty-tcnative-static -pl flink-shaded-netty-tcnative-static`
Option whether the state backend should use an asynchronous snapshot method where possible and configurable. Some state backends may not support asynchronous snapshots, or only support asynchronous snapshots, and ignore this option.
-
-
-
state.backend.fs.memory-threshold
-
20 kb
-
MemorySize
-
The minimum size of state data files. All state chunks smaller than that are stored inline in the root checkpoint metadata file. The max memory threshold for this configuration is 1MB.
-
-
-
state.backend.fs.write-buffer-size
-
4096
-
Integer
-
The default size of the write buffer for the checkpoint streams that write to file systems. The actual write buffer size is determined to be the maximum of the value of this option and option 'state.backend.fs.memory-threshold'.
Starting duration between restarts if `restart-strategy` has been set to `exponential-delay`. It can be specified using notation: "1 min", "20 s"
-
-
-
restart-strategy.exponential-delay.jitter-factor
-
0.1
-
Double
-
Jitter specified as a portion of the backoff if `restart-strategy` has been set to `exponential-delay`. It represents how large random value will be added or subtracted to the backoff. Useful when you want to avoid restarting multiple jobs at the same time.
-
-
-
restart-strategy.exponential-delay.max-backoff
-
5 min
-
Duration
-
The highest possible duration between restarts if `restart-strategy` has been set to `exponential-delay`. It can be specified using notation: "1 min", "20 s"
Threshold when the backoff is reset to its initial value if `restart-strategy` has been set to `exponential-delay`. It specifies how long the job must be running without failure to reset the exponentially increasing backoff to its initial value. It can be specified using notation: "1 min", "20 s"
Delay between two consecutive restart attempts if `restart-strategy` has been set to `failure-rate`. It can be specified using notation: "1 min", "20 s"
The number of times that Flink retries the execution before the job is declared as failed if `restart-strategy` has been set to `fixed-delay`.
-
-
-
restart-strategy.fixed-delay.delay
-
1 s
-
Duration
-
Delay between two consecutive restart attempts if `restart-strategy` has been set to `fixed-delay`. Delaying the retries can be helpful when the program interacts with external systems where for example connections or pending transactions should reach a timeout before re-execution is attempted. It can be specified using notation: "1 min", "20 s"
The address that should be used by clients to connect to the server. Attention: This option is respected only if the high-availability configuration is NONE.
-
-
-
rest.await-leader-timeout
-
30000
-
Long
-
The time in ms that the client waits for the leader address, e.g., Dispatcher or WebMonitorEndpoint
-
-
-
rest.bind-address
-
(none)
-
String
-
The address that the server binds itself.
-
-
-
rest.bind-port
-
"8081"
-
String
-
The port that the server binds itself. Accepts a list of ports (“50100,50101”), ranges (“50100-50200”) or a combination of both. It is recommended to set a range of ports to avoid collisions when multiple Rest servers are running on the same machine.
-
-
-
rest.client.max-content-length
-
104857600
-
Integer
-
The maximum content length in bytes that the client will handle.
-
-
-
rest.connection-timeout
-
15000
-
Long
-
The maximum time in ms for the client to establish a TCP connection.
-
-
-
rest.idleness-timeout
-
300000
-
Long
-
The maximum time in ms for a connection to stay idle before failing.
-
-
-
rest.port
-
8081
-
Integer
-
The port that the client connects to. If rest.bind-port has not been specified, then the REST server will bind to this port. Attention: This option is respected only if the high-availability configuration is NONE.
-
-
-
rest.retry.delay
-
3000
-
Long
-
The time in ms that the client waits between retries (See also `rest.retry.max-attempts`).
-
-
-
rest.retry.max-attempts
-
20
-
Integer
-
The number of retries the client will attempt if a retryable operations fails.
-
-
-
rest.server.max-content-length
-
104857600
-
Integer
-
The maximum content length in bytes that the server will handle.
-
-
-
rest.server.numThreads
-
4
-
Integer
-
The number of threads for the asynchronous processing of requests.
-
-
-
rest.server.thread-priority
-
5
-
Integer
-
Thread priority of the REST server's executor for processing asynchronous requests. Lowering the thread priority will give Flink's main components more CPU time whereas increasing will allocate more time for the REST server's processing.
Returns the status for the delete operation of a cluster data set.
-
-
-
Path parameters
-
-
-
-
-
triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned then the operation was triggered.
Uploads a jar to the cluster. The jar must be sent as multi-part data. Make sure that the "Content-Type" header is set to "application/x-java-archive", as some http libraries do not add the header by default.
-Using 'curl' you can upload a jar via 'curl -X POST -H "Expect:" -F "jarfile=@path/to/flink-job.jar" http://hostname:port/jars/upload'.
Deletes a jar previously uploaded via '/jars/upload'.
-
-
-
Path parameters
-
-
-
-
-
jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
-
-
-
-
-
-
-
-
-
-{}
-
-
-
-
-
-
-
-
-
-
-{}
-
-
-
-
-
-
-
-
-
-
/jars/:jarid/plan
-
-
-
Verb: GET
-
Response code: 200 OK
-
-
-
Returns the dataflow plan of a job contained in a jar previously uploaded via '/jars/upload'. Program arguments can be passed both via the JSON request (recommended) or query parameters.
-
-
-
Path parameters
-
-
-
-
-
jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
-
-
-
-
-
Query parameters
-
-
-
-
-
program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
-
programArg (optional): Comma-separated list of program arguments.
-
entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
-
parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
Returns the dataflow plan of a job contained in a jar previously uploaded via '/jars/upload'. Program arguments can be passed both via the JSON request (recommended) or query parameters.
-
-
-
Path parameters
-
-
-
-
-
jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
-
-
-
-
-
Query parameters
-
-
-
-
-
program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
-
programArg (optional): Comma-separated list of program arguments.
-
entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
-
parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
Submits a job by running a jar previously uploaded via '/jars/upload'. Program arguments can be passed both via the JSON request (recommended) or query parameters.
-
-
-
Path parameters
-
-
-
-
-
jarid - String value that identifies a jar. When uploading the jar a path is returned, where the filename is the ID. This value is equivalent to the `id` field in the list of uploaded jars (/jars).
-
-
-
-
-
Query parameters
-
-
-
-
-
allowNonRestoredState (optional): Boolean value that specifies whether the job submission should be rejected if the savepoint contains state that cannot be mapped back to the job.
-
savepointPath (optional): String value that specifies the path of the savepoint to restore the job from.
-
program-args (optional): Deprecated, please use 'programArg' instead. String value that specifies the arguments for the program or plan
-
programArg (optional): Comma-separated list of program arguments.
-
entry-class (optional): String value that specifies the fully qualified name of the entry point class. Overrides the class defined in the jar file manifest.
-
parallelism (optional): Positive integer value that specifies the desired parallelism for the job.
Submits a job. This call is primarily intended to be used by the Flink client. This call expects a multipart/form-data request that consists of file uploads for the serialized JobGraph, jars and distributed cache artifacts and an attribute named "request" for the JSON payload.
jobid - 32-character hexadecimal string value that identifies a job.
-
-
-
-
-
-
-
-
-
-{}
-
-
-
-
-
-
-
-
-
-
-{
- "type" : "any"
-}
-
-
-
-
-
-
-
-
-
-
/jobs/:jobid/exceptions
-
-
-
Verb: GET
-
Response code: 200 OK
-
-
-
Returns the non-recoverable exceptions that have been observed by the job. The truncated flag defines whether more exceptions occurred, but are not listed, because the response would otherwise get too big.
-
-
-
Path parameters
-
-
-
-
-
jobid - 32-character hexadecimal string value that identifies a job.
-
-
-
-
-
Query parameters
-
-
-
-
-
maxExceptions (optional): Comma-separated list of integer values that specifies the upper limit of exceptions to return.
jobid - 32-character hexadecimal string value that identifies a job.
-
triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned then the operation was triggered.
jobid - 32-character hexadecimal string value that identifies a job.
-
triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned then the operation was triggered.
Stops a job with a savepoint. Optionally, it can also emit a MAX_WATERMARK before taking the savepoint to flush out any state waiting for timers to fire. This async operation would return a 'triggerid' for further query identifier.
-
-
-
Path parameters
-
-
-
-
-
jobid - 32-character hexadecimal string value that identifies a job.
Returns the status of a savepoint disposal operation.
-
-
-
Path parameters
-
-
-
-
-
triggerid - 32-character hexadecimal string that identifies an asynchronous operation trigger ID. The ID was returned then the operation was triggered.
Provides access to aggregated task manager metrics.
-
-
-
Query parameters
-
-
-
-
-
get (optional): Comma-separated list of string values to select specific metrics.
-
agg (optional): Comma-separated list of aggregation modes which should be calculated. Available aggregations are: "min, max, sum, avg".
-
taskmanagers (optional): Comma-separated list of 32-character hexadecimal strings to select specific task managers.
-
-
-
-
-
-
-
-
-
-{}
-
-
-
-
-
-
-
-
-
-
-{
- "type" : "any"
-}
-
-
-
-
-
-
-
-
-
-
/taskmanagers/:taskmanagerid
-
-
-
Verb: GET
-
Response code: 200 OK
-
-
-
Returns details for a task manager. "metrics.memorySegmentsAvailable" and "metrics.memorySegmentsTotal" are deprecated. Please use "metrics.nettyShuffleMemorySegmentsAvailable" and "metrics.nettyShuffleMemorySegmentsTotal" instead.
-
-
-
Path parameters
-
-
-
-
-
taskmanagerid - 32-character hexadecimal string that identifies a task manager.
Defines the restart strategy to use in case of job failures. Accepted values are:
`none`, `off`, `disable`: No restart strategy.
`fixeddelay`, `fixed-delay`: Fixed delay restart strategy. More details can be found here.
`failurerate`, `failure-rate`: Failure rate restart strategy. More details can be found here.
`exponentialdelay`, `exponential-delay`: Exponential delay restart strategy. More details can be found here.
If checkpointing is disabled, the default value is `none`. If checkpointing is enabled, the default value is `fixed-delay` with `Integer.MAX_VALUE` restart attempts and '`1 s`' delay.
A comma-separated list of login contexts to provide the Kerberos credentials to (for example, `Client,KafkaClient` to use the credentials for ZooKeeper authentication and for Kafka authentication)
-
-
-
security.kerberos.login.keytab
-
(none)
-
String
-
Absolute path to a Kerberos keytab file that contains the user credentials.
-
-
-
security.kerberos.login.principal
-
(none)
-
String
-
Kerberos principal name associated with the keytab.
-
-
-
security.kerberos.login.use-ticket-cache
-
true
-
Boolean
-
Indicates whether to read from your Kerberos ticket cache.
Enable or disable the OPTIONS hint used to specify table options dynamically, if disabled, an exception would be thrown if any OPTIONS hint is specified
-
-
-
table.generated-code.max-length
BatchStreaming
-
64000
-
Integer
-
Specifies a threshold where generated code will be split into sub-function calls. Java has a maximum method length of 64 KB. This setting allows for finer granularity if necessary.
-
-
-
table.local-time-zone
BatchStreaming
-
"default"
-
String
-
The local time zone defines current session time zone id. It is used when converting to/from <code>TIMESTAMP WITH LOCAL TIME ZONE</code>. Internally, timestamps with local time zone are always represented in the UTC time zone. However, when converting to data types that don't include a time zone (e.g. TIMESTAMP, TIME, or simply STRING), the session time zone is used during conversion. The input of option is either an abbreviation such as "PST", a full name such as "America/Los_Angeles", or a custom timezone id such as "GMT-8:00".
-
-
-
table.sql-dialect
BatchStreaming
-
"default"
-
String
-
The SQL dialect defines how to parse a SQL query. A different SQL dialect may support different SQL grammar. Currently supported dialects are: default and hive
diff --git a/docs/_includes/sidenav.html b/docs/_includes/sidenav.html
deleted file mode 100644
index ed767f23a0f8c..0000000000000
--- a/docs/_includes/sidenav.html
+++ /dev/null
@@ -1,184 +0,0 @@
-
-
-{%- comment -%}
-==============================================================================
-Extract the active nav IDs.
-==============================================================================
-{%- endcomment -%}
-
-{%- assign active_nav_ids = site.array -%}
-{%- assign parent_id = page.nav-parent_id -%}
-
-{%- for i in (1..10) -%}
- {%- if parent_id -%}
- {%- assign active_nav_ids = active_nav_ids | push: parent_id -%}
- {%- assign current = (site.pages_by_language[page.language] | where: "nav-id" , parent_id | sort: "nav-pos") -%}
- {%- if current.size > 0 -%}
- {%- assign parent_id = current[0].nav-parent_id -%}
- {%- else -%}
- {%- break -%}
- {%- endif -%}
- {%- else -%}
- {%- break -%}
- {%- endif -%}
-{%- endfor -%}
-
-{%- if page.language == "en" -%}
- {%- capture baseurl_i18n -%}{{ site.baseurl }}{%- endcapture -%}
-{%- else if page.language == "zh" -%}
- {%- capture baseurl_i18n -%}{{ site.baseurl }}/{{ page.language }}{%- endcapture -%}
-{%- endif -%}
-
-{%- comment -%}
-==============================================================================
-Build the nested list from nav-id and nav-parent_id relations.
-==============================================================================
-This builds a nested list from all pages. The fields used to determine the
-structure are:
-
-- 'nav-id' => ID of this page. Other pages can use this ID as their
- parent ID.
-- 'nav-parent_id' => ID of the parent. This page will be listed under
- the page with id 'nav-parent_id'.
-
-Level 0 is made up of all pages, which have nav-parent_id set to 'root'.
-
-The 'title' of the page is used as the default link text. You can
-override this via 'nav-title'. The relative position per navigational
-level is determined by 'nav-pos'.
-{%- endcomment -%}
-
-{%- assign elementsPosStack = site.array -%}
-{%- assign posStack = site.array -%}
-
-{%- assign elements = site.array -%}
-{%- assign all_pages_by_nav_parent = (site.pages_by_language[page.language] | where_exp: "item", "item.nav-parent_id != nil" | group_by: "nav-parent_id") -%}
-{%- assign children = (all_pages_by_nav_parent | where: "name" , "root") -%}
-{%- assign children = (children[0].items | sort: "nav-pos") -%}
-{%- if children.size > 0 -%}
- {%- assign elements = elements | push: children -%}
-{%- endif -%}
-
-{%- assign elementsPos = 0 -%}
-{%- assign pos = 0 -%}
-
-
-
v{{ site.version_title }}
-
-
-{%- for i in (1..10000) -%}
- {%- if pos >= elements[elementsPos].size -%}
- {%- if elementsPos == 0 -%}
- {%- break -%}
- {%- else -%}
- {%- assign elementsPos = elementsPosStack | last -%}
- {%- assign pos = posStack | last %}
-
- {%- assign elementsPosStack = elementsPosStack | pop -%}
- {%- assign posStack = posStack | pop -%}
- {%- endif -%}
- {%- else -%}
- {%- assign this = elements[elementsPos][pos] -%}
-
- {%- if this.url == page.url -%}
- {%- assign active = true -%}
- {%- elsif this.nav-id and active_nav_ids contains this.nav-id -%}
- {%- assign active = true -%}
- {%- else -%}
- {%- assign active = false -%}
- {%- endif -%}
-
- {%- capture title -%}{%- if this.nav-title -%}{{ this.nav-title }}{%- else -%}{{ this.title }}{%- endif -%}{%- endcapture -%}
- {%- capture target -%}"{{ site.baseurl }}{{ this.url }}"{%- if active %} class="active"{%- endif -%}{%- endcapture -%}
- {%- capture overview_target -%}"{{ site.baseurl }}{{ this.url }}"{%- if this.url == page.url -%} class="active"{%- endif -%}{%- endcapture -%}
-
- {% if this.section-break %}{% endif -%}
-
- {%- assign pos = pos | plus: 1 -%}
- {%- if this.nav-id -%}
- {%- assign children = (all_pages_by_nav_parent | where: "name" , this.nav-id) -%}
- {%- if children.size > 0 -%}
- {%- assign children = (children[0].items | sort: "nav-pos") -%}
- {%- capture collapse_target -%}"#collapse-{{ i }}" data-toggle="collapse"{%- if active -%} class="active"{%- endif -%}{%- endcapture -%}
- {%- capture expand -%}{%- unless active -%} {%- endunless -%}{%- endcapture %}
-
In order to use the {{ include.connector.name }} {{ include.connector.category }}, the following
-dependencies are required both for projects using a build automation tool (such as Maven or SBT)
-and for the SQL Client with SQL JAR bundles.
-
-
-{% comment %}
- The 'liquify' filter makes it possible to include liquid variables such as e.g. site.version.
-{% endcomment %}
-
-{% if include.connector.versions == nil %}
-
-
-{% comment %}
-The 'liquify' filter makes it possible to include liquid variables such as e.g. site.version.
-{% endcomment %}
-
-{% if include.connector.versions == nil %}
-
- {% if page.language == "en" %}
- This documentation is for an out-of-date version of Apache Flink. We recommend you use the latest stable version.
- {% else if page.language == "zh" %}
- 本文档是 Apache Flink 的旧版本。建议访问 最新的稳定版本。
- {% endif %}
-
- {% endif %}
-
-
-
- {% comment %}
- This is the base for all content. The content from the layouts found in
- the _layouts directory goes here.
- {% endcomment %}
-
-
- {% include sidenav.html %}
-
-
- {%- if page.mathjax -%}
- {%- include latex_commands.html -%}
- {%- endif %}
-
- {{ content }}
-
{{ page.title }}{% if page.is_beta %} Beta{% endif %}
-{% if site.show_outdated_warning %}
-
- {%- if page.language == "en" %}
- This documentation is for an out-of-date version of Apache Flink. We recommend you use the latest stable version.
- {%- else if page.language == "zh" %}
- 本文档是 Apache Flink 的旧版本。建议访问 最新的稳定版本。
- {%- endif %}
-
-
- The page {{ page.title }} has been moved. Redirecting to {% link {{ page.redirect }}.md %} in 1 second.
-
-{% else if page.language == "zh" %}
-
-
-
-
'{{ page.title }}' 页面已被移动
-
- {{ page.title }} 页面已经被移动了。将在 1 秒后重定向到 {% link {{ page.redirect }}.zh.md %} 。
-
-{% endif %}
diff --git a/docs/_plugins/build_time.rb b/docs/_plugins/build_time.rb
deleted file mode 100644
index 61aa5e83bf682..0000000000000
--- a/docs/_plugins/build_time.rb
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-module Jekyll
- class BuildTimeTag < Liquid::Tag
-
- def initialize(tag_name, input, tokens)
- super
- end
-
- def render(context)
- Time.now.strftime("%D, %r %Z")
- end
- end
-end
-
-Liquid::Template.register_tag('build_time', Jekyll::BuildTimeTag)
\ No newline at end of file
diff --git a/docs/_plugins/gh_link.rb b/docs/_plugins/gh_link.rb
deleted file mode 100644
index bdaa2d44ac3b2..0000000000000
--- a/docs/_plugins/gh_link.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# ---------------------------------------------------------
-# Expands a github link shortcut into a proper markdown link
-# ---------------------------------------------------------
-
-module Jekyll
- class GitHubLinkTag < Liquid::Tag
-
- def initialize(tag_name, input, tokens)
- super
- @input = input
- end
-
- def render(context)
- input = @input.sub(/".*"/, "").split
- name = @input.match(/".*"/).to_s.gsub(/"/, "")#@input.split.drop(2).join(" ")
- config = context.registers[:site].config
-
- path = input[0]
- file = path.split('/').last
-
- page_gh_tag = context["page"]["gh_link_tag"]
- # tag precedence:
- # 1. input[1],
- # 2. 'gh_link_tag' of page frontmatter
- # 3. "master" (default)
- gh_tag = input[1].nil? ? (page_gh_tag.nil? ? "master" : page_gh_tag) : input[1]
- name = name.to_s == '' ? file : name
- #refname = input[2].nil? ? file : input[2]
-
- "[#{name}](#{config["github_url"]}/blob/#{gh_tag}/#{path})"
- end
- end
-end
-
-Liquid::Template.register_tag('gh_link', Jekyll::GitHubLinkTag)
diff --git a/docs/_plugins/include_without_header.rb b/docs/_plugins/include_without_header.rb
deleted file mode 100644
index 8a7792e3f5557..0000000000000
--- a/docs/_plugins/include_without_header.rb
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-module Jekyll
- module Tags
-
- class IncludeWithoutHeaderTag < Liquid::Tag
-
- def initialize(tag_name, text, tokens)
- super
- @file = text.strip
- end
-
- def render(context)
- source = File.expand_path(context.registers[:site].config['source'])
- path = File.join(source, @file)
- content = File.read(path, :encoding => 'UTF-8')
- content = content.split(//, 2)[1]
- partial = Liquid::Template.parse(content)
- partial.render!(context)
- end
- end
- end
-end
-
-Liquid::Template.register_tag("include_without_header", Jekyll::Tags::IncludeWithoutHeaderTag)
diff --git a/docs/_plugins/info.rb b/docs/_plugins/info.rb
deleted file mode 100644
index ef3c210b92eaf..0000000000000
--- a/docs/_plugins/info.rb
+++ /dev/null
@@ -1,37 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-module Jekyll
- class InfoTag < Liquid::Tag
-
- def initialize(tag_name, text, tokens)
- super
- @text = text
- end
-
- def render(context)
- if @text.to_s == ''
- @text = "Info"
- end
-
- @text = @text.strip! || @text if !@text.nil?
- "#{@text}"
- end
- end
-end
-
-Liquid::Template.register_tag('info', Jekyll::InfoTag)
diff --git a/docs/_plugins/liquify.rb b/docs/_plugins/liquify.rb
deleted file mode 100644
index 57528f6873e05..0000000000000
--- a/docs/_plugins/liquify.rb
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# *A Jekyll filter that can parse Liquid in Liquid variables
-#
-# Usage:
-# e.g. Welcome to {{ page.title | liquify }}!
-
-module Jekyll
- module LiquifyFilter
- def liquify(input)
- Liquid::Template.parse(input).render(@context)
- end
- end
-end
-
-Liquid::Template.register_filter(Jekyll::LiquifyFilter)
diff --git a/docs/_plugins/panel.rb b/docs/_plugins/panel.rb
deleted file mode 100644
index 1dfef6346ac0c..0000000000000
--- a/docs/_plugins/panel.rb
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-module Jekyll
- class PanelTag < Liquid::Tag
-
- def initialize(tag_name, text, tokens)
- super
- @text = text
- end
-
- def render(context)
- @text = @text.strip! || @text if !@text.nil?
- "#{@text}"
- end
- end
-end
-
-Liquid::Template.register_tag('panel', Jekyll::PanelTag)
diff --git a/docs/_plugins/removeDuplicateLicenseHeaders.rb b/docs/_plugins/removeDuplicateLicenseHeaders.rb
deleted file mode 100644
index 2ac653fb40bae..0000000000000
--- a/docs/_plugins/removeDuplicateLicenseHeaders.rb
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# ---------------------------------------------------------
-# Ensures that the documentation contains the Apache License
-# headers once, not repeatedly for each include.
-# ---------------------------------------------------------
-
-module Jekyll
-
- module LicenseRemover
-
- AL2 = "\n"
-
- def writeFile(dest, content)
- path = self.destination(dest)
- FileUtils.mkdir_p(File.dirname(path))
- File.open(path, 'w') do |f|
- # remove all Apache Licenses
- withoutLicense = content.gsub(//,'')
- # put single Apache License on top
- singleLicense = AL2+withoutLicense
- # write file out
- f.write(singleLicense)
- end
- end
-
- end
-
- class Post
- include LicenseRemover
- def write(dest)
- self.writeFile(dest, self.output)
- end
- end
-
- class Page
- include LicenseRemover
- def write(dest)
- self.writeFile(dest, self.output)
- end
- end
-
-end
diff --git a/docs/_plugins/top.rb b/docs/_plugins/top.rb
deleted file mode 100644
index b79781c8c95df..0000000000000
--- a/docs/_plugins/top.rb
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-module Jekyll
- class TopTag < Liquid::Tag
-
- def initialize(tag_name, text, tokens)
- super
- end
-
- def render(context)
- " Back to top"
- end
- end
-end
-
-Liquid::Template.register_tag('top', Jekyll::TopTag)
diff --git a/docs/_plugins/warn.rb b/docs/_plugins/warn.rb
deleted file mode 100644
index c8bd3af19ce8f..0000000000000
--- a/docs/_plugins/warn.rb
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# ---------------------------------------------------------
-# Expands a github link shortcut into a proper markdown link
-# ---------------------------------------------------------
-
-module Jekyll
- class WarnTag < Liquid::Tag
-
- def initialize(tag_name, text, tokens)
- super
- @text = text
- end
-
- def render(context)
- if @text.to_s == ''
- @text = "Warning"
- end
-
- @text = @text.strip! || @text if !@text.nil?
- "#{@text}"
- end
- end
-end
-
-Liquid::Template.register_tag('warn', Jekyll::WarnTag)
diff --git a/docs/annotations.xml b/docs/annotations.xml
deleted file mode 100644
index a857770aa6047..0000000000000
--- a/docs/annotations.xml
+++ /dev/null
@@ -1,66 +0,0 @@
diff --git a/docs/assets/_custom.scss b/docs/assets/_custom.scss
new file mode 100644
index 0000000000000..33ccbb4060a8b
--- /dev/null
+++ b/docs/assets/_custom.scss
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+@import "github";
+
+.link {
+ padding-bottom: 5px;
+}
+
+.appetizer {
+ color: #FBB142;
+}
+
+.maindish {
+ color: #7E4F89;
+}
+
+.dessert {
+ color: #E6526F;
+}
+
+.book-menu nav {
+ background: #f8f8f8;
+}
+
+.book-page {
+ padding: 2rem 2rem;
+}
+
+.book-search input {
+ background: white;
+}
+
+.markdown a {
+ text-decoration: none;
+ color: #05b;
+}
+
+.markdown a:visited {
+ text-decoration: none;
+ color: #05b;
+}
+
+.markdown {
+ line-height: 1.43;
+
+ h1,
+ h2,
+ h3,
+ h4,
+ h5,
+ h6 {
+ font-weight: 500;
+ padding-top: 0;
+ margin-top: 1em;
+ }
+}
+
+body {
+ letter-spacing: normal;
+ -webkit-font-smoothing: auto;
+}
+
+aside nav ul {
+ li {
+ margin: 0.5em 0;
+ }
+}
+
+.book-search {
+ border: 2px solid #ebebeb;
+}
+
+@media screen and (max-width: 768px) {
+ .toc {
+ display: none;
+ }
+}
+
+aside.book-menu nav {
+ a:hover {
+ font-weight: bold;
+ opacity: 1.0;
+ }
+
+ a.active {
+ font-weight: bold;
+ color: var(--body-font-color);
+ }
+}
+
+aside.book-menu > li {
+ padding: 10px 5px 5px 5px;
+}
+
+aside.book-toc {
+ h3 {
+ margin-top: 0;
+ padding-top: 0;
+ font-size: 1.2em;
+ }
+}
+
+html {
+ line-height: 1.43;
+}
+
+h1, h2, h3, h4, h5, h6 {
+ line-height: 1.1;
+}
+
+h1, h2, h3 {
+ margin-top: 20px;
+ margin-bottom: 10px;
+}
+
+h2, h3, h4 {
+ padding-top: 1em;
+}
+
+h1 {
+ font-size: 36px;
+}
+
+h2 {
+ font-size: 30px;
+ border-bottom: 1px solid #e5e5e5;
+}
+
+h3 {
+ font-size: 24px;
+}
+
+h4 {
+ font-size: 18px;
+}
+
+.markdown code {
+ background: white;
+ padding: 0;
+ border-radius: 0;
+}
+
+pre.chroma code {
+ line-height: 1.43;
+}
+
+.book-languages {
+ border: 2px solid black;
+}
+
+.menu-break {
+ opacity: 0.1;
+}
+
+#book-search-results {
+ padding: 2px;
+ background-color: white;
+}
+
+.label {
+ display: inline;
+ padding: .2em .6em .3em;
+ font-size: 75%;
+ font-weight: 700;
+ line-height: 1;
+ color: #fff;
+ text-align: center;
+ white-space: nowrap;
+ vertical-align: baseline;
+ border-radius: .25em;
+ background-color: #337ab7;
+}
+
+.expand-toc {
+ position: fixed;
+ top: 2em;
+ right: 5em;
+ display: none;
+}
+
+.container {
+ max-width: 90rem;
+}
+
+#book-search-input:focus {
+ outline: none;
+}
+
+.rest-api h5 {
+ margin-top: .5em;
+ margin-bottom: .5em;
+ font-size: 1em;
+}
+
+.rest-api tbody {
+ display: table;
+ width: 100%;
+ background: white;
+}
+
+.rest-api td {
+ background: white;
+}
+
+.rest-api .book-expand label {
+ padding: 0rem 0rem;
+ background: white;
+}
+
+.rest-api .book-expand {
+ background: white;
+}
+
+.rest-api .book-expand .book-expand-head {
+ background: white;
+}
+
+.configuration td {
+ background: white;
+}
+
+.markdown table tr:nth-child(2n) {
+ background: white;
+}
\ No newline at end of file
diff --git a/docs/assets/_fonts.scss b/docs/assets/_fonts.scss
new file mode 100644
index 0000000000000..dc57189cf04b6
--- /dev/null
+++ b/docs/assets/_fonts.scss
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+body {
+ font-family: "Helvetica Neue",Helvetica,Arial,sans-serif;
+ font-size: 14px;
+}
+
+code {
+ font-family: "Menlo", "Lucida Console", monospace;
+}
\ No newline at end of file
diff --git a/docs/assets/github.css b/docs/assets/github.css
new file mode 100644
index 0000000000000..25600e34e7103
--- /dev/null
+++ b/docs/assets/github.css
@@ -0,0 +1,87 @@
+/**
+ * Syntax highlighting generated via
+ * hugo gen chromastyles --style=github > chroma.css
+ */
+
+/* Background */ .chroma { background-color: #ffffff }
+/* Other */ .chroma .x { }
+/* Error */ .chroma .err { color: #a61717; background-color: #e3d2d2 }
+/* LineTableTD */ .chroma .lntd { vertical-align: top; padding: 0; margin: 0; border: 0; }
+/* LineTable */ .chroma .lntable { border-spacing: 0; padding: 0; margin: 0; border: 0; width: auto; overflow: auto; display: block; }
+/* LineHighlight */ .chroma .hl { display: block; width: 100%;background-color: #ffffcc }
+/* LineNumbersTable */ .chroma .lnt { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f }
+/* LineNumbers */ .chroma .ln { margin-right: 0.4em; padding: 0 0.4em 0 0.4em;color: #7f7f7f }
+/* Keyword */ .chroma .k { color: #000000; font-weight: bold }
+/* KeywordConstant */ .chroma .kc { color: #000000; font-weight: bold }
+/* KeywordDeclaration */ .chroma .kd { color: #000000; font-weight: bold }
+/* KeywordNamespace */ .chroma .kn { color: #000000; font-weight: bold }
+/* KeywordPseudo */ .chroma .kp { color: #000000; font-weight: bold }
+/* KeywordReserved */ .chroma .kr { color: #000000; font-weight: bold }
+/* KeywordType */ .chroma .kt { color: #445588; font-weight: bold }
+/* Name */ .chroma .n { }
+/* NameAttribute */ .chroma .na { color: #008080 }
+/* NameBuiltin */ .chroma .nb { color: #0086b3 }
+/* NameBuiltinPseudo */ .chroma .bp { color: #999999 }
+/* NameClass */ .chroma .nc { color: #445588; font-weight: bold }
+/* NameConstant */ .chroma .no { color: #008080 }
+/* NameDecorator */ .chroma .nd { color: #3c5d5d; font-weight: bold }
+/* NameEntity */ .chroma .ni { color: #800080 }
+/* NameException */ .chroma .ne { color: #990000; font-weight: bold }
+/* NameFunction */ .chroma .nf { color: #990000; font-weight: bold }
+/* NameFunctionMagic */ .chroma .fm { }
+/* NameLabel */ .chroma .nl { color: #990000; font-weight: bold }
+/* NameNamespace */ .chroma .nn { color: #555555 }
+/* NameOther */ .chroma .nx { }
+/* NameProperty */ .chroma .py { }
+/* NameTag */ .chroma .nt { color: #000080 }
+/* NameVariable */ .chroma .nv { color: #008080 }
+/* NameVariableClass */ .chroma .vc { color: #008080 }
+/* NameVariableGlobal */ .chroma .vg { color: #008080 }
+/* NameVariableInstance */ .chroma .vi { color: #008080 }
+/* NameVariableMagic */ .chroma .vm { }
+/* Literal */ .chroma .l { }
+/* LiteralDate */ .chroma .ld { }
+/* LiteralString */ .chroma .s { color: #dd1144 }
+/* LiteralStringAffix */ .chroma .sa { color: #dd1144 }
+/* LiteralStringBacktick */ .chroma .sb { color: #dd1144 }
+/* LiteralStringChar */ .chroma .sc { color: #dd1144 }
+/* LiteralStringDelimiter */ .chroma .dl { color: #dd1144 }
+/* LiteralStringDoc */ .chroma .sd { color: #dd1144 }
+/* LiteralStringDouble */ .chroma .s2 { color: #dd1144 }
+/* LiteralStringEscape */ .chroma .se { color: #dd1144 }
+/* LiteralStringHeredoc */ .chroma .sh { color: #dd1144 }
+/* LiteralStringInterpol */ .chroma .si { color: #dd1144 }
+/* LiteralStringOther */ .chroma .sx { color: #dd1144 }
+/* LiteralStringRegex */ .chroma .sr { color: #009926 }
+/* LiteralStringSingle */ .chroma .s1 { color: #dd1144 }
+/* LiteralStringSymbol */ .chroma .ss { color: #990073 }
+/* LiteralNumber */ .chroma .m { color: #009999 }
+/* LiteralNumberBin */ .chroma .mb { color: #009999 }
+/* LiteralNumberFloat */ .chroma .mf { color: #009999 }
+/* LiteralNumberHex */ .chroma .mh { color: #009999 }
+/* LiteralNumberInteger */ .chroma .mi { color: #009999 }
+/* LiteralNumberIntegerLong */ .chroma .il { color: #009999 }
+/* LiteralNumberOct */ .chroma .mo { color: #009999 }
+/* Operator */ .chroma .o { color: #000000; font-weight: bold }
+/* OperatorWord */ .chroma .ow { color: #000000; font-weight: bold }
+/* Punctuation */ .chroma .p { }
+/* Comment */ .chroma .c { color: #999988; font-style: italic }
+/* CommentHashbang */ .chroma .ch { color: #999988; font-style: italic }
+/* CommentMultiline */ .chroma .cm { color: #999988; font-style: italic }
+/* CommentSingle */ .chroma .c1 { color: #999988; font-style: italic }
+/* CommentSpecial */ .chroma .cs { color: #999999; font-weight: bold; font-style: italic }
+/* CommentPreproc */ .chroma .cp { color: #999999; font-weight: bold; font-style: italic }
+/* CommentPreprocFile */ .chroma .cpf { color: #999999; font-weight: bold; font-style: italic }
+/* Generic */ .chroma .g { }
+/* GenericDeleted */ .chroma .gd { color: #000000; background-color: #ffdddd }
+/* GenericEmph */ .chroma .ge { color: #000000; font-style: italic }
+/* GenericError */ .chroma .gr { color: #aa0000 }
+/* GenericHeading */ .chroma .gh { color: #999999 }
+/* GenericInserted */ .chroma .gi { color: #000000; background-color: #ddffdd }
+/* GenericOutput */ .chroma .go { color: #888888 }
+/* GenericPrompt */ .chroma .gp { color: #555555 }
+/* GenericStrong */ .chroma .gs { font-weight: bold }
+/* GenericSubheading */ .chroma .gu { color: #aaaaaa }
+/* GenericTraceback */ .chroma .gt { color: #aa0000 }
+/* GenericUnderline */ .chroma .gl { text-decoration: underline }
+/* TextWhitespace */ .chroma .w { color: #bbbbbb }
diff --git a/docs/assets/search-data.js b/docs/assets/search-data.js
new file mode 100644
index 0000000000000..620fc380cf2b7
--- /dev/null
+++ b/docs/assets/search-data.js
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+(function () {
+ const indexCfg = {{ with i18n "bookSearchConfig" }}
+ {{ . }};
+ {{ else }}
+ {};
+ {{ end }}
+
+ indexCfg.doc = {
+ id: 'id',
+ field: ['title', 'content'],
+ store: ['title', 'href', 'section'],
+ };
+
+ const index = FlexSearch.create('balance', indexCfg);
+ window.bookSearchIndex = index;
+
+ {{- $pages := where .Site.Pages "Kind" "in" (slice "page" "section") -}}
+ {{- $pages = where $pages "Params.booksearchexclude" "!=" true -}}
+ {{- $pages = where $pages "Content" "not in" (slice nil "") -}}
+
+ {{ range $index, $page := $pages }}
+ index.add({
+ 'id': {{ $index }},
+ 'href': '{{ $page.RelPermalink }}',
+ 'title': {{ (partial "docs/simple-title" $page) | jsonify }},
+ 'section': {{ (partial "docs/simple-title" $page.Parent) | jsonify }},
+ 'content': {{ $page.Plain | jsonify }}
+ });
+ {{- end -}}
+})();
diff --git a/docs/build_docs.sh b/docs/build_docs.sh
index b2ceff4e66016..36eba0cc2de51 100755
--- a/docs/build_docs.sh
+++ b/docs/build_docs.sh
@@ -17,94 +17,12 @@
# limitations under the License.
################################################################################
-RUBY=${RUBY:-ruby}
-GEM=${GEM:-gem}
-CACHE_DIR=${CACHE_DIR:-".rubydeps"}
-
-set -e
-cd "$(dirname ${BASH_SOURCE[0]})"
-
-DIR="`pwd`"
-
-# We need at least bundler to proceed
-if [ "`command -v bundle`" == "" ]; then
- RUBYGEM_BINDIR=""
-
- # Adjust the PATH to discover locally installed ruby gem binaries
- export PATH="$(${RUBY} -e 'puts Gem.user_dir')/bin:$PATH"
-
- if [ "`command -v bundle`" == "" ]; then
- echo "WARN: Could not find bundle."
- echo "Attempting to install locally. If this doesn't work, please install with 'gem install bundler'."
-
- # install bundler locally
- ${GEM} install --user-install --no-format-executable bundler
- fi
+if ! command -v hugo &> /dev/null
+then
+ echo "Hugo must be installed to run the docs locally"
+ echo "Please see docs/README.md for more details"
+ exit 1
fi
+git submodule update --init --recursive
-# Install Ruby dependencies locally
-bundle install --path ${CACHE_DIR}
-
-DOCS_SRC=${DIR}
-DOCS_DST=${DOCS_SRC}/content
-
-# default jekyll command is to just build site
-JEKYLL_CMD="build"
-
-JEKYLL_CONFIG=""
-
-# config options that only apply to the barebone "build" without any arguments.
-JEKYLL_BUILD_CONFIG=${JEKYLL_BUILD_CONFIG:-}
-
-DOC_LANGUAGES="en zh"
-
-# if -p flag is provided, serve site on localhost
-# -i is like -p, but incremental (only rebuilds the modified file)
-# -e builds only english documentation
-# -z builds only chinese documentation
-while getopts "piez" opt; do
- case $opt in
- p)
- JEKYLL_CMD="serve --baseurl= --watch"
- ;;
- i)
- [[ `${RUBY} -v` =~ 'ruby 1' ]] && echo "Error: building the docs with the incremental option requires at least ruby 2.0" && exit 1
- JEKYLL_CMD="serve --baseurl= --watch --incremental"
- ;;
- e)
- JEKYLL_CONFIG="--config _config.yml,_config_dev_en.yml"
- ;;
- z)
- JEKYLL_CONFIG="--config _config.yml,_config_dev_zh.yml"
- ;;
- *) echo "usage: $0 [-e|-z] [-i|-p]" >&2
- exit 1 ;;
- esac
-done
-
-# use 'bundle exec' to insert the local Ruby dependencies
-
-if [ "${JEKYLL_CMD}" = "build" ] && [ -z "${JEKYLL_CONFIG}" ]; then
- # run parallel builds for all languages if not serving or creating a single language only
-
- # run processes and store pids
- echo "Spawning parallel builds for languages: ${DOC_LANGUAGES}..."
- pids=""
- for lang in ${DOC_LANGUAGES}; do
- bundle exec jekyll ${JEKYLL_CMD} ${JEKYLL_BUILD_CONFIG} --config _config.yml,_config_dev_${lang}.yml --source "${DOCS_SRC}" --destination "${DOCS_DST}_${lang}" &
- pid=$!
- pids="${pids} ${pid}"
- done
-
- # wait for all pids (since jekyll returns 0 even in case of failures, we do not parse exit codes)
- wait ${pids}
- rm -rf "${DOCS_DST}"
- mkdir -p "${DOCS_DST}"
- for lang in ${DOC_LANGUAGES}; do
- cp -aln "${DOCS_DST}_${lang}/." "${DOCS_DST}"
- rm -rf "${DOCS_DST}_${lang}"
- done
- exit 0
-else
- bundle exec jekyll ${JEKYLL_CMD} ${JEKYLL_CONFIG} --source "${DOCS_SRC}" --destination "${DOCS_DST}"
-fi
+hugo -b "" serve
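For reference, a minimal local workflow with the new Hugo-based script might look like the sketch below. The assumption that Hugo is already on the PATH and serves on its default port 1313 comes from a stock Hugo setup, not from this diff; docs/README.md remains the authoritative instructions.

```bash
# Sketch: build and serve the docs locally with the new script.
# Assumes Hugo is installed and uses its default port 1313.
cd docs
./build_docs.sh
# then open http://localhost:1313 in a browser
```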
diff --git a/docs/check_links.sh b/docs/check_links.sh
deleted file mode 100755
index dbe3766df96ef..0000000000000
--- a/docs/check_links.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env bash
-################################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-DOCS_CHECK_DIR="`dirname \"$0\"`" # relative
-DOCS_CHECK_DIR="`( cd \"$DOCS_CHECK_DIR\" && pwd -P)`" # absolutized and normalized
-if [ -z "$DOCS_CHECK_DIR" ] ; then
- # error; for some reason, the path is not accessible
- # to the script (e.g. permissions re-evaled after suid)
- exit 1 # fail
-fi
-
-echo "Check docs directory: $DOCS_CHECK_DIR"
-
-target=${1:-"http://localhost:4000"}
-
-# Crawl the docs, ignoring robots.txt, storing nothing locally
-wget --spider -r -nd -nv -e robots=off -p -o $DOCS_CHECK_DIR/spider.log "$target"
-
-# Abort for anything other than 0 and 4 ("Network failure")
-status=$?
-
-if [ $status -ne 0 ] && [ $status -ne 4 ]; then
- exit $status
-fi
-
-# Fail the build if any broken links are found
-no_broken_links_str_count=$(grep 'Found no broken links' $DOCS_CHECK_DIR/spider.log | wc -l)
-if [ $no_broken_links_str_count -ne 1 ]; then
- grep -B 1 "Remote file does not exist -- broken link!!!" $DOCS_CHECK_DIR/spider.log
- echo "---------------------------------------------------------------------------"
- echo "Check the spider.log file for errors!"
- exit 1
-fi
-
-echo 'All links in docs are valid!'
-exit 0
diff --git a/docs/concepts/flink-architecture.md b/docs/concepts/flink-architecture.md
deleted file mode 100644
index eb6bf00d9cfb1..0000000000000
--- a/docs/concepts/flink-architecture.md
+++ /dev/null
@@ -1,252 +0,0 @@
----
-title: Flink Architecture
-nav-id: flink-architecture
-nav-pos: 4
-nav-title: Flink Architecture
-nav-parent_id: concepts
----
-
-
-Flink is a distributed system and requires effective allocation and management
-of compute resources in order to execute streaming applications. It integrates
-with all common cluster resource managers such as [Hadoop
-YARN](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html),
-[Apache Mesos](https://mesos.apache.org/) and
-[Kubernetes](https://kubernetes.io/), but can also be set up to run as a
-standalone cluster or even as a library.
-
-This section contains an overview of Flink’s architecture and describes how its
-main components interact to execute applications and recover from failures.
-
-* This will be replaced by the TOC
-{:toc}
-
-## Anatomy of a Flink Cluster
-
-The Flink runtime consists of two types of processes: a _JobManager_ and one or more _TaskManagers_.
-
-
-
-The *Client* is not part of the runtime and program execution, but is used to
-prepare and send a dataflow to the JobManager. After that, the client can
-disconnect (_detached mode_), or stay connected to receive progress reports
-(_attached mode_). The client runs either as part of the Java/Scala program
-that triggers the execution, or in the command line process `./bin/flink run
-...`.
-
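A hedged sketch of the two modes just described: the example JAR path and the `--detached` flag are assumptions about a standard Flink distribution, so adjust them to your setup.

```bash
# Sketch: submitting a job in attached vs. detached mode with the flink CLI.
# JAR path and flag spelling are assumptions about a standard distribution.
./bin/flink run ./examples/streaming/TopSpeedWindowing.jar             # attached: the client stays connected
./bin/flink run --detached ./examples/streaming/TopSpeedWindowing.jar  # detached: the client returns after submission
```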
-The JobManager and TaskManagers can be started in various ways: directly on
-the machines as a [standalone cluster]({% link
-deployment/resource-providers/standalone/index.md %}), in containers, or managed by resource
-frameworks like [YARN]({% link deployment/resource-providers/yarn.md
-%}) or [Mesos]({% link deployment/resource-providers/mesos.md %}).
-TaskManagers connect to JobManagers, announcing themselves as available, and
-are assigned work.
-
-### JobManager
-
-The _JobManager_ has a number of responsibilities related to coordinating the distributed execution of Flink Applications:
-it decides when to schedule the next task (or set of tasks), reacts to finished
-tasks or execution failures, coordinates checkpoints, and coordinates recovery on
-failures, among others. This process consists of three different components:
-
- * **ResourceManager**
-
- The _ResourceManager_ is responsible for resource de-/allocation and
- provisioning in a Flink cluster — it manages **task slots**, which are the
- unit of resource scheduling in a Flink cluster (see [TaskManagers](#taskmanagers)).
- Flink implements multiple ResourceManagers for different environments and
- resource providers such as YARN, Mesos, Kubernetes and standalone
- deployments. In a standalone setup, the ResourceManager can only distribute
- the slots of available TaskManagers and cannot start new TaskManagers on
- its own.
-
- * **Dispatcher**
-
- The _Dispatcher_ provides a REST interface to submit Flink applications for
- execution and starts a new JobMaster for each submitted job. It
- also runs the Flink WebUI to provide information about job executions.
-
- * **JobMaster**
-
- A _JobMaster_ is responsible for managing the execution of a single
- [JobGraph]({% link concepts/glossary.md %}#logical-graph).
- Multiple jobs can run simultaneously in a Flink cluster, each having its
- own JobMaster.
-
-There is always at least one JobManager. A high-availability setup might have
-multiple JobManagers, one of which is always the *leader*, and the others are
-*standby* (see [High Availability (HA)]({% link deployment/ha/index.md %})).
-
-### TaskManagers
-
-The *TaskManagers* (also called *workers*) execute the tasks of a dataflow, and buffer and exchange the data
-streams.
-
-There must always be at least one TaskManager. The smallest unit of resource scheduling in a TaskManager is a task _slot_. The number of task slots in a
-TaskManager indicates the number of concurrent processing tasks. Note that
-multiple operators may execute in a task slot (see [Tasks and Operator
-Chains](#tasks-and-operator-chains)).
-
-{% top %}
-
-## Tasks and Operator Chains
-
-For distributed execution, Flink *chains* operator subtasks together into
-*tasks*. Each task is executed by one thread. Chaining operators together into
-tasks is a useful optimization: it reduces the overhead of thread-to-thread
-handover and buffering, and increases overall throughput while decreasing
-latency. The chaining behavior can be configured; see the [chaining docs]({%
-link dev/stream/operators/index.md %}#task-chaining-and-resource-groups) for details.
-
-The sample dataflow in the figure below is executed with five subtasks, and
-hence with five parallel threads.
-
-
-
-{% top %}
-
-## Task Slots and Resources
-
-Each worker (TaskManager) is a *JVM process*, and may execute one or more
-subtasks in separate threads. To control how many tasks a TaskManager accepts, it
-has so called **task slots** (at least one).
-
-Each *task slot* represents a fixed subset of resources of the TaskManager. A
-TaskManager with three slots, for example, will dedicate 1/3 of its managed
-memory to each slot. Slotting the resources means that a subtask will not
-compete with subtasks from other jobs for managed memory, but instead has a
-certain amount of reserved managed memory. Note that no CPU isolation happens
-here; currently slots only separate the managed memory of tasks.
-
-By adjusting the number of task slots, users can define how subtasks are
-isolated from each other. Having one slot per TaskManager means that each task
-group runs in a separate JVM (which can be started in a separate container, for
-example). Having multiple slots means more subtasks share the same JVM. Tasks
-in the same JVM share TCP connections (via multiplexing) and heartbeat
-messages. They may also share data sets and data structures, thus reducing the
-per-task overhead.
-
-
-
-By default, Flink allows subtasks to share slots even if they are subtasks of
-different tasks, so long as they are from the same job. The result is that one
-slot may hold an entire pipeline of the job. Allowing this *slot sharing* has
-two main benefits:
-
- - A Flink cluster needs exactly as many task slots as the highest parallelism
- used in the job. No need to calculate how many tasks (with varying
- parallelism) a program contains in total.
-
- - It is easier to get better resource utilization. Without slot sharing, the
- non-intensive *source/map()* subtasks would block as many resources as the
- resource intensive *window* subtasks. With slot sharing, increasing the
- base parallelism in our example from two to six yields full utilization of
- the slotted resources, while making sure that the heavy subtasks are fairly
- distributed among the TaskManagers.
-
-
-
-## Flink Application Execution
-
-A _Flink Application_ is any user program that spawns one or multiple Flink
-jobs from its ``main()`` method. The execution of these jobs can happen in a
-local JVM (``LocalEnvironment``) or on a remote setup of clusters with multiple
-machines (``RemoteEnvironment``). For each program, the
-[``ExecutionEnvironment``]({{ site.javadocs_baseurl }}/api/java/) provides methods to
-control the job execution (e.g. setting the parallelism) and to interact with
-the outside world (see [Anatomy of a Flink Program]({%
-link dev/datastream_api.md %}#anatomy-of-a-flink-program)).
-
-The jobs of a Flink Application can either be submitted to a long-running
-[Flink Session Cluster]({%
-link concepts/glossary.md %}#flink-session-cluster), a dedicated [Flink Job
-Cluster]({% link concepts/glossary.md %}#flink-job-cluster), or a
-[Flink Application Cluster]({%
-link concepts/glossary.md %}#flink-application-cluster). The difference between
-these options is mainly related to the cluster’s lifecycle and to resource
-isolation guarantees.
-
-### Flink Session Cluster
-
-* **Cluster Lifecycle**: in a Flink Session Cluster, the client connects to a
- pre-existing, long-running cluster that can accept multiple job submissions.
- Even after all jobs are finished, the cluster (and the JobManager) will
- keep running until the session is manually stopped. The lifetime of a Flink
- Session Cluster is therefore not bound to the lifetime of any Flink Job.
-
-* **Resource Isolation**: TaskManager slots are allocated by the
- ResourceManager on job submission and released once the job is finished.
- Because all jobs are sharing the same cluster, there is some competition for
- cluster resources — like network bandwidth in the submit-job phase. One
- limitation of this shared setup is that if one TaskManager crashes, then all
- jobs that have tasks running on this TaskManager will fail; in a similar way, if
- some fatal error occurs on the JobManager, it will affect all jobs running
- in the cluster.
-
-* **Other considerations**: having a pre-existing cluster saves a considerable
- amount of time applying for resources and starting TaskManagers. This is
- important in scenarios where the execution time of jobs is very short and a
- high startup time would negatively impact the end-to-end user experience — as
- is the case with interactive analysis of short queries, where it is desirable
- that jobs can quickly perform computations using existing resources.
-
-
Note: Formerly, a Flink Session Cluster was also known as a Flink Cluster in session mode.
-
-### Flink Job Cluster
-
-* **Cluster Lifecycle**: in a Flink Job Cluster, the available cluster manager
- (like YARN or Kubernetes) is used to spin up a cluster for each submitted job
- and this cluster is available to that job only. Here, the client first
- requests resources from the cluster manager to start the JobManager and
- submits the job to the Dispatcher running inside this process. TaskManagers
- are then lazily allocated based on the resource requirements of the job. Once
- the job is finished, the Flink Job Cluster is torn down.
-
-* **Resource Isolation**: a fatal error in the JobManager only affects the one job running in that Flink Job Cluster.
-
-* **Other considerations**: because the ResourceManager has to apply and wait
- for external resource management components to start the TaskManager
- processes and allocate resources, Flink Job Clusters are more suited to large
- jobs that are long-running, have high-stability requirements and are not
- sensitive to longer startup times.
-
-
Note: Formerly, a Flink Job Cluster was also known as a Flink Cluster in job (or per-job) mode.
-
-### Flink Application Cluster
-
-* **Cluster Lifecycle**: a Flink Application Cluster is a dedicated Flink
- cluster that only executes jobs from one Flink Application and where the
- ``main()`` method runs on the cluster rather than the client. The job
- submission is a one-step process: you don’t need to start a Flink cluster
- first and then submit a job to the existing cluster session; instead, you
- package your application logic and dependencies into an executable job JAR and
- the cluster entrypoint (``ApplicationClusterEntryPoint``)
- is responsible for calling the ``main()`` method to extract the JobGraph.
- This allows you to deploy a Flink Application like any other application on
- Kubernetes, for example. The lifetime of a Flink Application Cluster is
- therefore bound to the lifetime of the Flink Application.
-
-* **Resource Isolation**: in a Flink Application Cluster, the ResourceManager
- and Dispatcher are scoped to a single Flink Application, which provides a
- better separation of concerns than the Flink Session Cluster.
-
-
Note: A Flink Job Cluster can be seen as a “run-on-client” alternative to Flink Application Clusters.
-
-{% top %}
diff --git a/docs/concepts/glossary.md b/docs/concepts/glossary.md
deleted file mode 100644
index c45cf1fc052de..0000000000000
--- a/docs/concepts/glossary.md
+++ /dev/null
@@ -1,190 +0,0 @@
----
-title: Glossary
-nav-pos: 10
-nav-title: Glossary
-nav-parent_id: concepts
----
-
-
-#### Flink Application Cluster
-
-A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that
-only executes [Flink Jobs](#flink-job) from one [Flink
-Application](#flink-application). The lifetime of the [Flink
-Cluster](#flink-cluster) is bound to the lifetime of the Flink Application.
-
-#### Flink Job Cluster
-
-A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only
-executes a single [Flink Job](#flink-job). The lifetime of the
-[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job.
-
-#### Flink Cluster
-
-A distributed system consisting of (typically) one [JobManager](#flink-jobmanager) and one or more
-[Flink TaskManager](#flink-taskmanager) processes.
-
-#### Event
-
-An event is a statement about a change of the state of the domain modelled by the
-application. Events can be input and/or output of a stream or batch processing application.
-Events are special types of [records](#Record).
-
-#### ExecutionGraph
-
-see [Physical Graph](#physical-graph)
-
-#### Function
-
-Functions are implemented by the user and encapsulate the
-application logic of a Flink program. Most Functions are wrapped by a corresponding
-[Operator](#operator).
-
-#### Instance
-
-The term *instance* is used to describe a specific instance of a specific type (usually
-[Operator](#operator) or [Function](#function)) during runtime. As Apache Flink is mostly written in
-Java, this corresponds to the definition of *Instance* or *Object* in Java. In the context of Apache
-Flink, the term *parallel instance* is also frequently used to emphasize that multiple instances of
-the same [Operator](#operator) or [Function](#function) type are running in parallel.
-
-#### Flink Application
-
-A Flink application is a Java Application that submits one or multiple [Flink
-Jobs](#flink-job) from the `main()` method (or by some other means). Submitting
-jobs is usually done by calling `execute()` on an execution environment.
-
-The jobs of an application can either be submitted to a long running [Flink
-Session Cluster](#flink-session-cluster), to a dedicated [Flink Application
-Cluster](#flink-application-cluster), or to a [Flink Job
-Cluster](#flink-job-cluster).
-
-#### Flink Job
-
-A Flink Job is the runtime representation of a [logical graph](#logical-graph)
-(also often called dataflow graph) that is created and submitted by calling
-`execute()` in a [Flink Application](#flink-application).
-
-#### JobGraph
-
-see [Logical Graph](#logical-graph)
-
-#### Flink JobManager
-
-The JobManager is the orchestrator of a [Flink Cluster](#flink-cluster). It contains three distinct
-components: Flink Resource Manager, Flink Dispatcher and one [Flink JobMaster](#flink-jobmaster)
-per running [Flink Job](#flink-job).
-
-#### Flink JobMaster
-
-JobMasters are one of the components running in the [JobManager](#flink-jobmanager). A JobMaster is
-responsible for supervising the execution of the [Tasks](#task) of a single job.
-
-#### Logical Graph
-
-A logical graph is a directed graph where the nodes are [Operators](#operator)
-and the edges define input/output-relationships of the operators and correspond
-to data streams or data sets. A logical graph is created by submitting jobs
-from a [Flink Application](#flink-application).
-
-Logical graphs are also often referred to as *dataflow graphs*.
-
-#### Managed State
-
-Managed State describes application state which has been registered with the framework. For
-Managed State, Apache Flink will take care about persistence and rescaling among other things.
-
-#### Operator
-
-Node of a [Logical Graph](#logical-graph). An Operator performs a certain operation, which is
-usually executed by a [Function](#function). Sources and Sinks are special Operators for data
-ingestion and data egress.
-
-#### Operator Chain
-
-An Operator Chain consists of two or more consecutive [Operators](#operator) without any
-repartitioning in between. Operators within the same Operator Chain forward records to each other
-directly without going through serialization or Flink's network stack.
-
-#### Partition
-
-A partition is an independent subset of the overall data stream or data set. A data stream or
-data set is divided into partitions by assigning each [record](#Record) to one or more partitions.
-Partitions of data streams or data sets are consumed by [Tasks](#task) during runtime. A
-transformation which changes the way a data stream or data set is partitioned is often called
-repartitioning.
-
-#### Physical Graph
-
-A physical graph is the result of translating a [Logical Graph](#logical-graph) for execution in a
-distributed runtime. The nodes are [Tasks](#task) and the edges indicate input/output-relationships
-or [partitions](#partition) of data streams or data sets.
-
-#### Record
-
-Records are the constituent elements of a data set or data stream. [Operators](#operator) and
-[Functions](#Function) receive records as input and emit records as output.
-
-#### (Runtime) Execution Mode
-
-DataStream API programs can be executed in one of two execution modes: `BATCH`
-or `STREAMING`. See [Execution Mode]({% link dev/datastream_execution_mode.md
-%}) for more details.
-
-#### Flink Session Cluster
-
-A long-running [Flink Cluster](#flink-cluster) which accepts multiple [Flink Jobs](#flink-job) for
-execution. The lifetime of this Flink Cluster is not bound to the lifetime of any Flink Job.
-Formerly, a Flink Session Cluster was also known as a Flink Cluster in *session mode*. Compare to
-[Flink Application Cluster](#flink-application-cluster).
-
-#### State Backend
-
-For stream processing programs, the State Backend of a [Flink Job](#flink-job) determines how its
-[state](#managed-state) is stored on each TaskManager (Java Heap of TaskManager or (embedded)
-RocksDB) as well as where it is written upon a checkpoint (Java Heap of
-[JobManager](#flink-jobmanager) or Filesystem).
-
-#### Sub-Task
-
-A Sub-Task is a [Task](#task) responsible for processing a [partition](#partition) of
-the data stream. The term "Sub-Task" emphasizes that there are multiple parallel Tasks for the same
-[Operator](#operator) or [Operator Chain](#operator-chain).
-
-#### Task
-
-Node of a [Physical Graph](#physical-graph). A task is the basic unit of work, which is executed by
-Flink's runtime. Tasks encapsulate exactly one parallel instance of an
-[Operator](#operator) or [Operator Chain](#operator-chain).
-
-#### Flink TaskManager
-
-TaskManagers are the worker processes of a [Flink Cluster](#flink-cluster). [Tasks](#task) are
-scheduled to TaskManagers for execution. They communicate with each other to exchange data between
-subsequent Tasks.
-
-#### Transformation
-
-A Transformation is applied on one or more data streams or data sets and results in one or more
-output data streams or data sets. A transformation might change a data stream or data set on a
-per-record basis, but might also only change its partitioning or perform an aggregation. While
-[Operators](#operator) and [Functions](#function) are the "physical" parts of Flink's API,
-Transformations are only an API concept. Specifically, most transformations are
-implemented by certain [Operators](#operator).
diff --git a/docs/concepts/glossary.zh.md b/docs/concepts/glossary.zh.md
deleted file mode 100644
index dc78acaeb0ec8..0000000000000
--- a/docs/concepts/glossary.zh.md
+++ /dev/null
@@ -1,144 +0,0 @@
----
-title: 词汇表
-nav-pos: 10
-nav-title: 词汇表
-nav-parent_id: concepts
----
-
-
-#### Flink Application Cluster
-
-A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that
-only executes [Flink Jobs](#flink-job) from one [Flink
-Application](#flink-application). The lifetime of the [Flink
-Cluster](#flink-cluster) is bound to the lifetime of the Flink Application.
-
-#### Flink Job Cluster
-
-A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only
-executes a single [Flink Job](#flink-job). The lifetime of the
-[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job.
-
-#### Flink Cluster
-
-In general, a Flink Cluster is a distributed system consisting of one [Flink JobManager](#flink-jobmanager) process and one or more [Flink TaskManager](#flink-taskmanager) processes.
-
-#### Event
-
-An Event is a statement about a change of the state of the domain modelled by the application. Events can be the input and/or the output of a stream or batch processing application. Events are special types of [Records](#record).
-
-#### ExecutionGraph
-
-See [Physical Graph](#physical-graph).
-
-#### Function
-
-Functions are implemented by the user and encapsulate the application logic of a Flink program. Most Functions are wrapped by a corresponding [Operator](#operator).
-
-#### Instance
-
-The term *instance* is used to describe a specific instance of a specific type (usually an [Operator](#operator) or [Function](#function)) during runtime. As Apache Flink is mostly written in Java, this corresponds to the definition of an *Instance* or *Object* in Java. In the context of Apache Flink, the term *parallel instance* is also frequently used to emphasize that multiple instances of the same [Operator](#operator) or [Function](#function) type are running in parallel.
-
-#### Flink Application
-
-A Flink application is a Java Application that submits one or multiple [Flink
-Jobs](#flink-job) from the `main()` method (or by some other means). Submitting
-jobs is usually done by calling `execute()` on an execution environment.
-
-The jobs of an application can either be submitted to a long running [Flink
-Session Cluster](#flink-session-cluster), to a dedicated [Flink Application
-Cluster](#flink-application-cluster), or to a [Flink Job
-Cluster](#flink-job-cluster).
-
-#### Flink Job
-
-A Flink Job is the runtime representation of a [logical graph](#logical-graph)
-(also often called dataflow graph) that is created and submitted by calling
-`execute()` in a [Flink Application](#flink-application).
-
-#### JobGraph
-
-See [Logical Graph](#logical-graph).
-
-#### Flink JobManager
-
-The Flink JobManager is the master of a [Flink Cluster](#flink-cluster). It contains three distinct components: the Flink Resource Manager, the Flink Dispatcher, and one [Flink JobMaster](#flink-jobmaster) per running [Flink Job](#flink-job).
-
-
-#### Flink JobMaster
-
-The JobMaster is one of the components running in the [Flink JobManager](#flink-jobmanager). It is responsible for supervising the execution of the [Tasks](#task) of a single job. Historically, the whole [Flink JobManager](#flink-jobmanager) was called the JobManager.
-
-#### Logical Graph
-
-A logical graph is a directed graph where the nodes are [Operators](#operator)
-and the edges define input/output-relationships of the operators and correspond
-to data streams or data sets. A logical graph is created by submitting jobs
-from a [Flink Application](#flink-application).
-
-Logical graphs are also often referred to as *dataflow graphs*.
-
-#### Managed State
-
-Managed State describes application state which has been registered with the framework. For managed state, Apache Flink takes care of, among other things, persistence and rescaling.
-
-#### Operator
-
-A node of a [Logical Graph](#logical-graph). An Operator performs a certain operation, which is usually executed by a [Function](#function). Sources and Sinks are special Operators for data ingestion and data egress.
-
-#### Operator Chain
-
-An Operator Chain consists of two or more consecutive [Operators](#operator) without any repartitioning in between. Operators within the same Operator Chain forward records to each other directly without going through serialization or Flink's network stack.
-
-#### Partition
-
-A partition is an independent subset of the overall data stream or data set. A data stream or data set is divided into partitions by assigning each [Record](#record) to one or more partitions. Partitions of data streams or data sets are consumed by [Tasks](#task) during runtime. A transformation which changes the way a data stream or data set is partitioned is often called repartitioning.
-
-#### Physical Graph
-
-A physical graph is the result of translating a [Logical Graph](#logical-graph) for execution in a distributed runtime. The nodes are [Tasks](#task) and the edges indicate input/output relationships or [partitions](#partition) of data streams or data sets.
-
-#### Record
-
-Records are the constituent elements of a data set or data stream. [Operators](#operator) and [Functions](#function) receive records as input and emit records as output.
-
-#### Flink Session Cluster
-
-A long-running [Flink Cluster](#flink-cluster) which accepts multiple [Flink Jobs](#flink-job) for execution. The lifetime of this [Flink Cluster](#flink-cluster) is not bound to the lifetime of any [Flink Job](#flink-job). Formerly, a Flink Session Cluster was also known as a [Flink Cluster](#flink-cluster) in *session mode*. Compare to [Flink Application Cluster](#flink-application-cluster).
-
-#### State Backend
-
-For stream processing programs, the State Backend of a [Flink Job](#flink-job) determines how its [state](#managed-state) is stored on each TaskManager (Java heap of the TaskManager or (embedded) RocksDB) as well as where it is written upon a checkpoint (Java heap of the [Flink JobManager](#flink-jobmanager) or a filesystem).
-
-#### Sub-Task
-
-A Sub-Task is a [Task](#task) responsible for processing a [partition](#partition) of the data stream. The term "Sub-Task" emphasizes that there are multiple parallel Tasks for the same [Operator](#operator) or [Operator Chain](#operator-chain).
-
-#### Task
-
-A node of a [Physical Graph](#physical-graph). A Task is the basic unit of work, which is executed by Flink's runtime. Tasks encapsulate exactly one *parallel instance* of an [Operator](#operator) or [Operator Chain](#operator-chain).
-
-#### Flink TaskManager
-
-TaskManagers are the worker processes of a [Flink Cluster](#flink-cluster). [Tasks](#task) are scheduled to TaskManagers for execution. They communicate with each other to exchange data between subsequent Tasks.
-
-#### Transformation
-
-A Transformation is applied on one or more data streams or data sets and results in one or more output data streams or data sets. A transformation might change a data stream or data set on a per-record basis, but might also only change its partitioning or perform an aggregation. While [Operators](#operator) and [Functions](#function) are the "physical" parts of Flink's API, Transformations are only an API concept. Specifically, most (but not all) transformations are implemented by certain [Operators](#operator).
diff --git a/docs/concepts/index.md b/docs/concepts/index.md
deleted file mode 100644
index c17113d8fb0c1..0000000000000
--- a/docs/concepts/index.md
+++ /dev/null
@@ -1,89 +0,0 @@
----
-title: Concepts
-nav-id: concepts
-nav-pos: 3
-nav-title: ' Concepts'
-nav-parent_id: root
-nav-show_overview: true
-permalink: /concepts/index.html
----
-
-
-The [Hands-on Training]({% link learn-flink/index.md %}) explains the basic concepts
-of stateful and timely stream processing that underlie Flink's APIs, and provides examples of how
-these mechanisms are used in applications. Stateful stream processing is introduced in the context
-of [Data Pipelines & ETL]({% link learn-flink/etl.md %}#stateful-transformations)
-and is further developed in the section on [Fault Tolerance]({% link learn-flink/fault_tolerance.md %}). Timely stream processing is introduced in the section on
-[Streaming Analytics]({% link learn-flink/streaming_analytics.md %}).
-
-This _Concepts in Depth_ section provides a deeper understanding of how Flink's architecture and runtime
-implement these concepts.
-
-## Flink's APIs
-
-Flink offers different levels of abstraction for developing streaming/batch applications.
-
-
-
- - The lowest level abstraction simply offers **stateful and timely stream processing**. It is
- embedded into the [DataStream API]({% link dev/datastream_api.md %}) via the [Process
- Function]({% link dev/stream/operators/process_function.md %}). It allows
- users to freely process events from one or more streams, and provides consistent, fault tolerant
- *state*. In addition, users can register event time and processing time callbacks, allowing
- programs to realize sophisticated computations.
-
- - In practice, many applications do not need the low-level
- abstractions described above, and can instead program against the **Core APIs**: the
- [DataStream API]({% link dev/datastream_api.md %})
- (bounded/unbounded streams) and the [DataSet API]({% link
- dev/batch/index.md %}) (bounded data sets). These fluent APIs offer the
- common building blocks for data processing, like various forms of
- user-specified transformations, joins, aggregations, windows, state, etc.
- Data types processed in these APIs are represented as classes in the
- respective programming languages.
-
- The low level *Process Function* integrates with the *DataStream API*,
- making it possible to use the lower-level abstraction on an as-needed basis.
- The *DataSet API* offers additional primitives on bounded data sets,
- like loops/iterations.
-
- - The **Table API** is a declarative DSL centered around *tables*, which may
- be dynamically changing tables (when representing streams). The [Table
- API]({% link dev/table/index.md %}) follows the
- (extended) relational model: Tables have a schema attached (similar to
- tables in relational databases) and the API offers comparable operations,
- such as select, project, join, group-by, aggregate, etc. Table API
- programs declaratively define *what logical operation should be done*
- rather than specifying exactly *how the code for the operation looks*.
- Though the Table API is extensible by various types of user-defined
- functions, it is less expressive than the *Core APIs*, and more concise to
- use (less code to write). In addition, Table API programs also go through
- an optimizer that applies optimization rules before execution.
-
- One can seamlessly convert between tables and *DataStream*/*DataSet*,
- allowing programs to mix the *Table API* with the *DataStream* and
- *DataSet* APIs.
-
- - The highest level abstraction offered by Flink is **SQL**. This abstraction
- is similar to the *Table API* both in semantics and expressiveness, but
- represents programs as SQL query expressions. The [SQL](
- {% link dev/table/index.md %}#sql) abstraction closely interacts with the
- Table API, and SQL queries can be executed over tables defined in the
- *Table API*.
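-
-As a rough sketch of how these levels can be mixed (API names follow the Java Table API
-bridge around Flink 1.11/1.12 and are therefore version-dependent; the data and the query
-are placeholders), a DataStream can be registered as a table and queried with SQL:
-
-{% highlight java %}
-import org.apache.flink.api.java.tuple.Tuple2;
-import org.apache.flink.streaming.api.datastream.DataStream;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.apache.flink.table.api.Table;
-import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
-import org.apache.flink.types.Row;
-
-public class ApiLevelsExample {
-    public static void main(String[] args) throws Exception {
-        // Core API level: a DataStream of words.
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-        DataStream<String> words = env.fromElements("flink", "table", "sql", "flink");
-
-        // Table API level: interpret the stream as a (dynamically changing) table.
-        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
-        tableEnv.createTemporaryView("Words", words);
-
-        // SQL level: run a query over the table defined above.
-        Table counts = tableEnv.sqlQuery(
-            "SELECT f0 AS word, COUNT(*) AS cnt FROM Words GROUP BY f0");
-
-        // Back to the DataStream API; the result is an updating (retract) stream.
-        DataStream<Tuple2<Boolean, Row>> result = tableEnv.toRetractStream(counts, Row.class);
-        result.print();
-
-        env.execute("api-levels-example");
-    }
-}
-{% endhighlight %}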
diff --git a/docs/concepts/index.zh.md b/docs/concepts/index.zh.md
deleted file mode 100644
index a6efcf294e49d..0000000000000
--- a/docs/concepts/index.zh.md
+++ /dev/null
@@ -1,49 +0,0 @@
----
-title: Concepts in Depth
-nav-id: concepts
-nav-pos: 3
-nav-title: ' Concepts in Depth'
-nav-parent_id: root
-nav-show_overview: true
-permalink: /concepts/index.html
----
-
-
-The [Hands-on Training]({% link learn-flink/index.zh.md %}) chapter introduces the basic concepts of stateful and timely stream processing that underlie Flink's APIs, and gives examples of how these mechanisms are used in Flink applications. Stateful stream processing is introduced in the [Data Pipelines & ETL]({% link learn-flink/etl.zh.md %}#stateful-transformations) section and developed further in the [Fault Tolerance]({% link learn-flink/fault_tolerance.zh.md %}) section. Timely stream processing is introduced in the [Streaming Analytics]({% link learn-flink/streaming_analytics.zh.md %}) section.
-
-This chapter takes a deeper look at how Flink's architecture and distributed runtime implement these concepts.
-
-## Flink's APIs
-
-Flink offers different levels of abstraction for developing streaming/batch applications.
-
-
-
- - The lowest level of abstraction of the Flink API is **stateful and timely stream processing**. It is embedded into the [DataStream API]({% link dev/datastream_api.zh.md %}) via the [Process Function]({% link dev/stream/operators/process_function.zh.md %}). It allows users to freely process events from one or more streams and provides consistent, fault-tolerant *state*. In addition, users can register event time and processing time callbacks, allowing programs to realize sophisticated computations.
-
- - The second level of abstraction of the Flink API is the **Core APIs**. In practice, many applications do not need the low-level abstraction described above and can instead program against the **Core APIs**: the [DataStream API]({% link dev/datastream_api.zh.md %}) (for bounded/unbounded data streams) and the [DataSet API]({% link dev/batch/index.zh.md %}) (for bounded data sets). These fluent APIs offer the common building blocks for data processing, such as various forms of user-specified transformations, joins, aggregations, windows, and state. Data types processed in these APIs are represented as classes in the respective programming languages.
-
- The integration of the low-level *Process Function* with the *DataStream API* makes it possible to use the lower-level abstraction on an as-needed basis. The *DataSet API* additionally offers some primitives, such as loops/iterations, on bounded data sets.
-
- - The third level of abstraction of the Flink API is the **Table API**, a declarative DSL centered around *tables*, which may be dynamically changing tables (when representing streams). The [Table API]({% link dev/table/index.zh.md %}) follows the (extended) relational model: tables have a schema attached (similar to tables in relational databases), and the API offers comparable operations, such as select, project, join, group-by, and aggregate. Table API programs declaratively define *what logical operation should be done* rather than specifying exactly *how the code for the operation looks*. Although the Table API is concise to use and can be extended by various kinds of user-defined functions, it is less expressive than the Core APIs. In addition, Table API programs go through an optimizer that applies optimization rules to the user's expressions before execution.
-
- Tables and the *DataStream*/*DataSet* APIs can be converted back and forth seamlessly, and Flink allows users to mix the *Table API* with the *DataStream*/*DataSet* APIs when writing applications.
-
- - The highest level of abstraction offered by Flink is **SQL**. This abstraction is similar to the *Table API* both in semantics and expressiveness, but represents programs as SQL query expressions. The [SQL]({% link dev/table/index.zh.md %}#sql) abstraction interacts closely with the Table API, and SQL queries can be executed over tables defined in the *Table API*.
diff --git a/docs/concepts/stateful-stream-processing.md b/docs/concepts/stateful-stream-processing.md
deleted file mode 100644
index ba5bbb976d32f..0000000000000
--- a/docs/concepts/stateful-stream-processing.md
+++ /dev/null
@@ -1,370 +0,0 @@
----
-title: Stateful Stream Processing
-nav-id: stateful-stream-processing
-nav-pos: 2
-nav-title: Stateful Stream Processing
-nav-parent_id: concepts
----
-
-
-* This will be replaced by the TOC
-{:toc}
-
-## What is State?
-
-While many operations in a dataflow simply look at one individual *event at a
-time* (for example an event parser), some operations remember information
-across multiple events (for example window operators). These operations are
-called **stateful**.
-
-Some examples of stateful operations:
-
- - When an application searches for certain event patterns, the state will
- store the sequence of events encountered so far.
- - When aggregating events per minute/hour/day, the state holds the pending
- aggregates.
- - When training a machine learning model over a stream of data points, the
- state holds the current version of the model parameters.
- - When historic data needs to be managed, the state allows efficient access
- to events that occurred in the past.
-
-Flink needs to be aware of the state in order to make it fault tolerant using
-[checkpoints]({% link dev/stream/state/checkpointing.md %})
-and [savepoints]({%link ops/state/savepoints.md %}).
-
-Knowledge about the state also allows for rescaling Flink applications, meaning
-that Flink takes care of redistributing state across parallel instances.
-
-[Queryable state]({% link dev/stream/state/queryable_state.md
-%}) allows you to access state from outside of Flink during runtime.
-
-When working with state, it might also be useful to read about [Flink's state
-backends]({% link ops/state/state_backends.md %}). Flink
-provides different state backends that specify how and where state is stored.
-
-{% top %}
-
-## Keyed State
-
-Keyed state is maintained in what can be thought of as an embedded key/value
-store. The state is partitioned and distributed strictly together with the
-streams that are read by the stateful operators. Hence, access to the key/value
-state is only possible on *keyed streams*, i.e. after a keyed/partitioned data
-exchange, and is restricted to the values associated with the current event's
-key. Aligning the keys of streams and state makes sure that all state updates
-are local operations, guaranteeing consistency without transaction overhead.
-This alignment also allows Flink to redistribute the state and adjust the
-stream partitioning transparently.
-
-
-
-Keyed State is further organized into so-called *Key Groups*. Key Groups are
-the atomic unit by which Flink can redistribute Keyed State; there are exactly
-as many Key Groups as the defined maximum parallelism. During execution each
-parallel instance of a keyed operator works with the keys for one or more Key
-Groups.
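-
-A minimal sketch of how keyed state is used in the DataStream API (the input data and the
-counting logic are made up for illustration): state can only be accessed after a `keyBy()`,
-and each access implicitly refers to the key of the record currently being processed:
-
-{% highlight java %}
-import org.apache.flink.api.common.functions.RichFlatMapFunction;
-import org.apache.flink.api.common.state.ValueState;
-import org.apache.flink.api.common.state.ValueStateDescriptor;
-import org.apache.flink.api.common.typeinfo.Types;
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-import org.apache.flink.util.Collector;
-
-public class KeyedStateExample {
-
-    /** Counts how often each key has been seen, using keyed ValueState. */
-    public static class CountPerKey extends RichFlatMapFunction<String, String> {
-        private transient ValueState<Long> count;
-
-        @Override
-        public void open(Configuration parameters) {
-            count = getRuntimeContext().getState(
-                new ValueStateDescriptor<>("count", Types.LONG));
-        }
-
-        @Override
-        public void flatMap(String key, Collector<String> out) throws Exception {
-            // Reads and writes below are scoped to the key of the current record.
-            long seenSoFar = count.value() == null ? 0L : count.value();
-            count.update(seenSoFar + 1);
-            out.collect(key + " seen " + (seenSoFar + 1) + " time(s)");
-        }
-    }
-
-    public static void main(String[] args) throws Exception {
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-        env.fromElements("a", "b", "a", "a", "b")
-            .keyBy(value -> value)        // keyed/partitioned data exchange
-            .flatMap(new CountPerKey())   // keyed state, scoped to the current key
-            .print();
-        env.execute("keyed-state-example");
-    }
-}
-{% endhighlight %}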
-
-## State Persistence
-
-Flink implements fault tolerance using a combination of **stream replay** and
-**checkpointing**. A checkpoint marks a specific point in each of the
-input streams along with the corresponding state for each of the operators. A
-streaming dataflow can be resumed from a checkpoint while maintaining
-consistency *(exactly-once processing semantics)* by restoring the state of the
-operators and replaying the records from the point of the checkpoint.
-
-The checkpoint interval is a means of trading off the overhead of fault
-tolerance during execution with the recovery time (the number of records that
-need to be replayed).
-
-The fault tolerance mechanism continuously draws snapshots of the distributed
-streaming data flow. For streaming applications with small state, these
-snapshots are very light-weight and can be drawn frequently without much impact
-on performance. The state of the streaming applications is stored at a
-configurable place, usually in a distributed file system.
-
-In case of a program failure (due to machine-, network-, or software failure),
-Flink stops the distributed streaming dataflow. The system then restarts the
-operators and resets them to the latest successful checkpoint. The input
-streams are reset to the point of the state snapshot. Any records that are
-processed as part of the restarted parallel dataflow are guaranteed to not have
-affected the previously checkpointed state.
-
-{% info Note %} By default, checkpointing is disabled. See [Checkpointing]({%
-link dev/stream/state/checkpointing.md %}) for details on how to enable and
-configure checkpointing.
-
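-For example (a minimal sketch; the interval and the tuning option shown are arbitrary),
-checkpointing is typically enabled on the execution environment:
-
-{% highlight java %}
-import org.apache.flink.streaming.api.CheckpointingMode;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-
-public class EnableCheckpointingExample {
-    public static void main(String[] args) throws Exception {
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-
-        // Draw a checkpoint every 10 seconds with exactly-once semantics (the default mode).
-        env.enableCheckpointing(10_000L, CheckpointingMode.EXACTLY_ONCE);
-
-        // Require at least 500 ms between the end of one checkpoint and the start of the next.
-        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500L);
-
-        env.fromElements(1, 2, 3).print();
-        env.execute("enable-checkpointing-example");
-    }
-}
-{% endhighlight %}
-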
-{% info Note %} For this mechanism to realize its full guarantees, the data
-stream source (such as message queue or broker) needs to be able to rewind the
-stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has
-this ability and Flink's connector to Kafka exploits this. See [Fault
-Tolerance Guarantees of Data Sources and Sinks]({% link
-dev/connectors/guarantees.md %}) for more information about the guarantees
-provided by Flink's connectors.
-
-{% info Note %} Because Flink's checkpoints are realized through distributed
-snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often
-we also use the term *snapshot* to mean either *checkpoint* or *savepoint*.
-
-### Checkpointing
-
-The central part of Flink's fault tolerance mechanism is drawing consistent
-snapshots of the distributed data stream and operator state. These snapshots
-act as consistent checkpoints to which the system can fall back in case of a
-failure. Flink's mechanism for drawing these snapshots is described in
-"[Lightweight Asynchronous Snapshots for Distributed
-Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard
-[Chandy-Lamport
-algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf)
-for distributed snapshots and is specifically tailored to Flink's execution
-model.
-
-Keep in mind that everything to do with checkpointing can be done
-asynchronously. The checkpoint barriers don't travel in lock step and
-operations can asynchronously snapshot their state.
-
-Since Flink 1.11, checkpoints can be taken with or without alignment. In this
-section, we describe aligned checkpoints first.
-
-#### Barriers
-
-A core element in Flink's distributed snapshotting is the *stream barrier*.
-These barriers are injected into the data stream and flow with the records as
-part of the data stream. Barriers never overtake records; they flow strictly in
-line. A barrier separates the records in the data stream into the set of
-records that goes into the current snapshot, and the records that go into the
-next snapshot. Each barrier carries the ID of the snapshot whose records it
-pushed in front of it. Barriers do not interrupt the flow of the stream and are
-hence very lightweight. Multiple barriers from different snapshots can be in
-the stream at the same time, which means that various snapshots may happen
-concurrently.
-
-
-
-
-
-Stream barriers are injected into the parallel data flow at the stream sources.
-The point where the barriers for snapshot *n* are injected (let's call it
-Sn) is the position in the source stream up to which the
-snapshot covers the data. For example, in Apache Kafka, this position would be
-the last record's offset in the partition. This position Sn
-is reported to the *checkpoint coordinator* (Flink's JobManager).
-
-The barriers then flow downstream. When an intermediate operator has received a
-barrier for snapshot *n* from all of its input streams, it emits a barrier for
-snapshot *n* into all of its outgoing streams. Once a sink operator (the end of
-a streaming DAG) has received the barrier *n* from all of its input streams, it
-acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks
-have acknowledged a snapshot, it is considered completed.
-
-Once snapshot *n* has been completed, the job will never again ask the source
-for records from before Sn, since at that point these records
-(and their descendant records) will have passed through the entire data flow
-topology.
-
-
-
-
-
-Operators that receive more than one input stream need to *align* the input
-streams on the snapshot barriers. The figure above illustrates this:
-
- - As soon as the operator receives snapshot barrier *n* from an incoming
-   stream, it cannot process any further records from that stream until it has
-   received the barrier *n* from the other inputs as well. Otherwise, it would
-   mix records that belong to snapshot *n* with records that belong to
-   snapshot *n+1*.
- - Once the last stream has received barrier *n*, the operator emits all
- pending outgoing records, and then emits snapshot *n* barriers itself.
- - It snapshots the state and resumes processing records from all input streams,
- processing records from the input buffers before processing the records
- from the streams.
- - Finally, the operator writes the state asynchronously to the state backend.
-
-Note that the alignment is needed for all operators with multiple inputs and for
-operators after a shuffle when they consume output streams of multiple upstream
-subtasks.
-
-#### Snapshotting Operator State
-
-When operators contain any form of *state*, this state must be part of the
-snapshots as well.
-
-Operators snapshot their state at the point in time when they have received all
-snapshot barriers from their input streams, and before emitting the barriers to
-their output streams. At that point, all updates to the state from records
-before the barriers have been made, and no updates that depend on records
-from after the barriers have been applied. Because the state of a snapshot may
-be large, it is stored in a configurable *[state backend]({%
-link ops/state/state_backends.md %})*. By default, this is the JobManager's
-memory, but for production use a distributed reliable storage should be
-configured (such as HDFS). After the state has been stored, the operator
-acknowledges the checkpoint, emits the snapshot barrier into the output
-streams, and proceeds.
-
-The resulting snapshot now contains:
-
- - For each parallel stream data source, the offset/position in the stream
- when the snapshot was started
- - For each operator, a pointer to the state that was stored as part of the
- snapshot
-
-
-
-
-
-#### Recovery
-
-Recovery under this mechanism is straightforward: Upon a failure, Flink selects
-the latest completed checkpoint *k*. The system then re-deploys the entire
-distributed dataflow, and gives each operator the state that was snapshotted as
-part of checkpoint *k*. The sources are set to start reading the stream from
-position Sk. For example in Apache Kafka, that means telling
-the consumer to start fetching from offset Sk.
-
-If state was snapshotted incrementally, the operators start with the state of
-the latest full snapshot and then apply a series of incremental snapshot
-updates to that state.
-
-See [Restart Strategies]({% link dev/task_failure_recovery.md
-%}#restart-strategies) for more information.
-
-### Unaligned Checkpointing
-
-Starting with Flink 1.11, checkpointing can also be performed unaligned.
-The basic idea is that checkpoints can overtake all in-flight data as long as
-the in-flight data becomes part of the operator state.
-
-Note that this approach is actually closer to the [Chandy-Lamport algorithm
-](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf), but
-Flink still inserts the barrier in the sources to avoid overloading the
-checkpoint coordinator.
-
-
-
-
-
-The figure depicts how an operator handles unaligned checkpoint barriers:
-
-- The operator reacts to the first barrier that is stored in its input buffers.
-- It immediately forwards the barrier to the downstream operator by adding it
- to the end of the output buffers.
-- The operator marks all overtaken records to be stored asynchronously and
- creates a snapshot of its own state.
-
-Consequently, the operator only briefly stops the processing of input to mark
-the buffers, forwards the barrier, and creates the snapshot of the other state.
-
-Unaligned checkpointing ensures that barriers are arriving at the sink as fast
-as possible. It's especially suited for applications with at least one slow
-moving data path, where alignment times can reach hours. However, since it's
-adding additional I/O pressure, it doesn't help when the I/O to the state
-backends is the bottleneck. See the more in-depth discussion in
-[ops]({% link ops/state/checkpoints.md %}#unaligned-checkpoints)
-for other limitations.
-
-Note that savepoints will always be aligned.
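-
-A minimal sketch of switching unaligned checkpoints on (the programmatic switch below exists
-since Flink 1.11; it requires checkpointing to be enabled and, at least in early versions,
-exactly-once mode):
-
-{% highlight java %}
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-
-public class UnalignedCheckpointsExample {
-    public static void main(String[] args) throws Exception {
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-
-        // Unaligned checkpoints build on the regular checkpointing mechanism.
-        env.enableCheckpointing(10_000L);
-
-        // Let checkpoint barriers overtake in-flight records; the overtaken
-        // records become part of the checkpointed operator state.
-        env.getCheckpointConfig().enableUnalignedCheckpoints();
-
-        env.fromElements(1, 2, 3).print();
-        env.execute("unaligned-checkpoints-example");
-    }
-}
-{% endhighlight %}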
-
-#### Unaligned Recovery
-
-In unaligned checkpointing, operators first recover the in-flight data before
-they start processing any data from upstream operators. Aside from that, the
-operators perform the same steps as during the [recovery of aligned checkpoints](#recovery).
-
-### State Backends
-
-The exact data structures in which the key/values indexes are stored depends on
-the chosen [state backend]({% link
-ops/state/state_backends.md %}). One state backend stores data in an in-memory
-hash map, another state backend uses [RocksDB](http://rocksdb.org) as the
-key/value store. In addition to defining the data structure that holds the
-state, the state backends also implement the logic to take a point-in-time
-snapshot of the key/value state and store that snapshot as part of a
-checkpoint. State backends can be configured without changing your application
-logic.
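-
-As an illustration (the checkpoint path below is a placeholder; `FsStateBackend` is one of
-the state backends shipped with the Flink 1.x releases this documentation describes, and
-later releases renamed the built-in backends), a state backend can be chosen per job:
-
-{% highlight java %}
-import org.apache.flink.runtime.state.filesystem.FsStateBackend;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-
-public class StateBackendExample {
-    public static void main(String[] args) throws Exception {
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-
-        // Keep working state on the TaskManager heap and write checkpoint
-        // snapshots to a distributed file system (the path is a placeholder).
-        env.setStateBackend(new FsStateBackend("hdfs:///flink/checkpoints"));
-
-        env.fromElements(1, 2, 3).print();
-        env.execute("state-backend-example");
-    }
-}
-{% endhighlight %}
-
-Alternatively, a default state backend can be configured cluster-wide, e.g. via the
-`state.backend` and `state.checkpoints.dir` options in `flink-conf.yaml`.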
-
-
-
-{% top %}
-
-### Savepoints
-
-All programs that use checkpointing can resume execution from a **savepoint**.
-Savepoints allow updating both your programs and your Flink cluster without
-losing any state.
-
-[Savepoints]({% link ops/state/savepoints.md %}) are
-**manually triggered checkpoints**, which take a snapshot of the program and
-write it out to a state backend. They rely on the regular checkpointing
-mechanism for this.
-
-Savepoints are similar to checkpoints except that they are
-**triggered by the user** and **don't automatically expire** when newer
-checkpoints are completed.
-
-{% top %}
-
-### Exactly Once vs. At Least Once
-
-The alignment step may add latency to the streaming program. Usually, this
-extra latency is on the order of a few milliseconds, but we have seen cases
-where the latency of some outliers increased noticeably. For applications that
-require consistently super low latencies (few milliseconds) for all records,
-Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint
-snapshots are still drawn as soon as an operator has seen the checkpoint
-barrier from each input.
-
-When the alignment is skipped, an operator keeps processing all inputs, even
-after some checkpoint barriers for checkpoint *n* arrived. That way, the
-operator also processes elements that belong to checkpoint *n+1* before the
-state snapshot for checkpoint *n* was taken. On a restore, these records will
-occur as duplicates, because they are both included in the state snapshot of
-checkpoint *n*, and will be replayed as part of the data after checkpoint *n*.
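-
-Concretely, this switch corresponds to the checkpointing mode (a minimal sketch; the
-interval is arbitrary):
-
-{% highlight java %}
-import org.apache.flink.streaming.api.CheckpointingMode;
-import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
-
-public class AtLeastOnceExample {
-    public static void main(String[] args) throws Exception {
-        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
-
-        env.enableCheckpointing(10_000L);
-
-        // Skip barrier alignment: lower latency, but records may show up as
-        // duplicates after a restore (at-least-once state semantics).
-        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE);
-
-        env.fromElements(1, 2, 3).print();
-        env.execute("at-least-once-example");
-    }
-}
-{% endhighlight %}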
-
-{% info Note %} Alignment happens only for operators with multiple predecessors
-(joins) as well as operators with multiple senders (after a stream
-repartitioning/shuffle). Because of that, dataflows with only embarrassingly
-parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually
-give *exactly once* guarantees even in *at least once* mode.
-
-{% top %}
-
-## State and Fault Tolerance in Batch Programs
-
-Flink executes [batch programs]({% link dev/batch/index.md %}) as a special case of
-streaming programs, where the streams are bounded (finite number of elements).
-A *DataSet* is treated internally as a stream of data. The concepts above thus
-apply to batch programs in the same way as they apply to streaming
-programs, with minor exceptions:
-
- - [Fault tolerance for batch programs]({% link dev/task_failure_recovery.md %})
- does not use checkpointing. Recovery happens by fully replaying the
- streams. That is possible, because inputs are bounded. This pushes the
- cost more towards the recovery, but makes the regular processing cheaper,
- because it avoids checkpoints.
-
- - Stateful operations in the DataSet API use simplified in-memory/out-of-core
- data structures, rather than key/value indexes.
-
- - The DataSet API introduces special synchronized (superstep-based)
- iterations, which are only possible on bounded streams. For details, check
- out the [iteration docs]({% link dev/batch/iterations.md %}).
-
-{% top %}
diff --git a/docs/concepts/stateful-stream-processing.zh.md b/docs/concepts/stateful-stream-processing.zh.md
deleted file mode 100644
index 2df6628365209..0000000000000
--- a/docs/concepts/stateful-stream-processing.zh.md
+++ /dev/null
@@ -1,324 +0,0 @@
----
-title: Stateful Stream Processing
-nav-id: stateful-stream-processing
-nav-pos: 2
-nav-title: Stateful Stream Processing
-nav-parent_id: concepts
----
-
-
-* This will be replaced by the TOC
-{:toc}
-
-## What is State?
-
-While many operations in a dataflow simply look at one individual *event at a
-time* (for example an event parser), some operations remember information
-across multiple events (for example window operators). These operations are
-called **stateful**.
-
-Some examples of stateful operations:
-
- - When an application searches for certain event patterns, the state will
- store the sequence of events encountered so far.
- - When aggregating events per minute/hour/day, the state holds the pending
- aggregates.
- - When training a machine learning model over a stream of data points, the
- state holds the current version of the model parameters.
- - When historic data needs to be managed, the state allows efficient access
- to events that occurred in the past.
-
-Flink needs to be aware of the state in order to make it fault tolerant using
-[checkpoints]({% link dev/stream/state/checkpointing.zh.md %})
-and [savepoints]({%link ops/state/savepoints.zh.md %}).
-
-Knowledge about the state also allows for rescaling Flink applications, meaning
-that Flink takes care of redistributing state across parallel instances.
-
-[Queryable state]({% link dev/stream/state/queryable_state.zh.md
-%}) allows you to access state from outside of Flink during runtime.
-
-When working with state, it might also be useful to read about [Flink's state
-backends]({% link ops/state/state_backends.zh.md %}). Flink
-provides different state backends that specify how and where state is stored.
-
-{% top %}
-
-## Keyed State
-
-Keyed state is maintained in what can be thought of as an embedded key/value
-store. The state is partitioned and distributed strictly together with the
-streams that are read by the stateful operators. Hence, access to the key/value
-state is only possible on *keyed streams*, i.e. after a keyed/partitioned data
-exchange, and is restricted to the values associated with the current event's
-key. Aligning the keys of streams and state makes sure that all state updates
-are local operations, guaranteeing consistency without transaction overhead.
-This alignment also allows Flink to redistribute the state and adjust the
-stream partitioning transparently.
-
-
-
-Keyed State is further organized into so-called *Key Groups*. Key Groups are
-the atomic unit by which Flink can redistribute Keyed State; there are exactly
-as many Key Groups as the defined maximum parallelism. During execution each
-parallel instance of a keyed operator works with the keys for one or more Key
-Groups.
-
-## State Persistence
-
-Flink implements fault tolerance using a combination of **stream replay** and
-**checkpointing**. A checkpoint marks a specific point in each of the
-input streams along with the corresponding state for each of the operators. A
-streaming dataflow can be resumed from a checkpoint while maintaining
-consistency *(exactly-once processing semantics)* by restoring the state of the
-operators and replaying the records from the point of the checkpoint.
-
-The checkpoint interval is a means of trading off the overhead of fault
-tolerance during execution with the recovery time (the number of records that
-need to be replayed).
-
-The fault tolerance mechanism continuously draws snapshots of the distributed
-streaming data flow. For streaming applications with small state, these
-snapshots are very light-weight and can be drawn frequently without much impact
-on performance. The state of the streaming applications is stored at a
-configurable place, usually in a distributed file system.
-
-In case of a program failure (due to machine-, network-, or software failure),
-Flink stops the distributed streaming dataflow. The system then restarts the
-operators and resets them to the latest successful checkpoint. The input
-streams are reset to the point of the state snapshot. Any records that are
-processed as part of the restarted parallel dataflow are guaranteed to not have
-affected the previously checkpointed state.
-
-{% info Note %} By default, checkpointing is disabled. See [Checkpointing]({%
-link dev/stream/state/checkpointing.zh.md %}) for details on how to enable and
-configure checkpointing.
-
-{% info Note %} For this mechanism to realize its full guarantees, the data
-stream source (such as message queue or broker) needs to be able to rewind the
-stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has
-this ability and Flink's connector to Kafka exploits this. See [Fault
-Tolerance Guarantees of Data Sources and Sinks]({% link
-dev/connectors/guarantees.zh.md %}) for more information about the guarantees
-provided by Flink's connectors.
-
-{% info Note %} Because Flink's checkpoints are realized through distributed
-snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often
-we also use the term *snapshot* to mean either *checkpoint* or *savepoint*.
-
-### Checkpointing
-
-The central part of Flink's fault tolerance mechanism is drawing consistent
-snapshots of the distributed data stream and operator state. These snapshots
-act as consistent checkpoints to which the system can fall back in case of a
-failure. Flink's mechanism for drawing these snapshots is described in
-"[Lightweight Asynchronous Snapshots for Distributed
-Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard
-[Chandy-Lamport
-algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf)
-for distributed snapshots and is specifically tailored to Flink's execution
-model.
-
-Keep in mind that everything to do with checkpointing can be done
-asynchronously. The checkpoint barriers don't travel in lock step and
-operations can asynchronously snapshot their state.
-
-
-#### Barriers
-
-A core element in Flink's distributed snapshotting is the *stream barrier*.
-These barriers are injected into the data stream and flow with the records as
-part of the data stream. Barriers never overtake records; they flow strictly in
-line. A barrier separates the records in the data stream into the set of
-records that goes into the current snapshot, and the records that go into the
-next snapshot. Each barrier carries the ID of the snapshot whose records it
-pushed in front of it. Barriers do not interrupt the flow of the stream and are
-hence very lightweight. Multiple barriers from different snapshots can be in
-the stream at the same time, which means that various snapshots may happen
-concurrently.
-
-
-
-
-
-Stream barriers are injected into the parallel data flow at the stream sources.
-The point where the barriers for snapshot *n* are injected (let's call it
-Sn) is the position in the source stream up to which the
-snapshot covers the data. For example, in Apache Kafka, this position would be
-the last record's offset in the partition. This position Sn
-is reported to the *checkpoint coordinator* (Flink's JobManager).
-
-The barriers then flow downstream. When an intermediate operator has received a
-barrier for snapshot *n* from all of its input streams, it emits a barrier for
-snapshot *n* into all of its outgoing streams. Once a sink operator (the end of
-a streaming DAG) has received the barrier *n* from all of its input streams, it
-acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks
-have acknowledged a snapshot, it is considered completed.
-
-Once snapshot *n* has been completed, the job will never again ask the source
-for records from before Sn, since at that point these records
-(and their descendant records) will have passed through the entire data flow
-topology.
-
-
-
-
-
-Operators that receive more than one input stream need to *align* the input
-streams on the snapshot barriers. The figure above illustrates this:
-
- - As soon as the operator receives snapshot barrier *n* from an incoming
-   stream, it cannot process any further records from that stream until it has
-   received the barrier *n* from the other inputs as well. Otherwise, it would
-   mix records that belong to snapshot *n* with records that belong to
-   snapshot *n+1*.
- - Streams that report barrier *n* are temporarily set aside. Records that are
- received from these streams are not processed, but put into an input
- buffer.
- - Once the last stream has received barrier *n*, the operator emits all
- pending outgoing records, and then emits snapshot *n* barriers itself.
- - After that, it resumes processing records from all input streams,
- processing records from the input buffers before processing the records
- from the streams.
-
-#### Snapshotting Operator State
-
-When operators contain any form of *state*, this state must be part of the
-snapshots as well.
-
-Operators snapshot their state at the point in time when they have received all
-snapshot barriers from their input streams, and before emitting the barriers to
-their output streams. At that point, all updates to the state from records
-before the barriers will have been made, and no updates that depend on records
-from after the barriers have been applied. Because the state of a snapshot may
-be large, it is stored in a configurable *[state backend]({%
-link ops/state/state_backends.zh.md %})*. By default, this is the JobManager's
-memory, but for production use a distributed reliable storage should be
-configured (such as HDFS). After the state has been stored, the operator
-acknowledges the checkpoint, emits the snapshot barrier into the output
-streams, and proceeds.
-
-The resulting snapshot now contains:
-
- - For each parallel stream data source, the offset/position in the stream
- when the snapshot was started
- - For each operator, a pointer to the state that was stored as part of the
- snapshot
-
-
-
-
-
-#### Recovery
-
-Recovery under this mechanism is straightforward: Upon a failure, Flink selects
-the latest completed checkpoint *k*. The system then re-deploys the entire
-distributed dataflow, and gives each operator the state that was snapshotted as
-part of checkpoint *k*. The sources are set to start reading the stream from
-position Sk. For example in Apache Kafka, that means telling
-the consumer to start fetching from offset Sk.
-
-If state was snapshotted incrementally, the operators start with the state of
-the latest full snapshot and then apply a series of incremental snapshot
-updates to that state.
-
-See [Restart Strategies]({% link dev/task_failure_recovery.zh.md
-%}#restart-strategies) for more information.
-
-### State Backends
-
-The exact data structures in which the key/values indexes are stored depends on
-the chosen [state backend]({% link
-ops/state/state_backends.zh.md %}). One state backend stores data in an in-memory
-hash map, another state backend uses [RocksDB](http://rocksdb.org) as the
-key/value store. In addition to defining the data structure that holds the
-state, the state backends also implement the logic to take a point-in-time
-snapshot of the key/value state and store that snapshot as part of a
-checkpoint. State backends can be configured without changing your application
-logic.
-
-
-
-{% top %}
-
-### Savepoints
-
-All programs that use checkpointing can resume execution from a **savepoint**.
-Savepoints allow updating both your programs and your Flink cluster without
-losing any state.
-
-[Savepoints]({% link ops/state/savepoints.zh.md %}) are
-**manually triggered checkpoints**, which take a snapshot of the program and
-write it out to a state backend. They rely on the regular checkpointing
-mechanism for this.
-
-Savepoints are similar to checkpoints except that they are
-**triggered by the user** and **don't automatically expire** when newer
-checkpoints are completed.
-
-{% top %}
-
-### Exactly Once vs. At Least Once
-
-The alignment step may add latency to the streaming program. Usually, this
-extra latency is on the order of a few milliseconds, but we have seen cases
-where the latency of some outliers increased noticeably. For applications that
-require consistently super low latencies (few milliseconds) for all records,
-Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint
-snapshots are still drawn as soon as an operator has seen the checkpoint
-barrier from each input.
-
-When the alignment is skipped, an operator keeps processing all inputs, even
-after some checkpoint barriers for checkpoint *n* arrived. That way, the
-operator also processes elements that belong to checkpoint *n+1* before the
-state snapshot for checkpoint *n* was taken. On a restore, these records will
-occur as duplicates, because they are both included in the state snapshot of
-checkpoint *n*, and will be replayed as part of the data after checkpoint *n*.
-
-{% info Note %} Alignment happens only for operators with multiple predecessors
-(joins) as well as operators with multiple senders (after a stream
-repartitioning/shuffle). Because of that, dataflows with only embarrassingly
-parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually
-give *exactly once* guarantees even in *at least once* mode.
-
-{% top %}
-
-## State and Fault Tolerance in Batch Programs
-
-Flink executes [batch programs]({% link dev/batch/index.zh.md %}) as a special case of
-streaming programs, where the streams are bounded (finite number of elements).
-A *DataSet* is treated internally as a stream of data. The concepts above thus
-apply to batch programs in the same way as they apply to streaming
-programs, with minor exceptions:
-
- - [Fault tolerance for batch programs]({% link dev/task_failure_recovery.zh.md %})
- does not use checkpointing. Recovery happens by fully replaying the
- streams. That is possible, because inputs are bounded. This pushes the
- cost more towards the recovery, but makes the regular processing cheaper,
- because it avoids checkpoints.
-
- - Stateful operations in the DataSet API use simplified in-memory/out-of-core
- data structures, rather than key/value indexes.
-
- - The DataSet API introduces special synchronized (superstep-based)
- iterations, which are only possible on bounded streams. For details, check
- out the [iteration docs]({% link dev/batch/iterations.zh.md %}).
-
-{% top %}
diff --git a/docs/concepts/timely-stream-processing.md b/docs/concepts/timely-stream-processing.md
deleted file mode 100644
index f8a5b8dfa5435..0000000000000
--- a/docs/concepts/timely-stream-processing.md
+++ /dev/null
@@ -1,214 +0,0 @@
----
-title: Timely Stream Processing
-nav-id: timely-stream-processing
-nav-pos: 3
-nav-title: Timely Stream Processing
-nav-parent_id: concepts
----
-
-
-* This will be replaced by the TOC
-{:toc}
-
-## Introduction
-
-Timely stream processing is an extension of [stateful stream processing]({% link
-concepts/stateful-stream-processing.md %}) in which time plays some role in the
-computation. Among other things, this is the case when you do time series
-analysis, when doing aggregations based on certain time periods (typically
-called windows), or when you do event processing where the time when an event
-occurred is important.
-
-In the following sections we will highlight some of the topics that you should
-consider when working with timely Flink Applications.
-
-{% top %}
-
-## Notions of Time: Event Time and Processing Time
-
-When referring to time in a streaming program (for example to define windows),
-one can refer to different notions of *time*:
-
-- **Processing time:** Processing time refers to the system time of the machine
- that is executing the respective operation.
-
- When a streaming program runs on processing time, all time-based operations
- (like time windows) will use the system clock of the machines that run the
- respective operator. An hourly processing time window will include all
- records that arrived at a specific operator between the times when the system
- clock indicated the full hour. For example, if an application begins running
- at 9:15am, the first hourly processing time window will include events
- processed between 9:15am and 10:00am, the next window will include events
- processed between 10:00am and 11:00am, and so on.
-
- Processing time is the simplest notion of time and requires no coordination
- between streams and machines. It provides the best performance and the
- lowest latency. However, in distributed and asynchronous environments
- processing time does not provide determinism, because it is susceptible to
- the speed at which records arrive in the system (for example from the message
- queue), to the speed at which the records flow between operators inside the
- system, and to outages (scheduled, or otherwise).
-
-- **Event time:** Event time is the time that each individual event occurred on
- its producing device. This time is typically embedded within the records
- before they enter Flink, and that *event timestamp* can be extracted from
- each record. In event time, the progress of time depends on the data, not on
- any wall clocks. Event time programs must specify how to generate *Event Time
- Watermarks*, which is the mechanism that signals progress in event time. This
- watermarking mechanism is described in a later section,
- [below](#event-time-and-watermarks).
-
- In a perfect world, event time processing would yield completely consistent
- and deterministic results, regardless of when events arrive, or their
- ordering. However, unless the events are known to arrive in-order (by
- timestamp), event time processing incurs some latency while waiting for
- out-of-order events. As it is only possible to wait for a finite period of
- time, this places a limit on how deterministic event time applications can
- be.
-
- Assuming all of the data has arrived, event time operations will behave as
- expected, and produce correct and consistent results even when working with
- out-of-order or late events, or when reprocessing historic data. For example,
- an hourly event time window will contain all records that carry an event
- timestamp that falls into that hour, regardless of the order in which they
- arrive, or when they are processed. (See the section on [late
- events](#late-elements) for more information.)
-
- Note that sometimes when event time programs are processing live data in
- real-time, they will use some *processing time* operations in order to
- guarantee that they are progressing in a timely fashion.
-
-
-
-{% top %}
-
-## Event Time and Watermarks
-
-*Note: Flink implements many techniques from the Dataflow Model. For a good
-introduction to event time and watermarks, have a look at the articles below.*
-
- - [Streaming
- 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by
- Tyler Akidau
- - The [Dataflow Model
- paper](https://research.google.com/pubs/archive/43864.pdf)
-
-
-A stream processor that supports *event time* needs a way to measure the
-progress of event time. For example, a window operator that builds hourly
-windows needs to be notified when event time has passed beyond the end of an
-hour, so that the operator can close the window in progress.
-
-*Event time* can progress independently of *processing time* (measured by wall
-clocks). For example, in one program the current *event time* of an operator
-may trail slightly behind the *processing time* (accounting for a delay in
-receiving the events), while both proceed at the same speed. On the other
-hand, another streaming program might progress through weeks of event time with
-only a few seconds of processing, by fast-forwarding through some historic data
-already buffered in a Kafka topic (or another message queue).
-
-------
-
-The mechanism in Flink to measure progress in event time is **watermarks**.
-Watermarks flow as part of the data stream and carry a timestamp *t*. A
-*Watermark(t)* declares that event time has reached time *t* in that stream,
-meaning that there should be no more elements from the stream with a timestamp
-*t' <= t* (i.e. events with timestamps older or equal to the watermark).
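-
-A minimal sketch of how watermarks are typically generated in the DataStream API (the
-`WatermarkStrategy` shown here exists since Flink 1.11; the event class, its timestamp
-field, and the five second bound are made up):
-
-{% highlight java %}
-import java.time.Duration;
-
-import org.apache.flink.api.common.eventtime.WatermarkStrategy;
-import org.apache.flink.streaming.api.datastream.DataStream;
-
-public class WatermarkExample {
-
-    /** Hypothetical event type carrying its own timestamp. */
-    public static class SensorReading {
-        public long timestampMillis;
-        public double value;
-    }
-
-    public static DataStream<SensorReading> withWatermarks(DataStream<SensorReading> readings) {
-        // Emit watermarks that trail the largest timestamp seen so far by 5 seconds,
-        // tolerating out-of-order events within that bound.
-        return readings.assignTimestampsAndWatermarks(
-            WatermarkStrategy
-                .<SensorReading>forBoundedOutOfOrderness(Duration.ofSeconds(5))
-                .withTimestampAssigner((event, recordTimestamp) -> event.timestampMillis));
-    }
-}
-{% endhighlight %}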
-
-The figure below shows a stream of events with (logical) timestamps, and
-watermarks flowing inline. In this example the events are in order (with
-respect to their timestamps), meaning that the watermarks are simply periodic
-markers in the stream.
-
-
-
-Watermarks are crucial for *out-of-order* streams, as illustrated below, where
-the events are not ordered by their timestamps. In general a watermark is a
-declaration that by that point in the stream, all events up to a certain
-timestamp should have arrived. Once a watermark reaches an operator, the
-operator can advance its internal *event time clock* to the value of the
-watermark.
-
-
-
-Note that event time is inherited by a freshly created stream element (or
-elements) from either the event that produced them or from the watermark that
-triggered the creation of those elements.
-
-### Watermarks in Parallel Streams
-
-Watermarks are generated at, or directly after, source functions. Each parallel
-subtask of a source function usually generates its watermarks independently.
-These watermarks define the event time at that particular parallel source.
-
-As the watermarks flow through the streaming program, they advance the event
-time at the operators where they arrive. Whenever an operator advances its
-event time, it generates a new watermark downstream for its successor
-operators.
-
-Some operators consume multiple input streams; a union, for example, or
-operators following a *keyBy(...)* or *partition(...)* function. Such an
-operator's current event time is the minimum of its input streams' event times.
-As its input streams update their event times, so does the operator.
-
-The figure below shows an example of events and watermarks flowing through
-parallel streams, and operators tracking event time.
-
-
-
-## Lateness
-
-It is possible that certain elements will violate the watermark condition,
-meaning that even after the *Watermark(t)* has occurred, more elements with
-timestamp *t' <= t* will occur. In fact, in many real world setups, certain
-elements can be arbitrarily delayed, making it impossible to specify a time by
-which all elements of a certain event timestamp will have occurred.
-Furthermore, even if the lateness can be bounded, delaying the watermarks by
-too much is often not desirable, because it causes too much delay in the
-evaluation of event time windows.
-
-For this reason, streaming programs may explicitly expect some *late* elements.
-Late elements are elements that arrive after the system's event time clock (as
-signaled by the watermarks) has already passed the time of the late element's
-timestamp. See [Allowed Lateness]({% link
-dev/stream/operators/windows.md %}#allowed-lateness) for more information on
-how to work with late elements in event time windows.
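-
-For example (a sketch with made-up types and sizes), an event time window can be told to
-keep its state a little longer and accept records that arrive up to ten minutes late:
-
-{% highlight java %}
-import org.apache.flink.api.java.tuple.Tuple2;
-import org.apache.flink.streaming.api.datastream.DataStream;
-import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
-import org.apache.flink.streaming.api.windowing.time.Time;
-
-public class AllowedLatenessExample {
-
-    public static DataStream<Tuple2<String, Long>> hourlyCounts(DataStream<Tuple2<String, Long>> counts) {
-        return counts
-            .keyBy(pair -> pair.f0)
-            // Hourly event time windows ...
-            .window(TumblingEventTimeWindows.of(Time.hours(1)))
-            // ... that still accept elements up to 10 minutes late; anything later
-            // is dropped (or can be redirected to a side output, not shown here).
-            .allowedLateness(Time.minutes(10))
-            .sum(1);
-    }
-}
-{% endhighlight %}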
-
-## Windowing
-
-Aggregating events (e.g., counts, sums) works differently on streams than in
-batch processing. For example, it is impossible to count all elements in a
-stream, because streams are in general infinite (unbounded). Instead,
-aggregates on streams (counts, sums, etc.) are scoped by **windows**, such as
-*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*.
-
-Windows can be *time driven* (example: every 30 seconds) or *data driven*
-(example: every 100 elements). One typically distinguishes different types of
-windows, such as *tumbling windows* (no overlap), *sliding windows* (with
-overlap), and *session windows* (punctuated by a gap of inactivity).
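-
-As a sketch of these window types in the DataStream API (the keyed stream, key type, and
-window sizes are placeholders):
-
-{% highlight java %}
-import org.apache.flink.api.java.tuple.Tuple2;
-import org.apache.flink.streaming.api.datastream.KeyedStream;
-import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows;
-import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
-import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
-import org.apache.flink.streaming.api.windowing.time.Time;
-
-public class WindowKindsExample {
-
-    public static void defineWindows(KeyedStream<Tuple2<String, Long>, String> events) {
-        // Time driven, tumbling: fixed 30 second buckets, no overlap.
-        events.window(TumblingEventTimeWindows.of(Time.seconds(30))).sum(1);
-
-        // Time driven, sliding: 10 minute windows evaluated every minute (overlapping).
-        events.window(SlidingEventTimeWindows.of(Time.minutes(10), Time.minutes(1))).sum(1);
-
-        // Session windows: closed after 15 minutes of inactivity.
-        events.window(EventTimeSessionWindows.withGap(Time.minutes(15))).sum(1);
-
-        // Data driven: count windows over the last 100 elements per key.
-        events.countWindow(100).sum(1);
-    }
-}
-{% endhighlight %}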
-
-
-
-Please check out this [blog
-post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for
-additional examples of windows or take a look at the [window documentation]({% link
-dev/stream/operators/windows.md %}) of the DataStream API.
-
-{% top %}
diff --git a/docs/concepts/timely-stream-processing.zh.md b/docs/concepts/timely-stream-processing.zh.md
deleted file mode 100644
index 54fcf1bea32fe..0000000000000
--- a/docs/concepts/timely-stream-processing.zh.md
+++ /dev/null
@@ -1,214 +0,0 @@
----
-title: Timely Stream Processing
-nav-id: timely-stream-processing
-nav-pos: 3
-nav-title: Timely Stream Processing
-nav-parent_id: concepts
----
-
-
-* This will be replaced by the TOC
-{:toc}
-
-## Introduction
-
-Timely stream processing is an extension of [stateful stream processing]({% link
-concepts/stateful-stream-processing.zh.md %}) in which time plays some role in the
-computation. Among other things, this is the case when you do time series
-analysis, when doing aggregations based on certain time periods (typically
-called windows), or when you do event processing where the time when an event
-occurred is important.
-
-In the following sections we will highlight some of the topics that you should
-consider when working with timely Flink Applications.
-
-{% top %}
-
-## Notions of Time: Event Time and Processing Time
-
-When referring to time in a streaming program (for example to define windows),
-one can refer to different notions of *time*:
-
-- **Processing time:** Processing time refers to the system time of the machine
- that is executing the respective operation.
-
- When a streaming program runs on processing time, all time-based operations
- (like time windows) will use the system clock of the machines that run the
- respective operator. An hourly processing time window will include all
- records that arrived at a specific operator between the times when the system
- clock indicated the full hour. For example, if an application begins running
- at 9:15am, the first hourly processing time window will include events
- processed between 9:15am and 10:00am, the next window will include events
- processed between 10:00am and 11:00am, and so on.
-
- Processing time is the simplest notion of time and requires no coordination
- between streams and machines. It provides the best performance and the
- lowest latency. However, in distributed and asynchronous environments
- processing time does not provide determinism, because it is susceptible to
- the speed at which records arrive in the system (for example from the message
- queue), to the speed at which the records flow between operators inside the
- system, and to outages (scheduled, or otherwise).
-
-- **Event time:** Event time is the time that each individual event occurred on
- its producing device. This time is typically embedded within the records
- before they enter Flink, and that *event timestamp* can be extracted from
- each record. In event time, the progress of time depends on the data, not on
- any wall clocks. Event time programs must specify how to generate *Event Time
- Watermarks*, which is the mechanism that signals progress in event time. This
- watermarking mechanism is described in a later section,
- [below](#event-time-and-watermarks).
-
- In a perfect world, event time processing would yield completely consistent
- and deterministic results, regardless of when events arrive, or their
- ordering. However, unless the events are known to arrive in-order (by
- timestamp), event time processing incurs some latency while waiting for
- out-of-order events. As it is only possible to wait for a finite period of
- time, this places a limit on how deterministic event time applications can
- be.
-
- Assuming all of the data has arrived, event time operations will behave as
- expected, and produce correct and consistent results even when working with
- out-of-order or late events, or when reprocessing historic data. For example,
- an hourly event time window will contain all records that carry an event
- timestamp that falls into that hour, regardless of the order in which they
- arrive, or when they are processed. (See the section on [late
- events](#late-elements) for more information.)
-
- Note that sometimes when event time programs are processing live data in
- real-time, they will use some *processing time* operations in order to
- guarantee that they are progressing in a timely fashion.
-
-
-
-{% top %}
-
-## Event Time and Watermarks
-
-*Note: Flink implements many techniques from the Dataflow Model. For a good
-introduction to event time and watermarks, have a look at the articles below.*
-
- - [Streaming
- 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by
- Tyler Akidau
- - The [Dataflow Model
- paper](https://research.google.com/pubs/archive/43864.pdf)
-
-
-A stream processor that supports *event time* needs a way to measure the
-progress of event time. For example, a window operator that builds hourly
-windows needs to be notified when event time has passed beyond the end of an
-hour, so that the operator can close the window in progress.
-
-*Event time* can progress independently of *processing time* (measured by wall
-clocks). For example, in one program the current *event time* of an operator
-may trail slightly behind the *processing time* (accounting for a delay in
-receiving the events), while both proceed at the same speed. On the other
-hand, another streaming program might progress through weeks of event time with
-only a few seconds of processing, by fast-forwarding through some historic data
-already buffered in a Kafka topic (or another message queue).
-
-------
-
-The mechanism in Flink to measure progress in event time is **watermarks**.
-Watermarks flow as part of the data stream and carry a timestamp *t*. A
-*Watermark(t)* declares that event time has reached time *t* in that stream,
-meaning that there should be no more elements from the stream with a timestamp
-*t' <= t* (i.e. events with timestamps older or equal to the watermark).
-
-The figure below shows a stream of events with (logical) timestamps, and
-watermarks flowing inline. In this example the events are in order (with
-respect to their timestamps), meaning that the watermarks are simply periodic
-markers in the stream.
-
-
-
-Watermarks are crucial for *out-of-order* streams, as illustrated below, where
-the events are not ordered by their timestamps. In general a watermark is a
-declaration that by that point in the stream, all events up to a certain
-timestamp should have arrived. Once a watermark reaches an operator, the
-operator can advance its internal *event time clock* to the value of the
-watermark.
-
-
-
-Note that event time is inherited by a freshly created stream element (or
-elements) from either the event that produced them or from watermark that
-triggered creation of those elements.
-
-### Watermarks in Parallel Streams
-
-Watermarks are generated at, or directly after, source functions. Each parallel
-subtask of a source function usually generates its watermarks independently.
-These watermarks define the event time at that particular parallel source.
-
-As the watermarks flow through the streaming program, they advance the event
-time at the operators where they arrive. Whenever an operator advances its
-event time, it generates a new watermark downstream for its successor
-operators.
-
-Some operators consume multiple input streams; a union, for example, or
-operators following a *keyBy(...)* or *partition(...)* function. Such an
-operator's current event time is the minimum of its input streams' event times.
-As its input streams update their event times, so does the operator.
-
-The figure below shows an example of events and watermarks flowing through
-parallel streams, and operators tracking event time.
-
-
-
-## Lateness
-
-It is possible that certain elements will violate the watermark condition,
-meaning that even after the *Watermark(t)* has occurred, more elements with
-timestamp *t' <= t* will occur. In fact, in many real world setups, certain
-elements can be arbitrarily delayed, making it impossible to specify a time by
-which all elements of a certain event timestamp will have occurred.
-Furthermore, even if the lateness can be bounded, delaying the watermarks by
-too much is often not desirable, because it causes too much delay in the
-evaluation of event time windows.
-
-For this reason, streaming programs may explicitly expect some *late* elements.
-Late elements are elements that arrive after the system's event time clock (as
-signaled by the watermarks) has already passed the time of the late element's
-timestamp. See [Allowed Lateness]({% link
-dev/stream/operators/windows.zh.md %}#allowed-lateness) for more information on
-how to work with late elements in event time windows.
-
-## Windowing
-
-Aggregating events (e.g., counts, sums) works differently on streams than in
-batch processing. For example, it is impossible to count all elements in a
-stream, because streams are in general infinite (unbounded). Instead,
-aggregates on streams (counts, sums, etc), are scoped by **windows**, such as
-*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*.
-
-Windows can be *time driven* (example: every 30 seconds) or *data driven*
-(example: every 100 elements). One typically distinguishes different types of
-windows, such as *tumbling windows* (no overlap), *sliding windows* (with
-overlap), and *session windows* (punctuated by a gap of inactivity).
-
-
-
-Please check out this [blog
-post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for
-additional examples of windows or take a look a [window documentation]({% link
-dev/stream/operators/windows.zh.md %}) of the DataStream API.
-
-{% top %}
diff --git a/docs/config.toml b/docs/config.toml
new file mode 100644
index 0000000000000..aeeb39a74fbd5
--- /dev/null
+++ b/docs/config.toml
@@ -0,0 +1,103 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+baseURL = '//ci.apache.org/projects/flink/flink-docs-master'
+languageCode = "en-us"
+title = "Apache Flink"
+enableGitInfo = false
+theme = "book"
+pygmentsUseClasses = true
+
+[params]
+ # Flag whether this is a stable version or not.
+ # Used for the quickstart page.
+ IsStable = false
+
+ # Flag to indicate whether an outdated warning should be shown.
+ ShowOutDatedWarning = false
+
+  # These are the versions referenced in the docs. Please only use these variables
+  # to reference a specific Flink version, because this is the only place where
+  # we change the version for the complete docs when forking off a release branch
+ # etc.
+ # The full version string as referenced in Maven (e.g. 1.2.1)
+ Version = "1.14-SNAPSHOT"
+
+  # For stable releases, leave the bugfix version out (e.g. 1.2). For snapshot
+  # releases this should be the same as the regular version.
+ VersionTitle = "1.14-SNAPSHOT"
+
+ # The branch for this version of Apache Flink
+ Branch = "master"
+
+ # The github repository for Apache Flink
+ Repo = "//github.com/apache/flink"
+
+ GithubRepo = "https://github.com/apache/flink.git"
+
+ # Flink training exercises
+ TrainingExercises = "//github.com/apache/flink-training"
+
+ # This suffix is appended to the Scala-dependent Maven artifact names
+ ScalaVersion = "_2.11"
+
+ ProjectHomepage = "//flink.apache.org"
+
+ JavaDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/java/"
+
+ ScalaDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/scala/index.html#org.apache.flink.api.scala.package"
+
+ PyDocs = "//ci.apache.org/projects/flink/flink-docs-master/api/python/"
+
+ # External links at the bottom
+ # of the menu
+ MenuLinks = [
+ ["Project Homepage", "//flink.apache.org"],
+ ["JavaDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/java/"],
+ ["ScalaDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/scala/index.html#org.apache.flink.api.scala.package"],
+ ["PyDocs", "//ci.apache.org/projects/flink/flink-docs-master/api/python/"]
+ ]
+
+ PreviousDocs = [
+ ["1.12", "http://ci.apache.org/projects/flink/flink-docs-release-1.12"],
+ ["1.11", "http://ci.apache.org/projects/flink/flink-docs-release-1.11"],
+ ["1.10", "http://ci.apache.org/projects/flink/flink-docs-release-1.10"],
+ ["1.9", "http://ci.apache.org/projects/flink/flink-docs-release-1.9"],
+ ["1.8", "http://ci.apache.org/projects/flink/flink-docs-release-1.8"],
+ ["1.7", "http://ci.apache.org/projects/flink/flink-docs-release-1.7"],
+ ["1.6", "http://ci.apache.org/projects/flink/flink-docs-release-1.6"],
+ ["1.5", "http://ci.apache.org/projects/flink/flink-docs-release-1.5"],
+ ["1.4", "http://ci.apache.org/projects/flink/flink-docs-release-1.4"],
+ ["1.3", "http://ci.apache.org/projects/flink/flink-docs-release-1.3"],
+ ["1.2", "http://ci.apache.org/projects/flink/flink-docs-release-1.2"],
+ ["1.1", "http://ci.apache.org/projects/flink/flink-docs-release-1.1"],
+ ["1.0", "http://ci.apache.org/projects/flink/flink-docs-release-1.0"]
+ ]
+
+[markup]
+[markup.goldmark.renderer]
+ unsafe = true
+
+[languages]
+[languages.en]
+ languageName = 'English'
+ contentDir = 'content'
+ weight = 1
+
+[languages.zh]
+ languageName = '中文版'
+ contentDir = 'content.zh'
+ weight = 2
diff --git a/docs/connectors/index.md b/docs/connectors/index.md
deleted file mode 100644
index 5a744166b3ccc..0000000000000
--- a/docs/connectors/index.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-title: "Connectors"
-nav-id: connectors-root
-nav-title: ' Connectors'
-nav-parent_id: root
-nav-pos: 7
----
-
-
-* toc
-{:toc}
diff --git a/docs/connectors/index.zh.md b/docs/connectors/index.zh.md
deleted file mode 100644
index 5a744166b3ccc..0000000000000
--- a/docs/connectors/index.zh.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-title: "Connectors"
-nav-id: connectors-root
-nav-title: ' Connectors'
-nav-parent_id: root
-nav-pos: 7
----
-
-
-* toc
-{:toc}
diff --git a/docs/content.zh/_index.md b/docs/content.zh/_index.md
new file mode 100644
index 0000000000000..c1e782d1fdcaf
--- /dev/null
+++ b/docs/content.zh/_index.md
@@ -0,0 +1,88 @@
+---
+title: Apache Flink Documentation
+type: docs
+bookToc: false
+aliases:
+ - /zh/examples/index.html
+ - /zh/getting-started/examples/index.html
+---
+
+
+# Apache Flink Documentation
+
+{{< center >}}
+**Apache Flink** is a framework and distributed processing engine for stateful computations over *unbounded* and *bounded* data streams. Flink has been designed to run in *all common cluster environments*, perform computations at *in-memory* speed and at *any scale*.
+{{< /center >}}
+
+{{< columns >}}
+
+### Try Flink
+
+If you’re interested in playing around with Flink, try one of our tutorials:
+
+* [Fraud Detection with the DataStream API]({{< ref "docs/try-flink/datastream" >}})
+* [Real Time Reporting with the Table API]({{< ref "docs/try-flink/table_api" >}})
+* [Intro to PyFlink]({{< ref "docs/dev/python/overview" >}})
+* [Flink Operations Playground]({{< ref "docs/try-flink/flink-operations-playground" >}})
+
+### Learn Flink
+
+* To dive in deeper, the [Hands-on Training]({{< ref "docs/learn-flink/overview" >}}) includes a set of lessons and exercises that provide a step-by-step introduction to Flink.
+
+* The [Concepts]({{< ref "docs/concepts/overview" >}}) section explains what you need to know about Flink before exploring the reference documentation.
+
+### Get Help with Flink
+
+If you get stuck, check out our [community support resources](https://flink.apache.org/community.html). In particular, Apache Flink’s user mailing list is consistently ranked as one of the most active of any Apache project, and is a great way to get help quickly.
+
+<--->
+
+### Explore Flink
+
+The reference documentation covers all the details. Some starting points:
+
+{{< columns >}}
+* [DataStream API]({{< ref "docs/dev/datastream/overview" >}})
+* [Table API & SQL]({{< ref "docs/dev/table/overview" >}})
+* [Stateful Functions](https://ci.apache.org/projects/flink/flink-statefun-docs-stable/)
+
+<--->
+
+* [Configuration]({{< ref "docs/deployment/config" >}})
+* [Rest API]({{< ref "docs/ops/rest_api" >}})
+* [CLI]({{< ref "docs/deployment/cli" >}})
+{{< /columns >}}
+
+### Deploy Flink
+
+Before putting your Flink job into production, read the [Production Readiness Checklist]({{< ref "docs/ops/production_ready" >}}).
+For an overview of possible deployment targets, see [Clusters and Deployments]({{< ref "docs/deployment/overview" >}}).
+
+### Upgrade Flink
+
+Release notes cover important changes between Flink versions. Please read them carefully if you plan to upgrade your Flink setup.
+
+
+See the release notes for [Flink 1.12]({{< ref "/release-notes/flink-1.12.md" >}}), [Flink 1.11]({{< ref "/release-notes/flink-1.11.md" >}}), [Flink 1.10]({{< ref "/release-notes/flink-1.10.md" >}}), [Flink 1.9]({{< ref "/release-notes/flink-1.9.md" >}}), [Flink 1.8]({{< ref "/release-notes/flink-1.8.md" >}}), or [Flink 1.7]({{< ref "/release-notes/flink-1.7.md" >}}).
+
+{{< /columns >}}
\ No newline at end of file
diff --git a/docs/content.zh/docs/concepts/_index.md b/docs/content.zh/docs/concepts/_index.md
new file mode 100644
index 0000000000000..040815aa98e63
--- /dev/null
+++ b/docs/content.zh/docs/concepts/_index.md
@@ -0,0 +1,25 @@
+---
+title: 概念透析
+icon:
+bold: true
+bookCollapseSection: true
+weight: 3
+---
+
\ No newline at end of file
diff --git a/docs/content.zh/docs/concepts/flink-architecture.md b/docs/content.zh/docs/concepts/flink-architecture.md
new file mode 100644
index 0000000000000..e4c7b0f5174a4
--- /dev/null
+++ b/docs/content.zh/docs/concepts/flink-architecture.md
@@ -0,0 +1,139 @@
+---
+title: Flink 架构
+weight: 4
+type: docs
+nav-title: Flink 架构
+---
+
+
+# Flink 架构
+
+Flink 是一个分布式系统,需要有效分配和管理计算资源才能执行流应用程序。它集成了所有常见的集群资源管理器,例如[Hadoop YARN](https://hadoop.apache.org/docs/stable/hadoop-yarn/hadoop-yarn-site/YARN.html)、[Apache Mesos](https://mesos.apache.org/)和[Kubernetes](https://kubernetes.io/),但也可以设置作为独立集群甚至库运行。
+
+本节概述了 Flink 架构,并且描述了其主要组件如何交互以执行应用程序和从故障中恢复。
+
+## Flink 集群剖析
+
+Flink 运行时由两种类型的进程组成:一个 _JobManager_ 和一个或者多个 _TaskManager_。
+
+{{< img src="/fig/processes.svg" alt="The processes involved in executing a Flink dataflow" class="offset" width="70%" >}}
+
+*Client* 不是运行时和程序执行的一部分,而是用于准备数据流并将其发送给 JobManager。之后,客户端可以断开连接(_分离模式_),或保持连接来接收进程报告(_附加模式_)。客户端可以作为触发执行 Java/Scala 程序的一部分运行,也可以在命令行进程`./bin/flink run ...`中运行。
+
+可以通过多种方式启动 JobManager 和 TaskManager:直接在机器上作为[standalone 集群]({{< ref "docs/deployment/resource-providers/standalone/overview" >}})启动、在容器中启动、或者通过[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}})或[Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}})等资源框架管理并启动。TaskManager 连接到 JobManagers,宣布自己可用,并被分配工作。
+
+### JobManager
+
+_JobManager_ 具有许多与协调 Flink 应用程序的分布式执行有关的职责:它决定何时调度下一个 task(或一组 task)、对完成的 task 或执行失败做出反应、协调 checkpoint、并且协调从失败中恢复等等。这个进程由三个不同的组件组成:
+
+ * **ResourceManager**
+
+ _ResourceManager_ 负责 Flink 集群中的资源提供、回收、分配 - 它管理 **task slots**,这是 Flink 集群中资源调度的单位(请参考[TaskManagers](#taskmanagers))。Flink 为不同的环境和资源提供者(例如 YARN、Mesos、Kubernetes 和 standalone 部署)实现了对应的 ResourceManager。在 standalone 设置中,ResourceManager 只能分配可用 TaskManager 的 slots,而不能自行启动新的 TaskManager。
+
+ * **Dispatcher**
+
+ _Dispatcher_ 提供了一个 REST 接口,用来提交 Flink 应用程序执行,并为每个提交的作业启动一个新的 JobMaster。它还运行 Flink WebUI 用来提供作业执行信息。
+
+ * **JobMaster**
+
+ _JobMaster_ 负责管理单个[JobGraph]({{< ref "docs/concepts/glossary" >}}#logical-graph)的执行。Flink 集群中可以同时运行多个作业,每个作业都有自己的 JobMaster。
+
+始终至少有一个 JobManager。高可用(HA)设置中可能有多个 JobManager,其中一个始终是 *leader*,其他的则是 *standby*(请参考 [高可用(HA)]({{< ref "docs/deployment/ha/overview" >}}))。
+
+### TaskManagers
+
+*TaskManager*(也称为 *worker*)执行作业流的 task,并且缓存和交换数据流。
+
+必须始终至少有一个 TaskManager。在 TaskManager 中资源调度的最小单位是 task _slot_。TaskManager 中 task slot 的数量表示并发处理 task 的数量。请注意一个 task slot 中可以执行多个算子(请参考[Tasks 和算子链](#tasks-and-operator-chains))。
+
+{{< top >}}
+
+## Tasks 和算子链
+
+对于分布式执行,Flink 将算子的 subtasks *链接*成 *tasks*。每个 task 由一个线程执行。将算子链接成 task 是个有用的优化:它减少线程间切换、缓冲的开销,并且减少延迟的同时增加整体吞吐量。链行为是可以配置的;请参考[链文档]({{< ref "docs/dev/datastream/operators/overview" >}}#task-chaining-and-resource-groups)以获取详细信息。
+
+下图中样例数据流用 5 个 subtask 执行,因此有 5 个并行线程。
+
+{{< img src="/fig/tasks_chains.svg" alt="Operator chaining into Tasks" class="offset" width="80%" >}}
+
+{{< top >}}
+
+## Task Slots 和资源
+
+每个 worker(TaskManager)都是一个 *JVM 进程*,可以在单独的线程中执行一个或多个 subtask。为了控制一个 TaskManager 中接受多少个 task,就有了所谓的 **task slots**(至少一个)。
+
+每个 *task slot* 代表 TaskManager 中资源的固定子集。例如,具有 3 个 slot 的 TaskManager,会将其托管内存 1/3 用于每个 slot。分配资源意味着 subtask 不会与其他作业的 subtask 竞争托管内存,而是具有一定数量的保留托管内存。注意此处没有 CPU 隔离;当前 slot 仅分离 task 的托管内存。
+
+通过调整 task slot 的数量,用户可以定义 subtask 如何互相隔离。每个 TaskManager 有一个 slot,这意味着每个 task 组都在单独的 JVM 中运行(例如,可以在单独的容器中启动)。具有多个 slot 意味着更多 subtask 共享同一 JVM。同一 JVM 中的 task 共享 TCP 连接(通过多路复用)和心跳信息。它们还可以共享数据集和数据结构,从而减少了每个 task 的开销。
+
+{{< img src="/fig/tasks_slots.svg" alt="A TaskManager with Task Slots and Tasks" class="offset" width="80%" >}}
+
+默认情况下,Flink 允许 subtask 共享 slot,即便它们是不同的 task 的 subtask,只要是来自于同一作业即可。结果就是一个 slot 可以持有整个作业管道。允许 *slot 共享*有两个主要优点:
+
+ - Flink 集群所需的 task slot 和作业中使用的最大并行度恰好一样。无需计算程序总共包含多少个 task(具有不同并行度)。
+
+ - 容易获得更好的资源利用。如果没有 slot 共享,非密集 subtask(*source/map()*)将阻塞和密集型 subtask(*window*) 一样多的资源。通过 slot 共享,我们示例中的基本并行度从 2 增加到 6,可以充分利用分配的资源,同时确保繁重的 subtask 在 TaskManager 之间公平分配。
+
+{{< img src="/fig/slot_sharing.svg" alt="TaskManagers with shared Task Slots" class="offset" width="80%" >}}
+
+## Flink 应用程序执行
+
+_Flink 应用程序_ 是从其 ``main()`` 方法产生的一个或多个 Flink 作业的任何用户程序。这些作业的执行可以在本地 JVM(``LocalEnvironment``)中进行,或在具有多台机器的集群的远程设置(``RemoteEnvironment``)中进行。对于每个程序,[``ExecutionEnvironment``]({{ site.javadocs_baseurl }}/api/java/) 提供了一些方法来控制作业执行(例如设置并行度)并与外界交互(请参考 [Flink 程序剖析]({{< ref "docs/dev/datastream/overview" >}}#anatomy-of-a-flink-program) )。
+
+Flink 应用程序的作业可以被提交到长期运行的 [Flink Session 集群]({{< ref "docs/concepts/glossary" >}}#flink-session-cluster)、专用的 [Flink Job 集群]({{< ref "docs/concepts/glossary" >}}#flink-job-cluster) 或 [Flink Application 集群]({{< ref "docs/concepts/glossary" >}}#flink-application-cluster)。这些选项之间的差异主要与集群的生命周期和资源隔离保证有关。
+
+### Flink Session 集群
+
+* **集群生命周期**:在 Flink Session 集群中,客户端连接到一个预先存在的、长期运行的集群,该集群可以接受多个作业提交。即使所有作业完成后,集群(和 JobManager)仍将继续运行直到手动停止 session 为止。因此,Flink Session 集群的寿命不受任何 Flink 作业寿命的约束。
+
+* **资源隔离**:TaskManager slot 由 ResourceManager 在提交作业时分配,并在作业完成时释放。由于所有作业都共享同一集群,因此在集群资源方面存在一些竞争 — 例如提交工作阶段的网络带宽。此共享设置的局限性在于,如果 TaskManager 崩溃,则在此 TaskManager 上运行 task 的所有作业都将失败;类似的,如果 JobManager 上发生一些致命错误,它将影响集群中正在运行的所有作业。
+
+* **其他注意事项**:拥有一个预先存在的集群可以节省大量时间申请资源和启动 TaskManager。有种场景很重要,作业执行时间短并且启动时间长会对端到端的用户体验产生负面的影响 — 就像对简短查询的交互式分析一样,希望作业可以使用现有资源快速执行计算。
+
+{{< hint info >}}
+以前,Flink Session 集群也被称为 session 模式下的 Flink 集群。
+{{< /hint >}}
+
+### Flink Job 集群
+
+* **集群生命周期**:在 Flink Job 集群中,可用的集群管理器(例如 YARN)用于为每个提交的作业启动一个集群,并且该集群仅可用于该作业。在这里,客户端首先从集群管理器请求资源启动 JobManager,然后将作业提交给在这个进程中运行的 Dispatcher。然后根据作业的资源请求惰性的分配 TaskManager。一旦作业完成,Flink Job 集群将被拆除。
+
+* **资源隔离**:JobManager 中的致命错误仅影响在 Flink Job 集群中运行的一个作业。
+
+* **其他注意事项**:由于 ResourceManager 必须应用并等待外部资源管理组件来启动 TaskManager 进程和分配资源,因此 Flink Job 集群更适合长期运行、具有高稳定性要求且对较长的启动时间不敏感的大型作业。
+
+{{< hint info >}}
+以前,Flink Job 集群也被称为 job (or per-job) 模式下的 Flink 集群。
+{{< /hint >}}
+{{< hint info >}}
+Kubernetes 不支持 Flink Job 集群。 请参考 [Standalone Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}#per-job-cluster-mode) 和 [Native Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}#per-job-cluster-mode)。
+{{< /hint >}}
+
+### Flink Application 集群
+
+* **集群生命周期**:Flink Application 集群是专用的 Flink 集群,仅从 Flink 应用程序执行作业,并且 ``main()``方法在集群上而不是客户端上运行。提交作业是一个单步骤过程:无需先启动 Flink 集群,然后将作业提交到现有的 session 集群;相反,将应用程序逻辑和依赖打包到一个可执行的作业 JAR 中,并且集群入口(``ApplicationClusterEntryPoint``)负责调用 ``main()``方法来提取 JobGraph。例如,这允许你像在 Kubernetes 上部署任何其他应用程序一样部署 Flink 应用程序。因此,Flink Application 集群的寿命与 Flink 应用程序的寿命有关。
+
+* **资源隔离**:在 Flink Application 集群中,ResourceManager 和 Dispatcher 作用于单个的 Flink 应用程序,相比于 Flink Session 集群,它提供了更好的隔离。
+
+{{< hint info >}}
+Flink Job 集群可以看做是 Flink Application 集群“客户端运行”的替代方案。
+{{< /hint >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/concepts/glossary.md b/docs/content.zh/docs/concepts/glossary.md
new file mode 100644
index 0000000000000..c1ea57929b9c4
--- /dev/null
+++ b/docs/content.zh/docs/concepts/glossary.md
@@ -0,0 +1,145 @@
+---
+title: 词汇表
+weight: 11
+type: docs
+bookToc: false
+---
+
+
+# 词汇表
+
+#### Flink Application Cluster
+
+A Flink Application Cluster is a dedicated [Flink Cluster](#flink-cluster) that
+only executes [Flink Jobs](#flink-job) from one [Flink Application](#flink-application).
+The lifetime of the [Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Application.
+
+#### Flink Job Cluster
+
+A Flink Job Cluster is a dedicated [Flink Cluster](#flink-cluster) that only
+executes a single [Flink Job](#flink-job). The lifetime of the
+[Flink Cluster](#flink-cluster) is bound to the lifetime of the Flink Job.
+
+#### Flink Cluster
+
+一般情况下,Flink 集群是由一个 [Flink JobManager](#flink-jobmanager) 和一个或多个 [Flink TaskManager](#flink-taskmanager) 进程组成的分布式系统。
+
+#### Event
+
+Event 是对应用程序建模的域的状态更改的声明。它可以同时为流或批处理应用程序的 input 和 output,也可以单独是 input 或者 output 中的一种。Event 是特殊类型的 [Record](#record)。
+
+#### ExecutionGraph
+
+见 [Physical Graph](#physical-graph)。
+
+#### Function
+
+Function 是由用户实现的,并封装了 Flink 程序的应用程序逻辑。大多数 Function 都由相应的 [Operator](#operator) 封装。
+
+#### Instance
+
+Instance 常用于描述运行时的特定类型(通常是 [Operator](#operator) 或者 [Function](#function))的一个具体实例。由于 Apache Flink 主要是用 Java 编写的,所以,这与 Java 中的 *Instance* 或 *Object* 的定义相对应。在 Apache Flink 的上下文中,*parallel instance* 也常用于强调同一 [Operator](#operator) 或者 [Function](#function) 的多个 instance 以并行的方式运行。
+
+#### Flink Application
+
+A Flink application is a Java Application that submits one or multiple [Flink
+Jobs](#flink-job) from the `main()` method (or by some other means). Submitting
+jobs is usually done by calling `execute()` on an execution environment.
+
+The jobs of an application can either be submitted to a long running [Flink
+Session Cluster](#flink-session-cluster), to a dedicated [Flink Application
+Cluster](#flink-application-cluster), or to a [Flink Job
+Cluster](#flink-job-cluster).
+
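+As a minimal sketch (the class name, job name, and sample elements below are
+illustrative only, not part of the definition), a Flink Application is a
+`main()` method that builds a dataflow on an execution environment and submits
+it by calling `execute()`:
+
+```java
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+public class ExampleApplication {
+    public static void main(String[] args) throws Exception {
+        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+        // Build a (trivial) dataflow ...
+        env.fromElements(1, 2, 3).print();
+        // ... and submit it; this call creates and runs one Flink Job.
+        env.execute("example-job");
+    }
+}
+```
+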
+#### Flink Job
+
+A Flink Job is the runtime representation of a [logical graph](#logical-graph)
+(also often called dataflow graph) that is created and submitted by calling
+`execute()` in a [Flink Application](#flink-application).
+
+#### JobGraph
+
+见 [Logical Graph](#logical-graph)。
+
+#### Flink JobManager
+
+Flink JobManager 是 [Flink Cluster](#flink-cluster) 的主节点。它包含三个不同的组件:Flink Resource Manager、Flink Dispatcher、运行每个 [Flink Job](#flink-job) 的 [Flink JobMaster](#flink-jobmaster)。
+
+
+#### Flink JobMaster
+
+JobMaster 是在 [Flink JobManager](#flink-jobmanager) 中运行的组件之一。JobMaster 负责监督单个作业 [Task](#task) 的执行。以前,整个 [Flink JobManager](#flink-jobmanager) 都叫做 JobManager。
+
+#### Logical Graph
+
+A logical graph is a directed graph where the nodes are [Operators](#operator)
+and the edges define input/output-relationships of the operators and correspond
+to data streams or data sets. A logical graph is created by submitting jobs
+from a [Flink Application](#flink-application).
+
+Logical graphs are also often referred to as *dataflow graphs*.
+
+#### Managed State
+
+Managed State 描述了已在框架中注册的应用程序的托管状态。对于托管状态,Apache Flink 会负责持久化和重伸缩等事宜。
+
+#### Operator
+
+[Logical Graph](#logical-graph) 的节点。算子执行某种操作,该操作通常由 [Function](#function) 执行。Source 和 Sink 是数据输入和数据输出的特殊算子。
+
+#### Operator Chain
+
+算子链由两个或多个连续的 [Operator](#operator) 组成,两者之间没有任何的重新分区。同一算子链内的算子可以彼此直接传递 record,而无需通过序列化或 Flink 的网络栈。
+
+#### Partition
+
+分区是整个数据流或数据集的独立子集。通过将每个 [Record](#record) 分配给一个或多个分区,来把数据流或数据集划分为多个分区。在运行期间,[Task](#task) 会消费数据流或数据集的分区。改变数据流或数据集分区方式的转换通常称为重分区。
+
+#### Physical Graph
+
+Physical graph 是将 [Logical Graph](#logical-graph) 转换为可在分布式运行时中执行的作业后得到的图。节点是 [Task](#task),边表示数据流或数据集的输入/输出关系或 [partition](#partition)。
+
+#### Record
+
+Record 是数据集或数据流的组成元素。[Operator](#operator) 和 [Function](#function) 接收 record 作为输入,并将 record 作为输出发出。
+
+#### Flink Session Cluster
+
+长时间运行的 [Flink Cluster](#flink-cluster),它可以接受多个 [Flink Job](#flink-job) 的执行。此 [Flink Cluster](#flink-cluster) 的生命周期不受任何 [Flink Job](#flink-job) 生命周期的约束限制。以前,Flink Session Cluster 也称为 *session mode* 的 [Flink Cluster](#flink-cluster),和 [Flink Application Cluster](#flink-application-cluster) 相对应。
+
+#### State Backend
+
+对于流处理程序,[Flink Job](#flink-job) 的 State Backend 决定了其 [state](#managed-state) 是如何存储在每个 TaskManager 上的(TaskManager 的 Java 堆或嵌入式 RocksDB),以及它在 checkpoint 时的写入位置([Flink JobManager](#flink-jobmanager) 的 Java 堆或者 Filesystem)。
+
+#### Sub-Task
+
+Sub-Task 是负责处理数据流 [Partition](#partition) 的 [Task](#task)。"Sub-Task"强调的是同一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 具有多个并行的 Task 。
+
+#### Task
+
+Task 是 [Physical Graph](#physical-graph) 的节点。它是基本的工作单元,由 Flink 的 runtime 来执行。Task 正好封装了一个 [Operator](#operator) 或者 [Operator Chain](#operator-chain) 的 *parallel instance*。
+
+#### Flink TaskManager
+
+TaskManager 是 [Flink Cluster](#flink-cluster) 的工作进程。[Task](#task) 被调度到 TaskManager 上执行。TaskManager 相互通信,只为在后续的 Task 之间交换数据。
+
+#### Transformation
+
+Transformation 应用于一个或多个数据流或数据集,并产生一个或多个输出数据流或数据集。Transformation 可能会在每个记录的基础上更改数据流或数据集,但也可以只更改其分区或执行聚合。虽然 [Operator](#operator) 和 [Function](#function) 是 Flink API 的“物理”部分,但 Transformation 只是一个 API 概念。具体来说,大多数(但不是全部)Transformation 是由某些 [Operator](#operator) 实现的。
diff --git a/docs/content.zh/docs/concepts/overview.md b/docs/content.zh/docs/concepts/overview.md
new file mode 100644
index 0000000000000..12e512911f70b
--- /dev/null
+++ b/docs/content.zh/docs/concepts/overview.md
@@ -0,0 +1,50 @@
+---
+title: 概览
+weight: 1
+type: docs
+aliases:
+ - /zh/concepts/
+ - /zh/concepts/concepts.html
+---
+
+
+# 概念透析
+
+[实践练习]({{< ref "docs/learn-flink/overview" >}})章节介绍了作为 Flink API 根基的有状态实时流处理的基本概念,并且举例说明了如何在 Flink 应用中使用这些机制。其中 [Data Pipelines & ETL]({{< ref "docs/learn-flink/etl" >}}#stateful-transformations) 小节介绍了有状态流处理的概念,并且在 [Fault Tolerance]({{< ref "docs/learn-flink/fault_tolerance" >}}) 小节中进行了深入介绍。[Streaming Analytics]({{< ref "docs/learn-flink/streaming_analytics" >}}) 小节介绍了实时流处理的概念。
+
+本章将深入分析 Flink 分布式运行时架构如何实现这些概念。
+
+## Flink 中的 API
+
+Flink 为流式/批式处理应用程序的开发提供了不同级别的抽象。
+
+{{< img src="/fig/levels_of_abstraction.svg" alt="Programming levels of abstraction" class="offset" width="80%" >}}
+
+ - Flink API 最底层的抽象为**有状态实时流处理**。其抽象实现是 [Process Function]({{< ref "docs/dev/datastream/operators/process_function" >}}),并且 **Process Function** 被 Flink 框架集成到了 [DataStream API]({{< ref "docs/dev/datastream/overview" >}}) 中来为我们使用。它允许用户在应用程序中自由地处理来自单流或多流的事件(数据),并提供具有全局一致性和容错保障的*状态*。此外,用户可以在此层抽象中注册事件时间(event time)和处理时间(processing time)回调方法,从而允许程序可以实现复杂计算。
+
+ - Flink API 第二层抽象是 **Core APIs**。实际上,许多应用程序不需要使用到上述最底层抽象的 API,而是可以使用 **Core APIs** 进行编程:其中包含 [DataStream API]({{< ref "docs/dev/datastream/overview" >}})(应用于有界/无界数据流场景)和 [DataSet API]({{< ref "docs/dev/dataset/overview" >}})(应用于有界数据集场景)两部分。Core APIs 提供的流式 API(Fluent API)为数据处理提供了通用的模块组件,例如各种形式的用户自定义转换(transformations)、联接(joins)、聚合(aggregations)、窗口(windows)和状态(state)操作等。此层 API 中处理的数据类型在每种编程语言中都有其对应的类。
+
+ *Process Function* 这类底层抽象和 *DataStream API* 的相互集成使得用户可以选择使用更底层的抽象 API 来实现自己的需求。*DataSet API* 还额外提供了一些原语,比如循环/迭代(loop/iteration)操作。
+
+ - Flink API 第三层抽象是 **Table API**。**Table API** 是以表(Table)为中心的声明式编程(DSL)API,例如在流式数据场景下,它可以表示一张正在动态改变的表。[Table API]({{< ref "docs/dev/table/overview" >}}) 遵循(扩展)关系模型:即表拥有 schema(类似于关系型数据库中的 schema),并且 Table API 也提供了类似于关系模型中的操作,比如 select、project、join、group-by 和 aggregate 等。Table API 程序是以声明的方式定义*应执行的逻辑操作*,而不是确切地指定程序*应该执行的代码*。尽管 Table API 使用起来很简洁并且可以由各种类型的用户自定义函数扩展功能,但还是比 Core API 的表达能力差。此外,Table API 程序在执行之前还会使用优化器中的优化规则对用户编写的表达式进行优化。
+
+ 表和 *DataStream*/*DataSet* 可以进行无缝切换,Flink 允许用户在编写应用程序时将 *Table API* 与 *DataStream*/*DataSet* API 混合使用。
+
+ - Flink API 最顶层抽象是 **SQL**。这层抽象在语义和程序表达式上都类似于 *Table API*,但是其程序实现都是 SQL 查询表达式。[SQL]({{< ref "docs/dev/table/overview" >}}#sql) 抽象与 Table API 抽象之间的关联是非常紧密的,并且 SQL 查询语句可以在 *Table API* 中定义的表上执行。
diff --git a/docs/content.zh/docs/concepts/stateful-stream-processing.md b/docs/content.zh/docs/concepts/stateful-stream-processing.md
new file mode 100644
index 0000000000000..c78949508e896
--- /dev/null
+++ b/docs/content.zh/docs/concepts/stateful-stream-processing.md
@@ -0,0 +1,365 @@
+---
+title: 有状态流处理
+weight: 2
+type: docs
+---
+
+
+# 有状态流处理
+
+## What is State?
+
+While many operations in a dataflow simply look at one individual *event at a
+time* (for example an event parser), some operations remember information
+across multiple events (for example window operators). These operations are
+called **stateful**.
+
+Some examples of stateful operations:
+
+ - When an application searches for certain event patterns, the state will
+ store the sequence of events encountered so far.
+ - When aggregating events per minute/hour/day, the state holds the pending
+ aggregates.
+ - When training a machine learning model over a stream of data points, the
+ state holds the current version of the model parameters.
+ - When historic data needs to be managed, the state allows efficient access
+ to events that occurred in the past.
+
+Flink needs to be aware of the state in order to make it fault tolerant using
+[checkpoints]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}})
+and [savepoints]({{< ref "docs/ops/state/savepoints" >}}).
+
+Knowledge about the state also allows for rescaling Flink applications, meaning
+that Flink takes care of redistributing state across parallel instances.
+
+[Queryable state]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}) allows you to access state from outside of Flink during runtime.
+
+When working with state, it might also be useful to read about [Flink's state
+backends]({{< ref "docs/ops/state/state_backends" >}}). Flink
+provides different state backends that specify how and where state is stored.
+
+{{< top >}}
+
+## Keyed State
+
+Keyed state is maintained in what can be thought of as an embedded key/value
+store. The state is partitioned and distributed strictly together with the
+streams that are read by the stateful operators. Hence, access to the key/value
+state is only possible on *keyed streams*, i.e. after a keyed/partitioned data
+exchange, and is restricted to the values associated with the current event's
+key. Aligning the keys of streams and state makes sure that all state updates
+are local operations, guaranteeing consistency without transaction overhead.
+This alignment also allows Flink to redistribute the state and adjust the
+stream partitioning transparently.
+
+{{< img src="/fig/state_partitioning.svg" alt="State and Partitioning" class="offset" width="50%" >}}
+
+Keyed State is further organized into so-called *Key Groups*. Key Groups are
+the atomic unit by which Flink can redistribute Keyed State; there are exactly
+as many Key Groups as the defined maximum parallelism. During execution each
+parallel instance of a keyed operator works with the keys for one or more Key
+Groups.
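+
+The sketch below (class, field, and key names are illustrative, not prescribed
+by Flink) shows the typical pattern for using keyed state in the DataStream
+API: a rich function obtains a `ValueState` handle through a descriptor and
+reads and updates the value scoped to the current key.
+
+```java
+import org.apache.flink.api.common.functions.RichFlatMapFunction;
+import org.apache.flink.api.common.state.ValueState;
+import org.apache.flink.api.common.state.ValueStateDescriptor;
+import org.apache.flink.api.common.typeinfo.Types;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.util.Collector;
+
+// Emits a running count per key; the count lives in keyed state, so Flink
+// checkpoints it and redistributes it (by Key Group) when rescaling.
+public class CountPerKey extends RichFlatMapFunction<String, Long> {
+
+    private transient ValueState<Long> count;
+
+    @Override
+    public void open(Configuration parameters) {
+        count = getRuntimeContext().getState(
+                new ValueStateDescriptor<>("count", Types.LONG));
+    }
+
+    @Override
+    public void flatMap(String value, Collector<Long> out) throws Exception {
+        Long current = count.value();          // state of the *current* key
+        long updated = (current == null ? 0L : current) + 1;
+        count.update(updated);
+        out.collect(updated);
+    }
+}
+```
+
+Used as `stream.keyBy(v -> v).flatMap(new CountPerKey())`, the state accessed
+inside `flatMap` is always the state of the key of the incoming record.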
+
+## State Persistence
+
+Flink implements fault tolerance using a combination of **stream replay** and
+**checkpointing**. A checkpoint marks a specific point in each of the
+input streams along with the corresponding state for each of the operators. A
+streaming dataflow can be resumed from a checkpoint while maintaining
+consistency *(exactly-once processing semantics)* by restoring the state of the
+operators and replaying the records from the point of the checkpoint.
+
+The checkpoint interval is a means of trading off the overhead of fault
+tolerance during execution with the recovery time (the number of records that
+need to be replayed).
+
+The fault tolerance mechanism continuously draws snapshots of the distributed
+streaming data flow. For streaming applications with small state, these
+snapshots are very light-weight and can be drawn frequently without much impact
+on performance. The state of the streaming applications is stored at a
+configurable place, usually in a distributed file system.
+
+In case of a program failure (due to machine-, network-, or software failure),
+Flink stops the distributed streaming dataflow. The system then restarts the
+operators and resets them to the latest successful checkpoint. The input
+streams are reset to the point of the state snapshot. Any records that are
+processed as part of the restarted parallel dataflow are guaranteed to not have
+affected the previously checkpointed state.
+
+{{< hint warning >}}
+By default, checkpointing is disabled. See [Checkpointing]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) for details on how to enable and configure checkpointing.
+{{< /hint >}}
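+
+As a minimal sketch (the interval and timeout values are arbitrary examples),
+checkpointing is enabled and tuned on the `StreamExecutionEnvironment`:
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+// Draw a checkpoint every 10 seconds (the interval is in milliseconds).
+env.enableCheckpointing(10_000);
+
+// Exactly-once is the default mode; shown here for completeness.
+env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
+// Abort a checkpoint attempt if it does not complete within one minute.
+env.getCheckpointConfig().setCheckpointTimeout(60_000);
+```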
+
+{{< hint info >}}
+For this mechanism to realize its full guarantees, the data
+stream source (such as message queue or broker) needs to be able to rewind the
+stream to a defined recent point. [Apache Kafka](http://kafka.apache.org) has
+this ability and Flink's connector to Kafka exploits this. See [Fault
+Tolerance Guarantees of Data Sources and Sinks]({{< ref "docs/connectors/datastream/guarantees" >}}) for more information about the guarantees
+provided by Flink's connectors.
+{{< /hint >}}
+
+{{< hint info >}}
+Because Flink's checkpoints are realized through distributed
+snapshots, we use the words *snapshot* and *checkpoint* interchangeably. Often
+we also use the term *snapshot* to mean either *checkpoint* or *savepoint*.
+{{< /hint >}}
+
+### Checkpointing
+
+The central part of Flink's fault tolerance mechanism is drawing consistent
+snapshots of the distributed data stream and operator state. These snapshots
+act as consistent checkpoints to which the system can fall back in case of a
+failure. Flink's mechanism for drawing these snapshots is described in
+"[Lightweight Asynchronous Snapshots for Distributed
+Dataflows](http://arxiv.org/abs/1506.08603)". It is inspired by the standard
+[Chandy-Lamport algorithm](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf)
+for distributed snapshots and is specifically tailored to Flink's execution
+model.
+
+Keep in mind that everything to do with checkpointing can be done
+asynchronously. The checkpoint barriers don't travel in lock step and
+operations can asynchronously snapshot their state.
+
+Since Flink 1.11, checkpoints can be taken with or without alignment. In this
+section, we describe aligned checkpoints first.
+
+#### Barriers
+
+A core element in Flink's distributed snapshotting are the *stream barriers*.
+These barriers are injected into the data stream and flow with the records as
+part of the data stream. Barriers never overtake records, they flow strictly in
+line. A barrier separates the records in the data stream into the set of
+records that goes into the current snapshot, and the records that go into the
+next snapshot. Each barrier carries the ID of the snapshot whose records it
+pushed in front of it. Barriers do not interrupt the flow of the stream and are
+hence very lightweight. Multiple barriers from different snapshots can be in
+the stream at the same time, which means that various snapshots may happen
+concurrently.
+
+
+ {{< img src="/fig/stream_barriers.svg" alt="Checkpoint barriers in data streams" width="60%" >}}
+
+
+Stream barriers are injected into the parallel data flow at the stream sources.
+The point where the barriers for snapshot *n* are injected (let's call it
+S<sub>n</sub>) is the position in the source stream up to which the
+snapshot covers the data. For example, in Apache Kafka, this position would be
+the last record's offset in the partition. This position S<sub>n</sub>
+is reported to the *checkpoint coordinator* (Flink's JobManager).
+
+The barriers then flow downstream. When an intermediate operator has received a
+barrier for snapshot *n* from all of its input streams, it emits a barrier for
+snapshot *n* into all of its outgoing streams. Once a sink operator (the end of
+a streaming DAG) has received the barrier *n* from all of its input streams, it
+acknowledges that snapshot *n* to the checkpoint coordinator. After all sinks
+have acknowledged a snapshot, it is considered completed.
+
+Once snapshot *n* has been completed, the job will never again ask the source
+for records from before S<sub>n</sub>, since at that point these records
+(and their descendant records) will have passed through the entire data flow
+topology.
+
+
+ {{< img src="/fig/stream_aligning.svg" alt="Aligning data streams at operators with multiple inputs" width="60%" >}}
+
+
+Operators that receive more than one input stream need to *align* the input
+streams on the snapshot barriers. The figure above illustrates this:
+
+ - As soon as the operator receives snapshot barrier *n* from an incoming
+ stream, it cannot process any further records from that stream until it has
+ received the barrier *n* from the other inputs as well. Otherwise, it would
+   mix records that belong to snapshot *n* with records that belong to
+ snapshot *n+1*.
+ - Once the last stream has received barrier *n*, the operator emits all
+ pending outgoing records, and then emits snapshot *n* barriers itself.
+ - It snapshots the state and resumes processing records from all input streams,
+ processing records from the input buffers before processing the records
+ from the streams.
+ - Finally, the operator writes the state asynchronously to the state backend.
+
+Note that the alignment is needed for all operators with multiple inputs and for
+operators after a shuffle when they consume output streams of multiple upstream
+subtasks.
+
+#### Snapshotting Operator State
+
+When operators contain any form of *state*, this state must be part of the
+snapshots as well.
+
+Operators snapshot their state at the point in time when they have received all
+snapshot barriers from their input streams, and before emitting the barriers to
+their output streams. At that point, all updates to the state from records
+before the barriers have been made, and no updates that depend on records
+from after the barriers have been applied. Because the state of a snapshot may
+be large, it is stored in a configurable *[state backend]({{< ref "docs/ops/state/state_backends" >}})*. By default, this is the JobManager's
+memory, but for production use a distributed reliable storage should be
+configured (such as HDFS). After the state has been stored, the operator
+acknowledges the checkpoint, emits the snapshot barrier into the output
+streams, and proceeds.
+
+The resulting snapshot now contains:
+
+ - For each parallel stream data source, the offset/position in the stream
+ when the snapshot was started
+ - For each operator, a pointer to the state that was stored as part of the
+ snapshot
+
+
+ {{< img src="/fig/checkpointing.svg" alt="Illustration of the Checkpointing Mechanism" width="75%" >}}
+
+
+#### Recovery
+
+Recovery under this mechanism is straightforward: Upon a failure, Flink selects
+the latest completed checkpoint *k*. The system then re-deploys the entire
+distributed dataflow, and gives each operator the state that was snapshotted as
+part of checkpoint *k*. The sources are set to start reading the stream from
+position S<sub>k</sub>. For example, in Apache Kafka, that means telling
+the consumer to start fetching from offset S<sub>k</sub>.
+
+If state was snapshotted incrementally, the operators start with the state of
+the latest full snapshot and then apply a series of incremental snapshot
+updates to that state.
+
+See [Restart Strategies]({{< ref "docs/dev/execution/task_failure_recovery" >}}#restart-strategies) for more information.
+
+### Unaligned Checkpointing
+
+Checkpointing can also be performed unaligned.
+The basic idea is that checkpoints can overtake all in-flight data as long as
+the in-flight data becomes part of the operator state.
+
+Note that this approach is actually closer to the [Chandy-Lamport algorithm
+](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf), but
+Flink still inserts the barrier in the sources to avoid overloading the
+checkpoint coordinator.
+
+{{< img src="/fig/stream_unaligning.svg" alt="Unaligned checkpointing" >}}
+
+The figure depicts how an operator handles unaligned checkpoint barriers:
+
+- The operator reacts on the first barrier that is stored in its input buffers.
+- It immediately forwards the barrier to the downstream operator by adding it
+ to the end of the output buffers.
+- The operator marks all overtaken records to be stored asynchronously and
+ creates a snapshot of its own state.
+
+Consequently, the operator only briefly stops the processing of input to mark
+the buffers, forwards the barrier, and creates the snapshot of the other state.
+
+Unaligned checkpointing ensures that barriers are arriving at the sink as fast
+as possible. It's especially suited for applications with at least one slow
+moving data path, where alignment times can reach hours. However, since it's
+adding additional I/O pressure, it doesn't help when the I/O to the state
+backends is the bottleneck. See the more in-depth discussion in
+[ops]({{< ref "docs/ops/state/checkpoints" >}}#unaligned-checkpoints)
+for other limitations.
+
+Note that savepoints will always be aligned.
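+
+A minimal sketch of how unaligned checkpoints are switched on (they require
+exactly-once checkpointing, which is the default; the interval is an example):
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+env.enableCheckpointing(10_000);
+// Let checkpoint barriers overtake buffered in-flight records; the overtaken
+// records become part of the checkpointed operator state.
+env.getCheckpointConfig().enableUnalignedCheckpoints();
+```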
+
+#### Unaligned Recovery
+
+Operators first recover the in-flight data before starting processing any data
+from upstream operators in unaligned checkpointing. Aside from that, it
+performs the same steps as during [recovery of aligned checkpoints](#recovery).
+
+### State Backends
+
+The exact data structures in which the key/values indexes are stored depends on
+the chosen [state backend]({{< ref "docs/ops/state/state_backends" >}}). One state backend stores data in an in-memory
+hash map, another state backend uses [RocksDB](http://rocksdb.org) as the
+key/value store. In addition to defining the data structure that holds the
+state, the state backends also implement the logic to take a point-in-time
+snapshot of the key/value state and store that snapshot as part of a
+checkpoint. State backends can be configured without changing your application
+logic.
+
+{{< img src="/fig/checkpoints.svg" alt="checkpoints and snapshots" class="offset" width="60%" >}}
+
+{{< top >}}
+
+### Savepoints
+
+All programs that use checkpointing can resume execution from a **savepoint**.
+Savepoints allow updating both your programs and your Flink cluster without
+losing any state.
+
+[Savepoints]({{< ref "docs/ops/state/savepoints" >}}) are
+**manually triggered checkpoints**, which take a snapshot of the program and
+write it out to a state backend. They rely on the regular checkpointing
+mechanism for this.
+
+Savepoints are similar to checkpoints except that they are
+**triggered by the user** and **don't automatically expire** when newer
+checkpoints are completed.
+
+{{< top >}}
+
+### Exactly Once vs. At Least Once
+
+The alignment step may add latency to the streaming program. Usually, this
+extra latency is on the order of a few milliseconds, but we have seen cases
+where the latency of some outliers increased noticeably. For applications that
+require consistently super low latencies (few milliseconds) for all records,
+Flink has a switch to skip the stream alignment during a checkpoint. Checkpoint
+snapshots are still drawn as soon as an operator has seen the checkpoint
+barrier from each input.
+
+When the alignment is skipped, an operator keeps processing all inputs, even
+after some checkpoint barriers for checkpoint *n* arrived. That way, the
+operator also processes elements that belong to checkpoint *n+1* before the
+state snapshot for checkpoint *n* was taken. On a restore, these records will
+occur as duplicates, because they are both included in the state snapshot of
+checkpoint *n*, and will be replayed as part of the data after checkpoint *n*.
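+
+A minimal sketch of selecting the at-least-once mode (the interval is an
+arbitrary example):
+
+```java
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+// Skip barrier alignment: lower latency during checkpoints, but on recovery
+// records from after the barrier may be replayed as duplicates.
+env.enableCheckpointing(10_000, CheckpointingMode.AT_LEAST_ONCE);
+```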
+
+{{< hint info >}}
+Alignment happens only for operators with multiple predecessors
+(joins) as well as operators with multiple senders (after a stream
+repartitioning/shuffle). Because of that, dataflows with only embarrassingly
+parallel streaming operations (`map()`, `flatMap()`, `filter()`, ...) actually
+give *exactly once* guarantees even in *at least once* mode.
+{{< /hint >}}
+
+{{< top >}}
+
+## State and Fault Tolerance in Batch Programs
+
+Flink executes [batch programs]({{< ref "docs/dev/dataset/overview" >}}) as a special case of
+streaming programs, where the streams are bounded (finite number of elements).
+A *DataSet* is treated internally as a stream of data. The concepts above thus
+apply to batch programs in the same way as they apply to streaming
+programs, with minor exceptions:
+
+ - [Fault tolerance for batch programs]({{< ref "docs/dev/execution/task_failure_recovery" >}})
+ does not use checkpointing. Recovery happens by fully replaying the
+ streams. That is possible, because inputs are bounded. This pushes the
+ cost more towards the recovery, but makes the regular processing cheaper,
+ because it avoids checkpoints.
+
+ - Stateful operations in the DataSet API use simplified in-memory/out-of-core
+ data structures, rather than key/value indexes.
+
+ - The DataSet API introduces special synchronized (superstep-based)
+ iterations, which are only possible on bounded streams. For details, check
+ out the [iteration docs]({{< ref "docs/dev/dataset/iterations" >}}).
+
+{{< top >}}
diff --git a/docs/content.zh/docs/concepts/time.md b/docs/content.zh/docs/concepts/time.md
new file mode 100644
index 0000000000000..473aa0d8c1895
--- /dev/null
+++ b/docs/content.zh/docs/concepts/time.md
@@ -0,0 +1,204 @@
+---
+title: 及时流处理
+weight: 3
+type: docs
+---
+
+
+# 及时流处理
+
+## Introduction
+
+Timely stream processing is an extension of [stateful stream processing]({{< ref "docs/concepts/stateful-stream-processing" >}}) in which time plays some role in the
+computation. Among other things, this is the case when you do time series
+analysis, when doing aggregations based on certain time periods (typically
+called windows), or when you do event processing where the time when an event
+occurred is important.
+
+In the following sections we will highlight some of the topics that you should
+consider when working with timely Flink Applications.
+
+{{< top >}}
+
+## Notions of Time: Event Time and Processing Time
+
+When referring to time in a streaming program (for example to define windows),
+one can refer to different notions of *time*:
+
+- **Processing time:** Processing time refers to the system time of the machine
+ that is executing the respective operation.
+
+ When a streaming program runs on processing time, all time-based operations
+ (like time windows) will use the system clock of the machines that run the
+ respective operator. An hourly processing time window will include all
+ records that arrived at a specific operator between the times when the system
+ clock indicated the full hour. For example, if an application begins running
+ at 9:15am, the first hourly processing time window will include events
+ processed between 9:15am and 10:00am, the next window will include events
+ processed between 10:00am and 11:00am, and so on.
+
+ Processing time is the simplest notion of time and requires no coordination
+ between streams and machines. It provides the best performance and the
+ lowest latency. However, in distributed and asynchronous environments
+ processing time does not provide determinism, because it is susceptible to
+ the speed at which records arrive in the system (for example from the message
+ queue), to the speed at which the records flow between operators inside the
+ system, and to outages (scheduled, or otherwise).
+
+- **Event time:** Event time is the time that each individual event occurred on
+ its producing device. This time is typically embedded within the records
+ before they enter Flink, and that *event timestamp* can be extracted from
+ each record. In event time, the progress of time depends on the data, not on
+ any wall clocks. Event time programs must specify how to generate *Event Time
+ Watermarks*, which is the mechanism that signals progress in event time. This
+ watermarking mechanism is described in a later section,
+ [below](#event-time-and-watermarks).
+
+ In a perfect world, event time processing would yield completely consistent
+ and deterministic results, regardless of when events arrive, or their
+ ordering. However, unless the events are known to arrive in-order (by
+ timestamp), event time processing incurs some latency while waiting for
+ out-of-order events. As it is only possible to wait for a finite period of
+ time, this places a limit on how deterministic event time applications can
+ be.
+
+ Assuming all of the data has arrived, event time operations will behave as
+ expected, and produce correct and consistent results even when working with
+ out-of-order or late events, or when reprocessing historic data. For example,
+ an hourly event time window will contain all records that carry an event
+ timestamp that falls into that hour, regardless of the order in which they
+ arrive, or when they are processed. (See the section on [late
+ events](#late-elements) for more information.)
+
+ Note that sometimes when event time programs are processing live data in
+ real-time, they will use some *processing time* operations in order to
+ guarantee that they are progressing in a timely fashion.
+
+{{< img src="/fig/event_processing_time.svg" alt="Event Time and Processing Time" width="80%" >}}
+
+{{< top >}}
+
+## Event Time and Watermarks
+
+*Note: Flink implements many techniques from the Dataflow Model. For a good
+introduction to event time and watermarks, have a look at the articles below.*
+
+ - [Streaming 101](https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101) by Tyler Akidau
+ - The [Dataflow Model paper](https://research.google.com/pubs/archive/43864.pdf)
+
+
+A stream processor that supports *event time* needs a way to measure the
+progress of event time. For example, a window operator that builds hourly
+windows needs to be notified when event time has passed beyond the end of an
+hour, so that the operator can close the window in progress.
+
+*Event time* can progress independently of *processing time* (measured by wall
+clocks). For example, in one program the current *event time* of an operator
+may trail slightly behind the *processing time* (accounting for a delay in
+receiving the events), while both proceed at the same speed. On the other
+hand, another streaming program might progress through weeks of event time with
+only a few seconds of processing, by fast-forwarding through some historic data
+already buffered in a Kafka topic (or another message queue).
+
+------
+
+The mechanism in Flink to measure progress in event time is **watermarks**.
+Watermarks flow as part of the data stream and carry a timestamp *t*. A
+*Watermark(t)* declares that event time has reached time *t* in that stream,
+meaning that there should be no more elements from the stream with a timestamp
+*t' <= t* (i.e. events with timestamps older than or equal to the watermark).
+
+The figure below shows a stream of events with (logical) timestamps, and
+watermarks flowing inline. In this example the events are in order (with
+respect to their timestamps), meaning that the watermarks are simply periodic
+markers in the stream.
+
+{{< img src="/fig/stream_watermark_in_order.svg" alt="A data stream with events (in order) and watermarks" width="65%" >}}
+
+Watermarks are crucial for *out-of-order* streams, as illustrated below, where
+the events are not ordered by their timestamps. In general a watermark is a
+declaration that by that point in the stream, all events up to a certain
+timestamp should have arrived. Once a watermark reaches an operator, the
+operator can advance its internal *event time clock* to the value of the
+watermark.
+
+{{< img src="/fig/stream_watermark_out_of_order.svg" alt="A data stream with events (out of order) and watermarks" width="65%" >}}
+
+Note that event time is inherited by a freshly created stream element (or
+elements) from either the event that produced them or from the watermark that
+triggered the creation of those elements.
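+
+As a sketch of how an application declares its watermark generation in the
+DataStream API (the `MyEvent` type, its `getTimestampMillis()` accessor, the
+5-second bound, and the existing `DataStream<MyEvent> events` are illustrative
+assumptions):
+
+```java
+WatermarkStrategy<MyEvent> strategy = WatermarkStrategy
+        // Watermarks trail the largest timestamp seen so far by 5 seconds,
+        // i.e. events may arrive up to 5 seconds out of order.
+        .<MyEvent>forBoundedOutOfOrderness(Duration.ofSeconds(5))
+        // Tell Flink where the event timestamp lives in each record.
+        .withTimestampAssigner((event, previousTimestamp) -> event.getTimestampMillis());
+
+DataStream<MyEvent> withTimestampsAndWatermarks =
+        events.assignTimestampsAndWatermarks(strategy);
+```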
+
+### Watermarks in Parallel Streams
+
+Watermarks are generated at, or directly after, source functions. Each parallel
+subtask of a source function usually generates its watermarks independently.
+These watermarks define the event time at that particular parallel source.
+
+As the watermarks flow through the streaming program, they advance the event
+time at the operators where they arrive. Whenever an operator advances its
+event time, it generates a new watermark downstream for its successor
+operators.
+
+Some operators consume multiple input streams; a union, for example, or
+operators following a *keyBy(...)* or *partition(...)* function. Such an
+operator's current event time is the minimum of its input streams' event times.
+As its input streams update their event times, so does the operator.
+
+The figure below shows an example of events and watermarks flowing through
+parallel streams, and operators tracking event time.
+
+{{< img src="/fig/parallel_streams_watermarks.svg" alt="Parallel data streams and operators with events and watermarks" class="center" width="80%" >}}
+
+## Lateness
+
+It is possible that certain elements will violate the watermark condition,
+meaning that even after the *Watermark(t)* has occurred, more elements with
+timestamp *t' <= t* will occur. In fact, in many real world setups, certain
+elements can be arbitrarily delayed, making it impossible to specify a time by
+which all elements of a certain event timestamp will have occurred.
+Furthermore, even if the lateness can be bounded, delaying the watermarks by
+too much is often not desirable, because it causes too much delay in the
+evaluation of event time windows.
+
+For this reason, streaming programs may explicitly expect some *late* elements.
+Late elements are elements that arrive after the system's event time clock (as
+signaled by the watermarks) has already passed the time of the late element's
+timestamp. See [Allowed Lateness]({{< ref "docs/dev/datastream/operators/windows" >}}#allowed-lateness) for more information on
+how to work with late elements in event time windows.
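+
+For example, a windowed aggregation can keep its window state around for an
+extra grace period so that late (but bounded) elements still update the result.
+The stream of `Tuple2<String, Long>` records and the concrete durations below
+are illustrative assumptions:
+
+```java
+DataStream<Tuple2<String, Long>> result = events
+        .keyBy(value -> value.f0)
+        .window(TumblingEventTimeWindows.of(Time.hours(1)))
+        // Accept elements up to 10 minutes after the watermark has passed the
+        // end of the window; anything later is dropped (or sent to a side
+        // output, if one is configured).
+        .allowedLateness(Time.minutes(10))
+        .sum(1);
+```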
+
+## Windowing
+
+Aggregating events (e.g., counts, sums) works differently on streams than in
+batch processing. For example, it is impossible to count all elements in a
+stream, because streams are in general infinite (unbounded). Instead,
+aggregates on streams (counts, sums, etc), are scoped by **windows**, such as
+*"count over the last 5 minutes"*, or *"sum of the last 100 elements"*.
+
+Windows can be *time driven* (example: every 30 seconds) or *data driven*
+(example: every 100 elements). One typically distinguishes different types of
+windows, such as *tumbling windows* (no overlap), *sliding windows* (with
+overlap), and *session windows* (punctuated by a gap of inactivity).
+
+{{< img src="/fig/windows.svg" alt="Time- and Count Windows" class="offset" width="80%" >}}
+
+Please check out this [blog post](https://flink.apache.org/news/2015/12/04/Introducing-windows.html) for
+additional examples of windows, or take a look at the [window documentation]({{< ref "docs/dev/datastream/operators/windows" >}}) of the DataStream API.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/_index.md b/docs/content.zh/docs/connectors/_index.md
new file mode 100644
index 0000000000000..ee1ddc1be0fb5
--- /dev/null
+++ b/docs/content.zh/docs/connectors/_index.md
@@ -0,0 +1,25 @@
+---
+title: Connectors
+icon:
+bold: true
+bookCollapseSection: true
+weight: 6
+---
+
\ No newline at end of file
diff --git a/docs/content.zh/docs/connectors/dataset.md b/docs/content.zh/docs/connectors/dataset.md
new file mode 100644
index 0000000000000..2f6bd97d4d7fd
--- /dev/null
+++ b/docs/content.zh/docs/connectors/dataset.md
@@ -0,0 +1,190 @@
+---
+title: "DataSet Connectors"
+weight: 11
+type: docs
+aliases:
+ - /zh/dev/batch/connectors.html
+---
+
+
+# DataSet Connectors
+
+
+## Reading from and writing to file systems
+
+The Apache Flink project supports multiple [file systems]({{< ref "docs/deployment/filesystems/overview" >}}) that can be used as backing stores
+for input and output connectors.
+
+## Connecting to other systems using Input/OutputFormat wrappers for Hadoop
+
+Apache Flink allows users to access many different systems as data sources or sinks.
+The system is designed for very easy extensibility. Similar to Apache Hadoop, Flink has the concept
+of so called `InputFormat`s and `OutputFormat`s.
+
+One implementation of these `InputFormat`s is the `HadoopInputFormat`. This is a wrapper that allows
+users to use all existing Hadoop input formats with Flink.
+
+This section shows some examples for connecting Flink to other systems.
+[Read more about Hadoop compatibility in Flink]({{< ref "docs/dev/dataset/hadoop_compatibility" >}}).
+
+## Avro support in Flink
+
+Flink has extensive built-in support for [Apache Avro](http://avro.apache.org/). This allows Flink to easily read from Avro files.
+Also, Flink's serialization framework is able to handle classes generated from Avro schemas. Be sure to add the Flink Avro dependency to the `pom.xml` of your project.
+
+```xml
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-avro</artifactId>
+  <version>{{ site.version }}</version>
+</dependency>
+```
+
+In order to read data from an Avro file, you have to specify an `AvroInputFormat`.
+
+**Example**:
+
+```java
+AvroInputFormat<User> users = new AvroInputFormat<>(in, User.class);
+DataSet<User> usersDS = env.createInput(users);
+```
+
+Note that `User` is a POJO generated by Avro. Flink also allows performing string-based key selection on these POJOs. For example:
+
+```java
+usersDS.groupBy("name")
+```
+
+
+Note that using the `GenericData.Record` type is possible with Flink, but not recommended. Since the record contains the full schema, it is very data intensive and thus probably slow to use.
+
+Flink's POJO field selection also works with POJOs generated from Avro. However, this is only possible if the field types are written correctly to the generated class. If a field is of type `Object`, you cannot use the field as a join or grouping key.
+Specifying a field in Avro like `{"name": "type_double_test", "type": "double"},` works fine; however, specifying it as a UNION type with only one field (`{"name": "type_double_test", "type": ["double"]},`) will generate a field of type `Object`. Note that specifying nullable types (`{"name": "type_double_test", "type": ["null", "double"]},`) is possible!
+
+
+
+### Access Microsoft Azure Table Storage
+
+_Note: This example works starting from Flink 0.6-incubating_
+
+This example is using the `HadoopInputFormat` wrapper to use an existing Hadoop input format implementation for accessing [Azure's Table Storage](https://azure.microsoft.com/en-us/documentation/articles/storage-introduction/).
+
+1. Download and compile the `azure-tables-hadoop` project. The input format developed by the project is not yet available in Maven Central, therefore, we have to build the project ourselves.
+Execute the following commands:
+
+```bash
+git clone https://github.com/mooso/azure-tables-hadoop.git
+cd azure-tables-hadoop
+mvn clean install
+```
+
+2. Set up a new Flink project using the quickstarts:
+
+```bash
+curl https://flink.apache.org/q/quickstart.sh | bash
+```
+
+3. Add the following dependencies (in the `<dependencies>` section) to your `pom.xml` file:
+
+```xml
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-hadoop-compatibility{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version}}</version>
+</dependency>
+<dependency>
+  <groupId>com.microsoft.hadoop</groupId>
+  <artifactId>microsoft-hadoop-azure</artifactId>
+  <version>0.0.4</version>
+</dependency>
+```
+
+`flink-hadoop-compatibility` is a Flink package that provides the Hadoop input format wrappers.
+`microsoft-hadoop-azure` adds the project we built earlier to our project.
+
+The project is now ready for coding. We recommend importing the project into an IDE, such as Eclipse or IntelliJ (import it as a Maven project!).
+Browse to the code of the `Job.java` file. It is an empty skeleton for a Flink job.
+
+Paste the following code into it:
+
+```java
+import java.util.Map;
+import org.apache.flink.api.common.functions.MapFunction;
+import org.apache.flink.api.java.DataSet;
+import org.apache.flink.api.java.ExecutionEnvironment;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.flink.hadoopcompatibility.mapreduce.HadoopInputFormat;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import com.microsoft.hadoop.azure.AzureTableConfiguration;
+import com.microsoft.hadoop.azure.AzureTableInputFormat;
+import com.microsoft.hadoop.azure.WritableEntity;
+import com.microsoft.windowsazure.storage.table.EntityProperty;
+
+public class AzureTableExample {
+
+  public static void main(String[] args) throws Exception {
+    // set up the execution environment
+    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
+
+    // create an AzureTableInputFormat, using a Hadoop input format wrapper
+    HadoopInputFormat<Text, WritableEntity> hdIf =
+        new HadoopInputFormat<>(new AzureTableInputFormat(), Text.class, WritableEntity.class, new Job());
+
+    // set the Account URI, something like: https://apacheflink.table.core.windows.net
+    hdIf.getConfiguration().set(AzureTableConfiguration.Keys.ACCOUNT_URI.getKey(), "TODO");
+    // set the secret storage key here
+    hdIf.getConfiguration().set(AzureTableConfiguration.Keys.STORAGE_KEY.getKey(), "TODO");
+    // set the table name here
+    hdIf.getConfiguration().set(AzureTableConfiguration.Keys.TABLE_NAME.getKey(), "TODO");
+
+    DataSet<Tuple2<Text, WritableEntity>> input = env.createInput(hdIf);
+    // a small example of how to use the data in a mapper
+    DataSet<String> fin = input.map(new MapFunction<Tuple2<Text, WritableEntity>, String>() {
+      @Override
+      public String map(Tuple2<Text, WritableEntity> arg0) throws Exception {
+        System.err.println("--------------------------------\nKey = " + arg0.f0);
+        WritableEntity we = arg0.f1;
+
+        for (Map.Entry<String, EntityProperty> prop : we.getProperties().entrySet()) {
+          System.err.println("key=" + prop.getKey() + " ; value (asString)=" + prop.getValue().getValueAsString());
+        }
+
+        return arg0.f0.toString();
+      }
+    });
+
+    // emit result (this works only locally)
+    fin.print();
+
+    // execute program
+    env.execute("Azure Example");
+  }
+}
+```
+
+The example shows how to access an Azure table and turn its data into Flink's `DataSet` (more specifically, the type of the set is `DataSet<Tuple2<Text, WritableEntity>>`). You can then apply all known transformations to the `DataSet`.
+
+## Access MongoDB
+
+This [GitHub repository documents how to use MongoDB with Apache Flink (starting from 0.7-incubating)](https://github.com/okkam-it/flink-mongodb-test).
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/_index.md b/docs/content.zh/docs/connectors/datastream/_index.md
new file mode 100644
index 0000000000000..c4ae18206f407
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/_index.md
@@ -0,0 +1,23 @@
+---
+title: DataStream Connectors
+bookCollapseSection: true
+weight: 1
+---
+
\ No newline at end of file
diff --git a/docs/content.zh/docs/connectors/datastream/cassandra.md b/docs/content.zh/docs/connectors/datastream/cassandra.md
new file mode 100644
index 0000000000000..97eb6daf907bb
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/cassandra.md
@@ -0,0 +1,284 @@
+---
+title: Cassandra
+weight: 3
+type: docs
+aliases:
+ - /zh/dev/connectors/cassandra.html
+ - /zh/apis/streaming/connectors/cassandra.html
+---
+
+
+# Apache Cassandra Connector
+
+This connector provides sinks that write data into an [Apache Cassandra](https://cassandra.apache.org/) database.
+
+
+
+To use this connector, add the following dependency to your project:
+
+{{< artifact flink-connector-cassandra withScalaVersion >}}
+
+Note that the streaming connectors are currently __NOT__ part of the binary distribution. See how to link with them for cluster execution [here]({{< ref "docs/dev/datastream/project-configuration" >}}).
+
+## Installing Apache Cassandra
+There are multiple ways to bring up a Cassandra instance on a local machine:
+
+1. Follow the instructions from the [Cassandra Getting Started page](http://cassandra.apache.org/doc/latest/getting_started/index.html).
+2. Launch a container running Cassandra from the [official Docker repository](https://hub.docker.com/_/cassandra/).
+
+## Cassandra Sinks
+
+### Configurations
+
+Flink's Cassandra sinks are created with the static `CassandraSink.addSink(DataStream<IN> input)` method.
+This method returns a `CassandraSinkBuilder`, which offers methods to further configure the sink and finally `build()` the sink instance.
+
+The following configuration methods can be used (the sketch after this list shows several of them chained together):
+
+1. _setQuery(String query)_
+ * Sets the upsert query that is executed for every record the sink receives.
+ * The query is internally treated as CQL statement.
+ * __DO__ set the upsert query for processing __Tuple__ data type.
+ * __DO NOT__ set the query for processing __POJO__ data types.
+2. _setClusterBuilder()_
+ * Sets the cluster builder that is used to configure the connection to Cassandra with more sophisticated settings such as consistency level, retry policy, etc.
+3. _setHost(String host[, int port])_
+ * Simple version of setClusterBuilder() with host/port information to connect to Cassandra instances
+4. _setMapperOptions(MapperOptions options)_
+ * Sets the mapper options that are used to configure the DataStax ObjectMapper.
+ * Only applies when processing __POJO__ data types.
+5. _setMaxConcurrentRequests(int maxConcurrentRequests, Duration timeout)_
+ * Sets the maximum allowed number of concurrent requests with a timeout for acquiring permits to execute.
+ * Only applies when __enableWriteAheadLog()__ is not configured.
+6. _enableWriteAheadLog([CheckpointCommitter committer])_
+ * An __optional__ setting
+ * Allows exactly-once processing for non-deterministic algorithms.
+7. _setFailureHandler([CassandraFailureHandler failureHandler])_
+ * An __optional__ setting
+ * Sets the custom failure handler.
+8. _build()_
+ * Finalizes the configuration and constructs the CassandraSink instance.
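+
+As a hedged sketch of how several of these methods chain together (the `resultStream` variable is hypothetical, a `DataStream<Tuple2<String, Long>>` as in the Tuple example below; values are illustrative):
+
+```java
+CassandraSink.addSink(resultStream)
+    // Tuple input, so an upsert query is required (see setQuery above)
+    .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);")
+    .setHost("127.0.0.1", 9042)
+    .setMaxConcurrentRequests(500, Duration.ofSeconds(30))
+    .build();
+```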
+
+### Write-ahead Log
+
+A checkpoint committer stores additional information about completed checkpoints
+in some resource. This information is used to prevent a full replay of the last
+completed checkpoint in case of a failure.
+You can use a `CassandraCommitter` to store these in a separate table in Cassandra.
+Note that this table will NOT be cleaned up by Flink.
+
+Flink can provide exactly-once guarantees if the query is idempotent (meaning it can be applied multiple
+times without changing the result) and checkpointing is enabled. In case of a failure the failed
+checkpoint will be replayed completely.
+
+Furthermore, for non-deterministic programs the write-ahead log has to be enabled. For such a program
+the replayed checkpoint may be completely different than the previous attempt, which may leave the
+database in an inconsistent state since part of the first attempt may already be written.
+The write-ahead log guarantees that the replayed checkpoint is identical to the first attempt.
+Note that enabling this feature will have an adverse impact on latency.
+
+
+Note: The write-ahead log functionality is currently experimental. In many cases it is sufficient to use the connector without enabling it. Please report problems to the development mailing list.
+
+### Checkpointing and Fault Tolerance
+With checkpointing enabled, the Cassandra sink guarantees at-least-once delivery of action requests to the C* instance.
+
+More details can be found in the [checkpointing docs]({{< ref "docs/dev/datastream/fault-tolerance/checkpointing" >}}) and the [fault tolerance guarantees docs]({{< ref "docs/connectors/datastream/guarantees" >}}).
+
+## Examples
+
+The Cassandra sinks currently support both Tuple and POJO data types, and Flink automatically detects which type of input is used. For the general use of these streaming data types, please refer to [Supported Data Types]({{< ref "docs/dev/serialization/types_serialization" >}}#supported-data-types). We show two implementations based on [SocketWindowWordCount](https://github.com/apache/flink/blob/master/flink-examples/flink-examples-streaming/src/main/java/org/apache/flink/streaming/examples/socket/SocketWindowWordCount.java), for POJO and Tuple data types respectively.
+
+In all these examples, we assume that the associated keyspace `example` and table `wordcount` have been created.
+
+{{< tabs "ffc5c4d4-7872-479c-bfa6-206b9e96f6f3" >}}
+{{< tab "CQL" >}}
+```sql
+CREATE KEYSPACE IF NOT EXISTS example
+ WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
+
+CREATE TABLE IF NOT EXISTS example.wordcount (
+ word text,
+ count bigint,
+ PRIMARY KEY(word)
+);
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Cassandra Sink Example for Streaming Tuple Data Type
+When storing the result with the Java/Scala Tuple data type to a Cassandra sink, a CQL upsert statement must be set (via `setQuery('stmt')`) to persist each record back to the database. With the upsert query cached as a `PreparedStatement`, each Tuple element is converted into parameters of the statement.
+
+For details about `PreparedStatement` and `BoundStatement`, please visit [DataStax Java Driver manual](https://docs.datastax.com/en/developer/java-driver/2.1/manual/statements/prepared/)
+
+{{< tabs "1a84c6a0-0b2f-4f96-8cf8-43ec6dd3bc5d" >}}
+{{< tab "Java" >}}
+```java
+// get the execution environment
+final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+// get input data by connecting to the socket
+DataStream<String> text = env.socketTextStream(hostname, port, "\n");
+
+// parse the data, group it, window it, and aggregate the counts
+DataStream<Tuple2<String, Long>> result = text
+        .flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
+            @Override
+            public void flatMap(String value, Collector<Tuple2<String, Long>> out) {
+                // normalize and split the line
+                String[] words = value.toLowerCase().split("\\s");
+
+                // emit the pairs
+                for (String word : words) {
+                    //Do not accept empty word, since word is defined as primary key in C* table
+                    if (!word.isEmpty()) {
+                        out.collect(new Tuple2<String, Long>(word, 1L));
+                    }
+                }
+            }
+        })
+        .keyBy(value -> value.f0)
+        .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
+        .sum(1);
+
+CassandraSink.addSink(result)
+ .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);")
+ .setHost("127.0.0.1")
+ .build();
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
+
+// get input data by connecting to the socket
+val text: DataStream[String] = env.socketTextStream(hostname, port, '\n')
+
+// parse the data, group it, window it, and aggregate the counts
+val result: DataStream[(String, Long)] = text
+ // split up the lines in pairs (2-tuples) containing: (word,1)
+ .flatMap(_.toLowerCase.split("\\s"))
+ .filter(_.nonEmpty)
+ .map((_, 1L))
+ // group by the tuple field "0" and sum up tuple field "1"
+ .keyBy(_._1)
+ .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
+ .sum(1)
+
+CassandraSink.addSink(result)
+ .setQuery("INSERT INTO example.wordcount(word, count) values (?, ?);")
+ .setHost("127.0.0.1")
+ .build()
+
+result.print().setParallelism(1)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+
+### Cassandra Sink Example for Streaming POJO Data Type
+This example streams a POJO data type and stores the same POJO entity back to Cassandra. In addition, this POJO implementation needs to follow the [DataStax Java Driver Manual](http://docs.datastax.com/en/developer/java-driver/2.1/manual/object_mapper/creating/) and annotate the class, as each field of this entity is mapped to an associated column of the designated table using the DataStax Java Driver `com.datastax.driver.mapping.Mapper` class.
+
+The mapping of each table column can be defined through annotations placed on a field declaration in the Pojo class. For details of the mapping, please refer to CQL documentation on [Definition of Mapped Classes](http://docs.datastax.com/en/developer/java-driver/3.1/manual/object_mapper/creating/) and [CQL Data types](https://docs.datastax.com/en/cql/3.1/cql/cql_reference/cql_data_types_c.html)
+
+{{< tabs "d65ca6f5-acb2-4f2c-b5b6-d986eafca765" >}}
+{{< tab "Java" >}}
+```java
+// get the execution environment
+final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+// get input data by connecting to the socket
+DataStream<String> text = env.socketTextStream(hostname, port, "\n");
+
+// parse the data, group it, window it, and aggregate the counts
+DataStream<WordCount> result = text
+        .flatMap(new FlatMapFunction<String, WordCount>() {
+            public void flatMap(String value, Collector<WordCount> out) {
+                // normalize and split the line
+                String[] words = value.toLowerCase().split("\\s");
+
+                // emit the pairs
+                for (String word : words) {
+                    if (!word.isEmpty()) {
+                        //Do not accept empty word, since word is defined as primary key in C* table
+                        out.collect(new WordCount(word, 1L));
+                    }
+                }
+            }
+        })
+        .keyBy(WordCount::getWord)
+        .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
+
+        .reduce(new ReduceFunction<WordCount>() {
+            @Override
+            public WordCount reduce(WordCount a, WordCount b) {
+                return new WordCount(a.getWord(), a.getCount() + b.getCount());
+            }
+        });
+
+CassandraSink.addSink(result)
+ .setHost("127.0.0.1")
+ .setMapperOptions(() -> new Mapper.Option[]{Mapper.Option.saveNullFields(true)})
+ .build();
+
+
+@Table(keyspace = "example", name = "wordcount")
+public class WordCount {
+
+ @Column(name = "word")
+ private String word = "";
+
+ @Column(name = "count")
+ private long count = 0;
+
+ public WordCount() {}
+
+ public WordCount(String word, long count) {
+ this.setWord(word);
+ this.setCount(count);
+ }
+
+ public String getWord() {
+ return word;
+ }
+
+ public void setWord(String word) {
+ this.word = word;
+ }
+
+ public long getCount() {
+ return count;
+ }
+
+ public void setCount(long count) {
+ this.count = count;
+ }
+
+ @Override
+ public String toString() {
+ return getWord() + " : " + getCount();
+ }
+}
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/elasticsearch.md b/docs/content.zh/docs/connectors/datastream/elasticsearch.md
new file mode 100644
index 0000000000000..2559151b00ea9
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/elasticsearch.md
@@ -0,0 +1,471 @@
+---
+title: Elasticsearch
+weight: 5
+type: docs
+aliases:
+ - /zh/dev/connectors/elasticsearch.html
+ - /zh/apis/streaming/connectors/elasticsearch.html
+ - /zh/dev/connectors/elasticsearch2.html
+ - /zh/apis/streaming/connectors/elasticsearch2.html
+---
+
+
+# Elasticsearch Connector
+
+This connector provides sinks that can request document actions against an
+[Elasticsearch](https://elastic.co/) index. To use this connector, add one
+of the following dependencies to your project, depending on the version
+of your Elasticsearch installation:
+
+
+
+Note that the streaming connectors are currently not part of the binary
+distribution. See [here]({{< ref "docs/dev/datastream/project-configuration" >}}) for information
+about how to package the program with the libraries for cluster execution.
+
+## Installing Elasticsearch
+
+Instructions for setting up an Elasticsearch cluster can be found
+[here](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html).
+Make sure to set and remember a cluster name. This must be set when
+creating an `ElasticsearchSink` for requesting document actions against your cluster.
+
+## Elasticsearch Sink
+
+The `ElasticsearchSink` uses a `TransportClient` (before 6.x) or `RestHighLevelClient` (starting with 6.x) to communicate with an
+Elasticsearch cluster.
+
+The example below shows how to configure and create a sink:
+
+{{< tabs "51732edd-4218-470e-adad-b1ebb4021ae4" >}}
+{{< tab "java, 5.x" >}}
+```java
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
+import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
+import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink;
+
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.client.Requests;
+
+import java.net.InetAddress;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+DataStream<String> input = ...;
+
+Map<String, String> config = new HashMap<>();
+config.put("cluster.name", "my-cluster-name");
+// This instructs the sink to emit after every element, otherwise they would be buffered
+config.put("bulk.flush.max.actions", "1");
+
+List<InetSocketAddress> transportAddresses = new ArrayList<>();
+transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), 9300));
+transportAddresses.add(new InetSocketAddress(InetAddress.getByName("10.2.3.1"), 9300));
+
+input.addSink(new ElasticsearchSink<>(config, transportAddresses, new ElasticsearchSinkFunction<String>() {
+    public IndexRequest createIndexRequest(String element) {
+        Map<String, String> json = new HashMap<>();
+        json.put("data", element);
+
+        return Requests.indexRequest()
+                .index("my-index")
+                .type("my-type")
+                .source(json);
+    }
+
+    @Override
+    public void process(String element, RuntimeContext ctx, RequestIndexer indexer) {
+        indexer.add(createIndexRequest(element));
+    }
+}));
+```
+{{< /tab >}}
+{{< tab "java, Elasticsearch 6.x and above" >}}
+```java
+import org.apache.flink.api.common.functions.RuntimeContext;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
+import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
+import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
+
+import org.apache.http.HttpHost;
+import org.elasticsearch.action.index.IndexRequest;
+import org.elasticsearch.client.Requests;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+DataStream<String> input = ...;
+
+List<HttpHost> httpHosts = new ArrayList<>();
+httpHosts.add(new HttpHost("127.0.0.1", 9200, "http"));
+httpHosts.add(new HttpHost("10.2.3.1", 9200, "http"));
+
+// use an ElasticsearchSink.Builder to create an ElasticsearchSink
+ElasticsearchSink.Builder<String> esSinkBuilder = new ElasticsearchSink.Builder<>(
+    httpHosts,
+    new ElasticsearchSinkFunction<String>() {
+        public IndexRequest createIndexRequest(String element) {
+            Map<String, String> json = new HashMap<>();
+            json.put("data", element);
+
+            return Requests.indexRequest()
+                    .index("my-index")
+                    .type("my-type")
+                    .source(json);
+        }
+
+        @Override
+        public void process(String element, RuntimeContext ctx, RequestIndexer indexer) {
+            indexer.add(createIndexRequest(element));
+        }
+    }
+);
+
+// configuration for the bulk requests; this instructs the sink to emit after every element, otherwise they would be buffered
+esSinkBuilder.setBulkFlushMaxActions(1);
+
+// provide a RestClientFactory for custom configuration on the internally created REST client
+esSinkBuilder.setRestClientFactory(
+ restClientBuilder -> {
+ restClientBuilder.setDefaultHeaders(...)
+ restClientBuilder.setMaxRetryTimeoutMillis(...)
+ restClientBuilder.setPathPrefix(...)
+ restClientBuilder.setHttpClientConfigCallback(...)
+ }
+);
+
+// finally, build and add the sink to the job's pipeline
+input.addSink(esSinkBuilder.build());
+```
+{{< /tab >}}
+{{< tab "scala, 5.x" >}}
+```scala
+import org.apache.flink.api.common.functions.RuntimeContext
+import org.apache.flink.streaming.api.datastream.DataStream
+import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction
+import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer
+import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink
+
+import org.elasticsearch.action.index.IndexRequest
+import org.elasticsearch.client.Requests
+
+import java.net.InetAddress
+import java.net.InetSocketAddress
+import java.util.ArrayList
+import java.util.HashMap
+import java.util.List
+import java.util.Map
+
+val input: DataStream[String] = ...
+
+val config = new java.util.HashMap[String, String]
+config.put("cluster.name", "my-cluster-name")
+// This instructs the sink to emit after every element, otherwise they would be buffered
+config.put("bulk.flush.max.actions", "1")
+
+val transportAddresses = new java.util.ArrayList[InetSocketAddress]
+transportAddresses.add(new InetSocketAddress(InetAddress.getByName("127.0.0.1"), 9300))
+transportAddresses.add(new InetSocketAddress(InetAddress.getByName("10.2.3.1"), 9300))
+
+input.addSink(new ElasticsearchSink(config, transportAddresses, new ElasticsearchSinkFunction[String] {
+ def createIndexRequest(element: String): IndexRequest = {
+ val json = new java.util.HashMap[String, String]
+ json.put("data", element)
+
+    return Requests.indexRequest()
+      .index("my-index")
+      .`type`("my-type")
+      .source(json)
+  }
+
+  override def process(element: String, ctx: RuntimeContext, indexer: RequestIndexer): Unit = {
+    indexer.add(createIndexRequest(element))
+  }
+}))
+```
+{{< /tab >}}
+{{< tab "scala, Elasticsearch 6.x and above" >}}
+```scala
+import org.apache.flink.api.common.functions.RuntimeContext
+import org.apache.flink.streaming.api.datastream.DataStream
+import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction
+import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer
+import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink
+
+import org.apache.http.HttpHost
+import org.elasticsearch.action.index.IndexRequest
+import org.elasticsearch.client.Requests
+
+import java.util.ArrayList
+import java.util.List
+
+val input: DataStream[String] = ...
+
+val httpHosts = new java.util.ArrayList[HttpHost]
+httpHosts.add(new HttpHost("127.0.0.1", 9200, "http"))
+httpHosts.add(new HttpHost("10.2.3.1", 9200, "http"))
+
+val esSinkBuilder = new ElasticsearchSink.Builder[String](
+ httpHosts,
+ new ElasticsearchSinkFunction[String] {
+ def process(element: String, ctx: RuntimeContext, indexer: RequestIndexer) {
+ val json = new java.util.HashMap[String, String]
+ json.put("data", element)
+
+ val rqst: IndexRequest = Requests.indexRequest
+ .index("my-index")
+ .`type`("my-type")
+ .source(json)
+
+ indexer.add(rqst)
+ }
+ }
+)
+
+// configuration for the bulk requests; this instructs the sink to emit after every element, otherwise they would be buffered
+esSinkBuilder.setBulkFlushMaxActions(1)
+
+// provide a RestClientFactory for custom configuration on the internally created REST client
+esSinkBuilder.setRestClientFactory(new RestClientFactory {
+ override def configureRestClientBuilder(restClientBuilder: RestClientBuilder): Unit = {
+ restClientBuilder.setDefaultHeaders(...)
+ restClientBuilder.setMaxRetryTimeoutMillis(...)
+ restClientBuilder.setPathPrefix(...)
+ restClientBuilder.setHttpClientConfigCallback(...)
+ }
+})
+
+// finally, build and add the sink to the job's pipeline
+input.addSink(esSinkBuilder.build)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+For Elasticsearch versions that still use the now deprecated `TransportClient` to communicate
+with the Elasticsearch cluster (i.e., versions at or below 5.x), note how a `Map` of `String`s
+is used to configure the `ElasticsearchSink`. This config map will be directly
+forwarded when creating the internally used `TransportClient`.
+The configuration keys are documented in the Elasticsearch documentation
+[here](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html).
+Especially important is the `cluster.name` parameter that must correspond to
+the name of your cluster.
+
+For Elasticsearch 6.x and above, the `RestHighLevelClient` is used internally for cluster communication.
+By default, the connector uses the default configuration for the REST client. To use a custom
+configuration for the REST client, users can provide a `RestClientFactory` implementation when
+setting up the `ElasticsearchSink.Builder` that builds the sink.
+
+Also note that the example only demonstrates performing a single index
+request for each incoming element. Generally, the `ElasticsearchSinkFunction`
+can be used to perform multiple requests of different types (e.g.,
+`DeleteRequest`, `UpdateRequest`, etc.).
+
+Internally, each parallel instance of the Flink Elasticsearch Sink uses
+a `BulkProcessor` to send action requests to the cluster.
+This will buffer elements before sending them in bulk to the cluster. The `BulkProcessor`
+executes bulk requests one at a time, i.e. there will be no two concurrent
+flushes of the buffered actions in progress.
+
+### Elasticsearch Sinks and Fault Tolerance
+
+With Flink’s checkpointing enabled, the Flink Elasticsearch Sink guarantees
+at-least-once delivery of action requests to Elasticsearch clusters. It does
+so by waiting for all pending action requests in the `BulkProcessor` at the
+time of checkpoints. This effectively assures that all requests before the
+checkpoint was triggered have been successfully acknowledged by Elasticsearch, before
+proceeding to process more records sent to the sink.
+
+More details on checkpoints and fault tolerance are in the [fault tolerance docs]({{< ref "docs/learn-flink/fault_tolerance" >}}).
+
+To use fault tolerant Elasticsearch Sinks, checkpointing of the topology needs to be enabled at the execution environment:
+
+{{< tabs "d00d1e93-4844-40d7-b0ec-9ec37e73145e" >}}
+{{< tab "Java" >}}
+```java
+final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+env.enableCheckpointing(5000); // checkpoint every 5000 msecs
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+env.enableCheckpointing(5000) // checkpoint every 5000 msecs
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+
+NOTE: Users can disable flushing if they wish to do so, by calling
+`disableFlushOnCheckpoint()` on the created `ElasticsearchSink`. Be aware
+that this essentially means the sink will not provide any strong
+delivery guarantees anymore, even with checkpointing enabled for the topology.
+
+
+### Handling Failing Elasticsearch Requests
+
+Elasticsearch action requests may fail due to a variety of reasons, including
+temporarily saturated node queue capacity or malformed documents to be indexed.
+The Flink Elasticsearch Sink allows the user to specify how request
+failures are handled, by simply implementing an `ActionRequestFailureHandler` and
+providing it to the constructor.
+
+Below is an example:
+
+{{< tabs "ddb958b3-5dd5-476e-b946-ace3335628b2" >}}
+{{< tab "Java" >}}
+```java
+DataStream<String> input = ...;
+
+input.addSink(new ElasticsearchSink<>(
+    config, transportAddresses,
+    new ElasticsearchSinkFunction<String>() {...},
+    new ActionRequestFailureHandler() {
+        @Override
+        public void onFailure(ActionRequest action,
+                Throwable failure,
+                int restStatusCode,
+                RequestIndexer indexer) throws Throwable {
+
+            if (ExceptionUtils.findThrowable(failure, EsRejectedExecutionException.class).isPresent()) {
+                // full queue; re-add document for indexing
+                indexer.add(action);
+            } else if (ExceptionUtils.findThrowable(failure, ElasticsearchParseException.class).isPresent()) {
+                // malformed document; simply drop request without failing sink
+            } else {
+                // for all other failures, fail the sink
+                // here the failure is simply rethrown, but users can also choose to throw custom exceptions
+                throw failure;
+            }
+        }
+}));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val input: DataStream[String] = ...
+
+input.addSink(new ElasticsearchSink(
+  config, transportAddresses,
+  new ElasticsearchSinkFunction[String] {...},
+  new ActionRequestFailureHandler {
+    @throws(classOf[Throwable])
+    override def onFailure(action: ActionRequest,
+        failure: Throwable,
+        restStatusCode: Int,
+        indexer: RequestIndexer): Unit = {
+
+      if (ExceptionUtils.findThrowable(failure, classOf[EsRejectedExecutionException]).isPresent) {
+        // full queue; re-add document for indexing
+        indexer.add(action)
+      } else if (ExceptionUtils.findThrowable(failure, classOf[ElasticsearchParseException]).isPresent) {
+        // malformed document; simply drop request without failing sink
+      } else {
+        // for all other failures, fail the sink
+        // here the failure is simply rethrown, but users can also choose to throw custom exceptions
+        throw failure
+      }
+    }
+}))
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The above example will let the sink re-add requests that failed due to
+queue capacity saturation and drop requests with malformed documents, without
+failing the sink. For all other failures, the sink will fail. If an `ActionRequestFailureHandler`
+is not provided to the constructor, the sink will fail for any kind of error.
+
+Note that `onFailure` is called only for failures that still occur after the
+`BulkProcessor` has internally finished all backoff retry attempts.
+By default, the `BulkProcessor` retries up to a maximum of 8 attempts with
+an exponential backoff. For more information on the behaviour of the
+internal `BulkProcessor` and how to configure it, please see the following section.
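+
+As a hedged sketch, the flush and backoff behaviour of the internal `BulkProcessor` can be tuned on the builder (reusing the `esSinkBuilder` from the 6.x example above; values are illustrative and setter names may differ between connector versions):
+
+```java
+esSinkBuilder.setBulkFlushMaxActions(500);      // flush after 500 buffered actions
+esSinkBuilder.setBulkFlushInterval(5000L);      // or at least every 5 seconds
+esSinkBuilder.setBulkFlushBackoff(true);
+esSinkBuilder.setBulkFlushBackoffType(ElasticsearchSinkBase.FlushBackoffType.EXPONENTIAL);
+esSinkBuilder.setBulkFlushBackoffRetries(8);    // matches the default mentioned above
+esSinkBuilder.setBulkFlushBackoffDelay(100L);   // initial delay in milliseconds
+```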
+
+By default, if a failure handler is not provided, the sink uses a
+`NoOpFailureHandler` that simply fails for all kinds of exceptions. The
+connector also provides a `RetryRejectedExecutionFailureHandler` implementation
+that always re-adds requests that have failed due to queue capacity saturation.
+
+
+IMPORTANT: Re-adding requests back to the internal BulkProcessor
+on failures will lead to longer checkpoints, as the sink will also
+need to wait for the re-added requests to be flushed when checkpointing.
+For example, when using RetryRejectedExecutionFailureHandler, checkpoints
+will need to wait until Elasticsearch node queues have enough capacity for
+all the pending requests. This also means that if re-added requests never
+succeed, the checkpoint will never finish.
+
+
+
+
+### Data Loss
+
+Depending on your Kafka configuration, you can still experience data loss even after Kafka has acknowledged writes. In particular, keep the following Kafka settings in mind:
+
+- `acks`
+- `log.flush.interval.messages`
+- `log.flush.interval.ms`
+- `log.flush.*`
+
+The default values of the above options can easily lead to data loss. Please refer to the Kafka documentation for more explanation.
+
+
+
+### UnknownTopicOrPartitionException
+
+One possible cause of this error is an ongoing leader election, for example after or during a restart of a Kafka broker. This is a retriable exception, so the Flink job should be able to restart and resume normal operation. It can also be circumvented by changing the `retries` property in the producer settings. However, this might cause messages to be reordered, which in turn can be avoided by setting `max.in.flight.requests.per.connection` to 1.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/kinesis.md b/docs/content.zh/docs/connectors/datastream/kinesis.md
new file mode 100644
index 0000000000000..3088f0b9a94d1
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/kinesis.md
@@ -0,0 +1,701 @@
+---
+title: Kinesis
+weight: 4
+type: docs
+aliases:
+ - /zh/dev/connectors/kinesis.html
+ - /zh/apis/streaming/connectors/kinesis.html
+---
+
+
+# Amazon Kinesis Data Streams Connector
+
+The Kinesis connector provides access to [Amazon AWS Kinesis Streams](http://aws.amazon.com/kinesis/streams/).
+
+To use the connector, add the following Maven dependency to your project:
+
+{{< artifact flink-connector-kinesis withScalaVersion >}}
+
+{{< hint warning >}}
+**Attention** Prior to Flink version 1.10.0 the `flink-connector-kinesis{{< scala_version >}}` artifact had a dependency on code licensed under the [Amazon Software License](https://aws.amazon.com/asl/).
+Linking to those prior versions of flink-connector-kinesis will include this code in your application.
+{{< /hint >}}
+
+Due to the licensing issue, the `flink-connector-kinesis{{< scala_version >}}` artifact is not deployed to Maven Central for those prior versions. Please see the version-specific documentation for further information.
+
+## Using the Amazon Kinesis Streams Service
+Follow the instructions from the [Amazon Kinesis Streams Developer Guide](https://docs.aws.amazon.com/streams/latest/dev/learning-kinesis-module-one-create-stream.html)
+to set up Kinesis streams.
+
+## Configuring Access to Kinesis with IAM
+Make sure to create the appropriate IAM policy to allow reading / writing to / from the Kinesis streams. See examples [here](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html).
+
+Depending on your deployment, you would choose a different Credentials Provider to allow access to Kinesis.
+By default, the `AUTO` Credentials Provider is used.
+If the access key ID and secret key are set in the configuration, the `BASIC` provider is used.
+
+A specific Credentials Provider can **optionally** be set by using the `AWSConfigConstants.AWS_CREDENTIALS_PROVIDER` setting.
+
+Supported Credential Providers are:
+* `AUTO` - Using the default AWS Credentials Provider chain that searches for credentials in the following order: `ENV_VARS`, `SYS_PROPS`, `WEB_IDENTITY_TOKEN`, `PROFILE` and EC2/ECS credentials provider.
+* `BASIC` - Using access key ID and secret key supplied as configuration.
+* `ENV_VAR` - Using `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` environment variables.
+* `SYS_PROP` - Using Java system properties `aws.accessKeyId` and `aws.secretKey`.
+* `PROFILE` - Use AWS credentials profile file to create the AWS credentials.
+* `ASSUME_ROLE` - Create AWS credentials by assuming a role. The credentials for assuming the role must be supplied (see the sketch after this list).
+* `WEB_IDENTITY_TOKEN` - Create AWS credentials by assuming a role using Web Identity Token.
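+
+For example, the `ASSUME_ROLE` provider from the list above could be configured roughly as follows (a hedged sketch; the role ARN is hypothetical and the role-related keys are assumed to be available in `AWSConfigConstants`):
+
+```java
+Properties config = new Properties();
+config.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+config.put(AWSConfigConstants.AWS_CREDENTIALS_PROVIDER, "ASSUME_ROLE");
+config.put(AWSConfigConstants.AWS_ROLE_ARN, "arn:aws:iam::123456789012:role/my-flink-role"); // hypothetical
+config.put(AWSConfigConstants.AWS_ROLE_SESSION_NAME, "my-flink-session");
+```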
+
+## Kinesis Consumer
+
+The `FlinkKinesisConsumer` is an exactly-once parallel streaming data source that subscribes to multiple AWS Kinesis
+streams within the same AWS service region, and can transparently handle resharding of streams while the job is running. Each subtask of the consumer is
+responsible for fetching data records from multiple Kinesis shards. The number of shards fetched by each subtask will
+change as shards are closed and created by Kinesis.
+
+Before consuming data from Kinesis streams, make sure that all streams are created with the status "ACTIVE" in the AWS dashboard.
+
+{{< tabs "58b6c235-48ee-4cf7-aabc-41e0679a3370" >}}
+{{< tab "Java" >}}
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id");
+consumerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST");
+
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+DataStream<String> kinesis = env.addSource(new FlinkKinesisConsumer<>(
+ "kinesis_stream_name", new SimpleStringSchema(), consumerConfig));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val consumerConfig = new Properties()
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+consumerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id")
+consumerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key")
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST")
+
+val env = StreamExecutionEnvironment.getExecutionEnvironment
+
+val kinesis = env.addSource(new FlinkKinesisConsumer[String](
+ "kinesis_stream_name", new SimpleStringSchema, consumerConfig))
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The above is a simple example of using the consumer. Configuration for the consumer is supplied with a `java.util.Properties`
+instance, the configuration keys for which can be found in `AWSConfigConstants` (AWS-specific parameters) and
+`ConsumerConfigConstants` (Kinesis consumer parameters). The example
+demonstrates consuming a single Kinesis stream in the AWS region "us-east-1". The AWS credentials are supplied using the basic method in which
+the AWS access key ID and secret access key are directly supplied in the configuration. Also, data is being consumed
+from the newest position in the Kinesis stream (the other option will be setting `ConsumerConfigConstants.STREAM_INITIAL_POSITION`
+to `TRIM_HORIZON`, which lets the consumer start reading the Kinesis stream from the earliest record possible).
+
+Other optional configuration keys for the consumer can be found in `ConsumerConfigConstants`.
+
+Note that the configured parallelism of the Flink Kinesis Consumer source
+can be completely independent of the total number of shards in the Kinesis streams.
+When the number of shards is larger than the parallelism of the consumer,
+then each consumer subtask can subscribe to multiple shards; otherwise
+if the number of shards is smaller than the parallelism of the consumer,
+then some consumer subtasks will simply be idle and wait until they are assigned
+new shards (i.e., when the streams are resharded to increase the
+number of shards for higher provisioned Kinesis service throughput).
+
+Also note that the assignment of shards to subtasks may not be optimal when
+shard IDs are not consecutive (as a result of dynamic re-sharding in Kinesis).
+For cases where skew in the assignment leads to significant imbalanced consumption,
+a custom implementation of `KinesisShardAssigner` can be set on the consumer.
+
+### The `DeserializationSchema`
+
+Flink Kinesis Consumer also needs a schema to know how to turn the binary data in a Kinesis Data Stream into Java objects.
+The `KinesisDeserializationSchema` allows users to specify such a schema. The `T deserialize(byte[] recordValue, String partitionKey, String seqNum, long approxArrivalTimestamp, String stream, String shardId)`
+method gets called for each Kinesis record.
+
+For convenience, Flink provides the following schemas out of the box:
+
+1. `TypeInformationSerializationSchema` which creates a schema based on a Flink's `TypeInformation`.
+ This is useful if the data is both written and read by Flink.
+ This schema is a performant Flink-specific alternative to other generic serialization approaches.
+
+2. `AvroDeserializationSchema` which reads data serialized with Avro format using a statically provided schema. It can
+ infer the schema from Avro generated classes (`AvroDeserializationSchema.forSpecific(...)`) or it can work with `GenericRecords`
+ with a manually provided schema (with `AvroDeserializationSchema.forGeneric(...)`). This deserialization schema expects that
+ the serialized records DO NOT contain an embedded schema.
+
+ - You can use [AWS Glue Schema Registry](https://docs.aws.amazon.com/glue/latest/dg/schema-registry.html)
+ to retrieve the writer’s schema. Similarly, the deserialization record will be read with the schema from AWS Glue Schema Registry and transformed
+ (either through `GlueSchemaRegistryAvroDeserializationSchema.forGeneric(...)` or `GlueSchemaRegistryAvroDeserializationSchema.forSpecific(...)`).
+ For more information on integrating the AWS Glue Schema Registry with Apache Flink see
+ [Use Case: Amazon Kinesis Data Analytics for Apache Flink](https://docs.aws.amazon.com/glue/latest/dg/schema-registry-integrations.html#schema-registry-integrations-kinesis-data-analytics-apache-flink).
+
+ To use this deserialization schema one has to add the following additional dependency (a usage sketch follows the tabs below):
+
+{{< tabs "8c6721c7-4a48-496e-b0fe-6522cf6a5e13" >}}
+{{< tab "AvroDeserializationSchema" >}}
+{{< artifact flink-avro >}}
+{{< /tab >}}
+{{< tab "GlueSchemaRegistryAvroDeserializationSchema" >}}
+{{< artifact flink-avro-glue-schema-registry >}}
+{{< /tab >}}
+{{< /tabs >}}
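+
+As a usage sketch, one of the schemas listed above can then be handed to the consumer (here `AvroDeserializationSchema` with a hypothetical Avro-generated `User` class, reusing the `consumerConfig` from the earlier example):
+
+```java
+FlinkKinesisConsumer<User> consumer = new FlinkKinesisConsumer<>(
+    "kinesis_stream_name",
+    AvroDeserializationSchema.forSpecific(User.class),
+    consumerConfig);
+
+DataStream<User> users = env.addSource(consumer);
+```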
+
+### Configuring Starting Position
+
+The Flink Kinesis Consumer currently provides the following options to configure where to start reading Kinesis streams, simply by setting `ConsumerConfigConstants.STREAM_INITIAL_POSITION` to
+one of the following values in the provided configuration properties (the naming of the options identically follows [the namings used by the AWS Kinesis Streams service](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html#API_GetShardIterator_RequestSyntax)):
+
+- `LATEST`: read all shards of all streams starting from the latest record.
+- `TRIM_HORIZON`: read all shards of all streams starting from the earliest record possible (data may be trimmed by Kinesis depending on the retention settings).
+- `AT_TIMESTAMP`: read all shards of all streams starting from a specified timestamp (see the sketch after this list). The timestamp must also be specified in the configuration
+properties by providing a value for `ConsumerConfigConstants.STREAM_INITIAL_TIMESTAMP`, in one of the following date patterns:
+ - a non-negative double value representing the number of seconds that have elapsed since the Unix epoch (for example, `1459799926.480`).
+ - a user-defined pattern, which is a valid pattern for `SimpleDateFormat`, provided via `ConsumerConfigConstants.STREAM_TIMESTAMP_DATE_FORMAT`.
+ If `ConsumerConfigConstants.STREAM_TIMESTAMP_DATE_FORMAT` is not defined, the default pattern will be `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`
+ (for example, the timestamp value is `2016-04-04` and the pattern is `yyyy-MM-dd`, or the timestamp value is `2016-04-04T19:58:46.480-00:00` without a given pattern).
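+
+As a small sketch of the `AT_TIMESTAMP` case (property names as in the consumer example above; the timestamp value is illustrative):
+
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "AT_TIMESTAMP");
+// interpreted with the default pattern yyyy-MM-dd'T'HH:mm:ss.SSSXXX
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_TIMESTAMP, "2016-04-04T19:58:46.480-00:00");
+```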
+
+### Fault Tolerance for Exactly-Once User-Defined State Update Semantics
+
+With Flink's checkpointing enabled, the Flink Kinesis Consumer will consume records from shards in Kinesis streams and
+periodically checkpoint each shard's progress. In case of a job failure, Flink will restore the streaming program to the
+state of the latest complete checkpoint and re-consume the records from Kinesis shards, starting from the progress that
+was stored in the checkpoint.
+
+The interval of drawing checkpoints therefore defines how much the program may have to go back at most, in case of a failure.
+
+To use fault tolerant Kinesis Consumers, checkpointing of the topology needs to be enabled at the execution environment:
+
+{{< tabs "b1399ed7-5855-446d-9684-7a49de9b4c97" >}}
+{{< tab "Java" >}}
+```java
+final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+env.enableCheckpointing(5000); // checkpoint every 5000 msecs
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+env.enableCheckpointing(5000) // checkpoint every 5000 msecs
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Also note that Flink can only restart the topology if enough processing slots are available.
+Therefore, if the topology fails due to loss of a TaskManager, there must still be enough slots available afterwards.
+Flink on YARN supports automatic restart of lost YARN containers.
+
+### Using Enhanced Fan-Out
+
+[Enhanced Fan-Out (EFO)](https://aws.amazon.com/blogs/aws/kds-enhanced-fanout/) increases the maximum
+number of concurrent consumers per Kinesis stream.
+Without EFO, all concurrent consumers share a single read quota per shard.
+Using EFO, each consumer gets a distinct dedicated read quota per shard, allowing read throughput to scale with the number of consumers.
+Using EFO will [incur additional cost](https://aws.amazon.com/kinesis/data-streams/pricing/).
+
+In order to enable EFO two additional configuration parameters are required:
+
+- `RECORD_PUBLISHER_TYPE`: Determines whether to use `EFO` or `POLLING`. The default `RecordPublisher` is `POLLING`.
+- `EFO_CONSUMER_NAME`: A name to identify the consumer.
+For a given Kinesis data stream, each consumer must have a unique name.
+However, consumer names do not have to be unique across data streams.
+Reusing a consumer name will result in existing subscriptions being terminated.
+
+The code snippet below shows a simple example of configuring an EFO consumer.
+
+{{< tabs "42345893-70c3-4678-a348-4c419b337eb1" >}}
+{{< tab "Java" >}}
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST");
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+DataStream<String> kinesis = env.addSource(new FlinkKinesisConsumer<>(
+ "kinesis_stream_name", new SimpleStringSchema(), consumerConfig));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val consumerConfig = new Properties()
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST")
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+
+val kinesis = env.addSource(new FlinkKinesisConsumer[String](
+ "kinesis_stream_name", new SimpleStringSchema, consumerConfig))
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+#### EFO Stream Consumer Registration/Deregistration
+
+In order to use EFO, a stream consumer must be registered against each stream you wish to consume.
+By default, the `FlinkKinesisConsumer` will register the stream consumer automatically when the Flink job starts.
+The stream consumer will be registered using the name provided by the `EFO_CONSUMER_NAME` configuration.
+`FlinkKinesisConsumer` provides three registration strategies:
+
+- Registration
+ - `LAZY` (default): Stream consumers are registered when the Flink job starts running.
+ If the stream consumer already exists, it will be reused.
+ This is the preferred strategy for the majority of applications.
+ However, jobs with parallelism greater than 1 will result in tasks competing to register and acquire the stream consumer ARN.
+ For jobs with very large parallelism this can result in an increased start-up time.
+ The describe operation has a limit of 20 [transactions per second](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html),
+ which means application startup time will increase by roughly `parallelism/20` seconds.
+ - `EAGER`: Stream consumers are registered in the `FlinkKinesisConsumer` constructor.
+ If the stream consumer already exists, it will be reused.
+ This will result in registration occurring when the job is constructed,
+ either on the Flink Job Manager or client environment submitting the job.
+ Using this strategy results in a single thread registering and retrieving the stream consumer ARN,
+ reducing startup time over `LAZY` (with large parallelism).
+ However, consider that the client environment will require access to the AWS services.
+ - `NONE`: Stream consumer registration is not performed by `FlinkKinesisConsumer`.
+ Registration must be performed externally using the [AWS CLI or SDK](https://aws.amazon.com/tools/)
+ to invoke [RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html).
+ Stream consumer ARNs should be provided to the job via the consumer configuration.
+- Deregistration
+ - `LAZY|EAGER` (default): Stream consumers are deregistered when the job is shutdown gracefully.
+ In the event that a job terminates without executing the shutdown hooks, stream consumers will remain active.
+ In this situation the stream consumers will be gracefully reused when the application restarts.
+ - `NONE`: Stream consumer deregistration is not performed by `FlinkKinesisConsumer`.
+
+Below is an example configuration to use the `EAGER` registration strategy:
+
+{{< tabs "a85d716b-6c1c-46d8-9ee4-12d8380a0c06" >}}
+{{< tab "Java" >}}
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST");
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE,
+ ConsumerConfigConstants.EFORegistrationType.EAGER.name());
+
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+DataStream<String> kinesis = env.addSource(new FlinkKinesisConsumer<>(
+ "kinesis_stream_name", new SimpleStringSchema(), consumerConfig));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val consumerConfig = new Properties()
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST")
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE,
+ ConsumerConfigConstants.EFORegistrationType.EAGER.name());
+
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+
+val kinesis = env.addSource(new FlinkKinesisConsumer[String](
+ "kinesis_stream_name", new SimpleStringSchema, consumerConfig))
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Below is an example configuration to use the `NONE` registration strategy:
+
+{{< tabs "00b46c87-7740-4263-8040-2aa7e2960513" >}}
+{{< tab "Java" >}}
+```java
+Properties consumerConfig = new Properties();
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST");
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE,
+ ConsumerConfigConstants.EFORegistrationType.NONE.name());
+consumerConfig.put(ConsumerConfigConstants.efoConsumerArn("stream-name"),
+ "arn:aws:kinesis::>:stream//consumer/:");
+
+StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
+
+DataStream<String> kinesis = env.addSource(new FlinkKinesisConsumer<>(
+ "kinesis_stream_name", new SimpleStringSchema(), consumerConfig));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val consumerConfig = new Properties()
+consumerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+consumerConfig.put(ConsumerConfigConstants.STREAM_INITIAL_POSITION, "LATEST")
+
+consumerConfig.put(ConsumerConfigConstants.RECORD_PUBLISHER_TYPE,
+ ConsumerConfigConstants.RecordPublisherType.EFO.name());
+consumerConfig.put(ConsumerConfigConstants.EFO_CONSUMER_NAME, "my-flink-efo-consumer");
+
+consumerConfig.put(ConsumerConfigConstants.EFO_REGISTRATION_TYPE,
+ ConsumerConfigConstants.EFORegistrationType.NONE.name());
+consumerConfig.put(ConsumerConfigConstants.efoConsumerArn("stream-name"),
+ "arn:aws:kinesis::>:stream//consumer/:");
+
+val env = StreamExecutionEnvironment.getExecutionEnvironment()
+
+val kinesis = env.addSource(new FlinkKinesisConsumer[String](
+ "kinesis_stream_name", new SimpleStringSchema, consumerConfig))
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Event Time for Consumed Records
+
+If streaming topologies choose to use the [event time notion]({{< ref "docs/concepts/time" >}}) for record
+timestamps, an *approximate arrival timestamp* is used by default. This timestamp is attached to records by Kinesis once they
+have been successfully received and stored by the stream. Note that this timestamp is typically referred to as a Kinesis server-side
+timestamp, and there are no guarantees about the accuracy or order correctness (i.e., the timestamps may not always be
+ascending).
+
+Users can choose to override this default with a custom timestamp, as described [here]({{< ref "docs/dev/datastream/event-time/generating_watermarks" >}}),
+or use one from the [predefined ones]({{< ref "docs/dev/datastream/event-time/built_in" >}}). After doing so,
+it can be passed to the consumer in the following way:
+
+{{< tabs "8fbaf5cb-3b76-4c62-a74e-db51b60f6600" >}}
+{{< tab "Java" >}}
+```java
+FlinkKinesisConsumer<String> consumer = new FlinkKinesisConsumer<>(
+    "kinesis_stream_name",
+    new SimpleStringSchema(),
+    kinesisConsumerConfig);
+consumer.setPeriodicWatermarkAssigner(new CustomAssignerWithPeriodicWatermarks());
+DataStream<String> stream = env.addSource(consumer);
+stream.print();
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val consumer = new FlinkKinesisConsumer[String](
+  "kinesis_stream_name",
+  new SimpleStringSchema(),
+  kinesisConsumerConfig)
+consumer.setPeriodicWatermarkAssigner(new CustomAssignerWithPeriodicWatermarks())
+val stream = env.addSource(consumer)
+stream.print()
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Internally, an instance of the assigner is executed per shard / consumer thread (see threading model below).
+When an assigner is specified, for each record read from Kinesis, `extractTimestamp(T element, long previousElementTimestamp)`
+is called to assign a timestamp to the record and `getCurrentWatermark()` is called to determine the new watermark for the shard.
+The watermark of the consumer subtask is then determined as the minimum watermark of all its shards and emitted periodically.
+The per-shard watermark is essential to deal with varying consumption speeds between shards, which otherwise could lead
+to issues with downstream logic that relies on the watermark, such as incorrect late data dropping.
+
+By default, the watermark is going to stall if shards do not deliver new records.
+The property `ConsumerConfigConstants.SHARD_IDLE_INTERVAL_MILLIS` can be used to avoid this potential issue through a
+timeout that will allow the watermark to progress despite idle shards.
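+
+For instance, the idle-shard timeout mentioned above could be set like this (a sketch; the value is in milliseconds and purely illustrative):
+
+```java
+// let watermarks advance after a shard has been idle for 10 seconds
+consumerConfig.put(ConsumerConfigConstants.SHARD_IDLE_INTERVAL_MILLIS, "10000");
+```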
+
+### Event Time Alignment for Shard Consumers
+
+The Flink Kinesis Consumer optionally supports synchronization between parallel consumer subtasks (and their threads)
+to avoid the event time skew related problems described in [Event time synchronization across sources](https://issues.apache.org/jira/browse/FLINK-10886).
+
+To enable synchronization, set the watermark tracker on the consumer:
+
+
+```java
+JobManagerWatermarkTracker watermarkTracker =
+ new JobManagerWatermarkTracker("myKinesisSource");
+consumer.setWatermarkTracker(watermarkTracker);
+```
+
+
+The `JobManagerWatermarkTracker` will use a global aggregate to synchronize the per subtask watermarks. Each subtask
+uses a per shard queue to control the rate at which records are emitted downstream based on how far ahead of the global
+watermark the next record in the queue is.
+
+The "emit ahead" limit is configured via `ConsumerConfigConstants.WATERMARK_LOOKAHEAD_MILLIS`. Smaller values reduce
+the skew but also the throughput. Larger values will allow the subtask to proceed further before waiting for the global
+watermark to advance.
+
+Another variable in the throughput equation is how frequently the watermark is propagated by the tracker.
+The interval can be configured via `ConsumerConfigConstants.WATERMARK_SYNC_MILLIS`.
+Smaller values reduce emitter waits and come at the cost of increased communication with the job manager.
+
+Since records accumulate in the queues when skew occurs, increased memory consumption is to be expected.
+How much depends on the average record size. With larger sizes, it may be necessary to adjust the emitter queue capacity via
+`ConsumerConfigConstants.WATERMARK_SYNC_QUEUE_CAPACITY`.
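+
+Putting these knobs together, a configuration sketch might look as follows (added to the consumer's `Properties`, here
+called `consumerConfig`; all values are illustrative, not recommended defaults):
+
+```java
+// let each subtask emit records at most 5 s ahead of the global watermark
+consumerConfig.put(ConsumerConfigConstants.WATERMARK_LOOKAHEAD_MILLIS, "5000");
+// exchange the global watermark with the job manager every 10 s
+consumerConfig.put(ConsumerConfigConstants.WATERMARK_SYNC_MILLIS, "10000");
+// bound each per-shard emitter queue to 200 records
+consumerConfig.put(ConsumerConfigConstants.WATERMARK_SYNC_QUEUE_CAPACITY, "200");
+```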
+
+### Threading Model
+
+The Flink Kinesis Consumer uses multiple threads for shard discovery and data consumption.
+
+#### Shard Discovery
+
+For shard discovery, each parallel consumer subtask has a single thread that constantly queries Kinesis for shard
+information, even if the subtask initially did not have any shards to read from when the consumer was started. In other words, if
+the consumer is run with a parallelism of 10, there will be a total of 10 threads constantly querying Kinesis, regardless
+of the total number of shards in the subscribed streams.
+
+#### Polling (default) Record Publisher
+
+For `POLLING` data consumption, a single thread is created to consume each discovered shard. A thread terminates when the
+shard it is responsible for is closed as a result of stream resharding. In other words, there will always be
+one thread per open shard.
+
+#### Enhanced Fan-Out Record Publisher
+
+For `EFO` data consumption the threading model is the same as for `POLLING`, with additional thread pools to handle
+asynchronous communication with Kinesis. The AWS SDK v2.x `KinesisAsyncClient` uses additional threads for
+Netty to handle IO and asynchronous responses. Each parallel consumer subtask has its own instance of the `KinesisAsyncClient`.
+In other words, if the consumer is run with a parallelism of 10, there will be a total of 10 `KinesisAsyncClient` instances.
+A separate client is created and subsequently destroyed when registering and deregistering stream consumers.
+
+### Internally Used Kinesis APIs
+
+The Flink Kinesis Consumer uses the [AWS Java SDK](http://aws.amazon.com/sdk-for-java/) internally to call Kinesis APIs
+for shard discovery and data consumption. Due to Amazon's [service limits for Kinesis Streams](http://docs.aws.amazon.com/streams/latest/dev/service-sizes-and-limits.html)
+on these APIs, the consumer competes with other non-Flink consuming applications that the user may be running.
+Below is a list of APIs called by the consumer, with a description of how the consumer uses each API, as well as information
+on how to deal with any errors or warnings that the Flink Kinesis Consumer may raise due to these service limits.
+
+#### Shard Discovery
+
+- *[ListShards](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_ListShards.html)*: this is constantly called
+by a single thread in each parallel consumer subtask to discover any new shards resulting from stream resharding. By default,
+the consumer performs shard discovery at an interval of 10 seconds and retries indefinitely until it gets a result
+from Kinesis. If this interferes with other non-Flink consuming applications, users can slow down the rate at which the consumer
+calls this API by setting a value for `ConsumerConfigConstants.SHARD_DISCOVERY_INTERVAL_MILLIS` in the supplied
+configuration properties, as sketched below. Note that this setting directly impacts
+the maximum delay between a new shard being created and the consumer starting to read from it, as shards will not be discovered during the interval.
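+
+A minimal sketch of lowering the shard discovery frequency (added to the consumer's `Properties`; the 60 second interval
+is an illustrative value):
+
+```java
+// query ListShards once per minute instead of the default 10 seconds
+consumerConfig.put(ConsumerConfigConstants.SHARD_DISCOVERY_INTERVAL_MILLIS, "60000");
+```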
+
+#### Polling (default) Record Publisher
+
+- *[GetShardIterator](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html)*: this is called
+only once when per-shard consuming threads are started, and will be retried if Kinesis complains that the transaction limit for the
+API has been exceeded, up to a default of 3 attempts. Note that since the rate limit for this API is per shard (not per stream),
+the consumer itself should not exceed the limit. Usually, if this happens, users can either try to slow down any other
+non-Flink consuming applications that call this API, or modify the retry behaviour of this API call in the consumer by
+setting keys prefixed by `ConsumerConfigConstants.SHARD_GETITERATOR_*` in the supplied configuration properties.
+
+- *[GetRecords](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetRecords.html)*: this is constantly called
+by per-shard consuming threads to fetch records from Kinesis. When a shard has multiple concurrent consumers (i.e., when other
+non-Flink consuming applications are running), the per-shard rate limit may be exceeded. By default, on each call
+of this API, the consumer retries if Kinesis complains that the data size / transaction limit for the API has been exceeded,
+up to a default of 3 attempts. Users can either try to slow down other non-Flink consuming applications, or adjust the throughput
+of the consumer by setting the `ConsumerConfigConstants.SHARD_GETRECORDS_MAX` and
+`ConsumerConfigConstants.SHARD_GETRECORDS_INTERVAL_MILLIS` keys in the supplied configuration properties, as sketched below. Setting the former
+adjusts the maximum number of records each consuming thread tries to fetch from shards on each call (default is 10,000), while
+the latter modifies the sleep interval between fetches (default is 200 ms). The retry behaviour of the
+consumer when calling this API can also be modified by using the other keys prefixed by `ConsumerConfigConstants.SHARD_GETRECORDS_*`.
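+
+A configuration sketch that trades some throughput for a lower request rate (both values are illustrative):
+
+```java
+// fetch at most 2,000 records per GetRecords call instead of the default 10,000
+consumerConfig.put(ConsumerConfigConstants.SHARD_GETRECORDS_MAX, "2000");
+// sleep 500 ms between GetRecords calls instead of the default 200 ms
+consumerConfig.put(ConsumerConfigConstants.SHARD_GETRECORDS_INTERVAL_MILLIS, "500");
+```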
+
+#### Enhanced Fan-Out Record Publisher
+
+- *[SubscribeToShard](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_SubscribeToShard.html)*: this is called
+by per-shard consuming threads to obtain shard subscriptions. A shard subscription is typically active for 5 minutes,
+but subscriptions will be reacquired if any recoverable errors are thrown. Once a subscription is acquired, the consumer
+receives a stream of [SubscribeToShardEvent](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_SubscribeToShardEvent.html)s.
+Retry and backoff parameters can be configured using the `ConsumerConfigConstants.SUBSCRIBE_TO_SHARD_*` keys.
+
+- *[DescribeStreamSummary](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamSummary.html)*: this is called
+once per stream, during stream consumer registration. By default, the `LAZY` registration strategy will scale the
+number of calls by the job parallelism. `EAGER` will invoke this once per stream and `NONE` will not invoke this API.
+Retry and backoff parameters can be configured using the
+`ConsumerConfigConstants.STREAM_DESCRIBE_*` keys.
+
+- *[DescribeStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html)*:
+this is called during stream consumer registration and deregistration. For each stream this service will be invoked
+periodically until the stream consumer is reported `ACTIVE`/`not found` for registration/deregistration. By default,
+the `LAZY` registration strategy will scale the number of calls by the job parallelism. `EAGER` will call the service
+once per stream for registration, and scale the number of calls by the job parallelism for deregistration.
+`NONE` will not invoke this service. Retry and backoff parameters can be configured using the
+`ConsumerConfigConstants.DESCRIBE_STREAM_CONSUMER_*` keys.
+
+- *[RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html)*:
+this is called once per stream during stream consumer registration, unless the `NONE` registration strategy is configured.
+Retry and backoff parameters can be configured using the `ConsumerConfigConstants.REGISTER_STREAM_*` keys.
+
+- *[DeregisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DeregisterStreamConsumer.html)*:
+this is called once per stream during stream consumer deregistration, unless the `NONE` registration strategy is configured.
+Retry and backoff parameters can be configured using the `ConsumerConfigConstants.DEREGISTER_STREAM_*` keys.
+
+## Kinesis Producer
+
+The `FlinkKinesisProducer` uses [Kinesis Producer Library (KPL)](http://docs.aws.amazon.com/streams/latest/dev/developing-producers-with-kpl.html) to put data from a Flink stream into a Kinesis stream.
+
+Note that the producer does not participate in Flink's checkpointing and doesn't provide exactly-once processing guarantees. Also, the Kinesis producer does not guarantee that records are written in order to the shards (see [here](https://github.com/awslabs/amazon-kinesis-producer/issues/23) and [here](http://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#API_PutRecord_RequestSyntax) for more details).
+
+In case of a failure or a resharding, data will be written again to Kinesis, leading to duplicates. This behavior is usually called "at-least-once" semantics.
+
+To put data into a Kinesis stream, make sure the stream is marked as "ACTIVE" in the AWS dashboard.
+
+For the monitoring to work, the user accessing the stream needs access to the CloudWatch service.
+
+{{< tabs "6df3b696-c2ca-4f44-bea0-96cf8275d61c" >}}
+{{< tab "Java" >}}
+```java
+Properties producerConfig = new Properties();
+// Required configs
+producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id");
+producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key");
+// Optional configs
+producerConfig.put("AggregationMaxCount", "4294967295");
+producerConfig.put("CollectionMaxCount", "1000");
+producerConfig.put("RecordTtl", "30000");
+producerConfig.put("RequestTimeout", "6000");
+producerConfig.put("ThreadPoolSize", "15");
+
+// Disable Aggregation if it's not supported by a consumer
+// producerConfig.put("AggregationEnabled", "false");
+// Switch KinesisProducer's threading model
+// producerConfig.put("ThreadingModel", "PER_REQUEST");
+
+FlinkKinesisProducer<String> kinesis = new FlinkKinesisProducer<>(new SimpleStringSchema(), producerConfig);
+kinesis.setFailOnError(true);
+kinesis.setDefaultStream("kinesis_stream_name");
+kinesis.setDefaultPartition("0");
+
+DataStream<String> simpleStringStream = ...;
+simpleStringStream.addSink(kinesis);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val producerConfig = new Properties()
+// Required configs
+producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id")
+producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key")
+// Optional KPL configs
+producerConfig.put("AggregationMaxCount", "4294967295")
+producerConfig.put("CollectionMaxCount", "1000")
+producerConfig.put("RecordTtl", "30000")
+producerConfig.put("RequestTimeout", "6000")
+producerConfig.put("ThreadPoolSize", "15")
+
+// Disable Aggregation if it's not supported by a consumer
+// producerConfig.put("AggregationEnabled", "false")
+// Switch KinesisProducer's threading model
+// producerConfig.put("ThreadingModel", "PER_REQUEST")
+
+val kinesis = new FlinkKinesisProducer[String](new SimpleStringSchema, producerConfig)
+kinesis.setFailOnError(true)
+kinesis.setDefaultStream("kinesis_stream_name")
+kinesis.setDefaultPartition("0")
+
+val simpleStringStream = ...
+simpleStringStream.addSink(kinesis)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The above is a simple example of using the producer. To initialize `FlinkKinesisProducer`, users are required to pass in `AWS_REGION`, `AWS_ACCESS_KEY_ID`, and `AWS_SECRET_ACCESS_KEY` via a `java.util.Properties` instance. Users can also pass in KPL configuration options as optional parameters to customize the KPL instance underlying the `FlinkKinesisProducer`. The full list of KPL configs and explanations can be found [here](https://github.com/awslabs/amazon-kinesis-producer/blob/master/java/amazon-kinesis-producer-sample/default_config.properties). The example demonstrates producing to a single Kinesis stream in the AWS region "us-east-1".
+
+If users don't specify any KPL configs and values, `FlinkKinesisProducer` will use KPL's default config values, except for `RateLimit`. `RateLimit` limits the maximum allowed put rate for a shard, as a percentage of the backend limits. KPL's default value is 150, but that makes KPL throw `RateLimitExceededException` too frequently and breaks the Flink sink as a result. Thus `FlinkKinesisProducer` overrides KPL's default value to 100.
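+
+If needed, the rate limit can still be set explicitly like any other KPL config; the value below is illustrative, for
+instance when the shards are shared with other producers:
+
+```java
+// explicitly cap the per-shard put rate at 80% of the backend limit (illustrative value)
+producerConfig.put("RateLimit", "80");
+```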
+
+Instead of a `SerializationSchema`, the producer also supports a `KinesisSerializationSchema`. The `KinesisSerializationSchema` allows sending the data to multiple streams. This is
+done using the `KinesisSerializationSchema.getTargetStream(T element)` method. Returning `null` there instructs the producer to write the element to the default stream;
+otherwise, the returned stream name is used. A sketch of such a schema is shown below.
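+
+A minimal sketch of a `KinesisSerializationSchema` that routes records to different streams; the stream names and the
+prefix-based routing rule are made up for illustration:
+
+```java
+KinesisSerializationSchema<String> routingSchema = new KinesisSerializationSchema<String>() {
+    @Override
+    public ByteBuffer serialize(String element) {
+        // serialize the element as UTF-8 bytes
+        return ByteBuffer.wrap(element.getBytes(StandardCharsets.UTF_8));
+    }
+
+    @Override
+    public String getTargetStream(String element) {
+        // send "audit:" records to a dedicated stream, everything else to the default stream
+        return element.startsWith("audit:") ? "kinesis_audit_stream" : null;
+    }
+};
+
+FlinkKinesisProducer<String> routingProducer = new FlinkKinesisProducer<>(routingSchema, producerConfig);
+```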
+
+### Threading Model
+
+Since Flink 1.4.0, `FlinkKinesisProducer` switches its default underlying KPL from a one-thread-per-request mode to a thread-pool mode. KPL in thread-pool mode uses a queue and thread pool to execute requests to Kinesis. This limits the number of threads that KPL's native process may create, and therefore greatly lowers CPU utilization and improves efficiency. **Thus, we highly recommend Flink users use the thread-pool model.** The default thread pool size is `10`. Users can set the pool size in the `java.util.Properties` instance with the key `ThreadPoolSize`, as shown in the above example.
+
+Users can still switch back to the one-thread-per-request mode by setting a key-value pair of `ThreadingModel` and `PER_REQUEST` in `java.util.Properties`, as shown in the code commented out in the above example.
+
+### Backpressure
+
+By default, `FlinkKinesisProducer` does not apply backpressure. Instead, records that
+cannot be sent because of the rate restriction of 1 MB per second per shard are
+buffered in an unbounded queue and dropped when their `RecordTtl` expires.
+
+To avoid data loss, you can enable backpressure by restricting the size of the
+internal queue:
+
+```java
+// 200 bytes per record, 1 shard
+kinesis.setQueueLimit(500);
+```
+
+The value for `queueLimit` depends on the expected record size. To choose a good
+value, consider that Kinesis is rate-limited to 1MB per second per shard. If
+less than one second's worth of records is buffered, then the queue may not be
+able to operate at full capacity. With the default `RecordMaxBufferedTime` of
+100ms, a queue size of 100kB per shard should be sufficient. The `queueLimit`
+can then be computed via
+
+```
+queue limit = (number of shards * queue size per shard) / record size
+```
+
+For example, for 200 bytes per record and 8 shards, a queue limit of 4000 is a good
+starting point. If the queue size limits throughput (below 1 MB per second per
+shard), try increasing the queue limit slightly.
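+
+The same computation expressed in code, using the figures from the paragraphs above (8 shards, roughly 100 kB of queue
+per shard, 200 bytes per record; all of these are illustrative assumptions):
+
+```java
+int numberOfShards = 8;
+int queueSizePerShardBytes = 100 * 1024; // ~100 kB per shard with the default RecordMaxBufferedTime of 100 ms
+int averageRecordSizeBytes = 200;
+
+// queue limit = (number of shards * queue size per shard) / record size
+int queueLimit = numberOfShards * queueSizePerShardBytes / averageRecordSizeBytes;
+kinesis.setQueueLimit(queueLimit); // ~4096 records
+```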
+
+
+## Using Custom Kinesis Endpoints
+
+It is sometimes desirable to have Flink operate as a consumer or producer against a Kinesis VPC endpoint or a non-AWS
+Kinesis endpoint such as [Kinesalite](https://github.com/mhart/kinesalite); this is especially useful when performing
+functional testing of a Flink application. The AWS endpoint that would normally be inferred from the AWS region set in the
+Flink configuration must be overridden via a configuration property.
+
+To override the AWS endpoint, set the `AWSConfigConstants.AWS_ENDPOINT` and `AWSConfigConstants.AWS_REGION` properties. The region will be used to sign the endpoint URL.
+
+{{< tabs "bcadd466-8416-4d3c-a6a7-c46eee0cbd4a" >}}
+{{< tab "Java" >}}
+```java
+Properties producerConfig = new Properties();
+producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1");
+producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id");
+producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key");
+producerConfig.put(AWSConfigConstants.AWS_ENDPOINT, "http://localhost:4567");
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val producerConfig = new Properties()
+producerConfig.put(AWSConfigConstants.AWS_REGION, "us-east-1")
+producerConfig.put(AWSConfigConstants.AWS_ACCESS_KEY_ID, "aws_access_key_id")
+producerConfig.put(AWSConfigConstants.AWS_SECRET_ACCESS_KEY, "aws_secret_access_key")
+producerConfig.put(AWSConfigConstants.AWS_ENDPOINT, "http://localhost:4567")
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/nifi.md b/docs/content.zh/docs/connectors/datastream/nifi.md
new file mode 100644
index 0000000000000..ffcbbfc715f40
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/nifi.md
@@ -0,0 +1,128 @@
+---
+title: NiFi
+weight: 8
+type: docs
+aliases:
+ - /zh/dev/connectors/nifi.html
+---
+
+
+# Apache NiFi 连接器
+
+[Apache NiFi](https://nifi.apache.org/) 连接器提供了可以读取和写入的 Source 和 Sink。
+使用这个连接器,需要在工程中添加下面的依赖:
+
+{{< artifact flink-connector-nifi withScalaVersion >}}
+
+注意这些连接器目前还没有包含在二进制发行版中。添加依赖、打包配置以及集群运行的相关信息请参考 [这里]({{< ref "docs/dev/datastream/project-configuration" >}})。
+
+#### 安装 Apache NiFi
+
+安装 Apache NiFi 集群请参考 [这里](https://nifi.apache.org/docs/nifi-docs/html/administration-guide.html#how-to-install-and-start-nifi)。
+
+#### Apache NiFi Source
+
+该连接器提供了一个 Source 可以用来从 Apache NiFi 读取数据到 Apache Flink。
+
+`NiFiSource(…)` 类有两个构造方法。
+
+- `NiFiSource(SiteToSiteClientConfig config)` - 构造一个 `NiFiSource(…)`,需要指定参数 SiteToSiteClientConfig,采用默认的等待时间 1000 ms。
+
+- `NiFiSource(SiteToSiteClientConfig config, long waitTimeMs)` - 构造一个 `NiFiSource(…)`,需要指定参数 SiteToSiteClientConfig 和等待时间(单位为毫秒)。
+
+示例:
+
+{{< tabs "44ccc35b-83c3-464f-9464-995d4981f4d9" >}}
+{{< tab "Java" >}}
+```java
+StreamExecutionEnvironment streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment();
+
+SiteToSiteClientConfig clientConfig = new SiteToSiteClient.Builder()
+ .url("http://localhost:8080/nifi")
+ .portName("Data for Flink")
+ .requestBatchCount(5)
+ .buildConfig();
+
+SourceFunction<NiFiDataPacket> nifiSource = new NiFiSource(clientConfig);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment()
+
+val clientConfig: SiteToSiteClientConfig = new SiteToSiteClient.Builder()
+ .url("http://localhost:8080/nifi")
+ .portName("Data for Flink")
+ .requestBatchCount(5)
+ .buildConfig()
+
+val nifiSource = new NiFiSource(clientConfig)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+数据从 Apache NiFi Output Port 读取,Apache NiFi Output Port 也被称为 "Data for Flink",是 Apache NiFi Site-to-site 协议配置的一部分。
+
+#### Apache NiFi Sink
+
+该连接器提供了一个 Sink 可以用来把 Apache Flink 的数据写入到 Apache NiFi。
+
+`NiFiSink(…)` 类只有一个构造方法。
+
+- `NiFiSink(SiteToSiteClientConfig, NiFiDataPacketBuilder)` 构造一个 `NiFiSink(…)`,需要指定 `SiteToSiteClientConfig` 和 `NiFiDataPacketBuilder` 参数,`NiFiDataPacketBuilder` 可以将 Flink 数据转化成可以被 NiFi 识别的 `NiFiDataPacket`。
+
+示例:
+
+{{< tabs "599dbd31-e2a4-4203-a428-0a4c95c8fd07" >}}
+{{< tab "Java" >}}
+```java
+StreamExecutionEnvironment streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment();
+
+SiteToSiteClientConfig clientConfig = new SiteToSiteClient.Builder()
+ .url("http://localhost:8080/nifi")
+ .portName("Data from Flink")
+ .requestBatchCount(5)
+ .buildConfig();
+
+SinkFunction nifiSink = new NiFiSink<>(clientConfig, new NiFiDataPacketBuilder() {...});
+
+streamExecEnv.addSink(nifiSink);
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+val streamExecEnv = StreamExecutionEnvironment.getExecutionEnvironment()
+
+val clientConfig: SiteToSiteClientConfig = new SiteToSiteClient.Builder()
+ .url("http://localhost:8080/nifi")
+ .portName("Data from Flink")
+ .requestBatchCount(5)
+ .buildConfig()
+
+val nifiSink: NiFiSink[NiFiDataPacket] = new NiFiSink[NiFiDataPacket](clientConfig, new NiFiDataPacketBuilder() {...})
+
+streamExecEnv.addSink(nifiSink)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+更多关于 [Apache NiFi](https://nifi.apache.org) Site-to-Site Protocol 的信息请参考 [这里](https://nifi.apache.org/docs/nifi-docs/html/user-guide.html#site-to-site)。
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/overview.md b/docs/content.zh/docs/connectors/datastream/overview.md
new file mode 100644
index 0000000000000..cc59b1d1d288f
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/overview.md
@@ -0,0 +1,79 @@
+---
+title: 概览
+weight: 1
+type: docs
+aliases:
+ - /zh/dev/connectors/
+ - /zh/apis/connectors.html
+---
+
+
+# DataStream Connectors
+
+## 预定义的 Source 和 Sink
+
+一些比较基本的 Source 和 Sink 已经内置在 Flink 里。
+[预定义 data sources]({{< ref "docs/dev/datastream/overview" >}}#data-sources) 支持从文件、目录、socket,以及 collections 和 iterators 中读取数据。
+[预定义 data sinks]({{< ref "docs/dev/datastream/overview" >}}#data-sinks) 支持把数据写入文件、标准输出(stdout)、标准错误输出(stderr)和 socket。
+
+## 附带的连接器
+
+连接器可以和多种多样的第三方系统进行交互。目前支持以下系统:
+
+ * [Apache Kafka](kafka.html) (source/sink)
+ * [Apache Cassandra](cassandra.html) (sink)
+ * [Amazon Kinesis Streams](kinesis.html) (source/sink)
+ * [Elasticsearch](elasticsearch.html) (sink)
+ * [FileSystem(包括 Hadoop ) - 仅支持流](streamfile_sink.html) (sink)
+ * [FileSystem(包括 Hadoop ) - 流批统一](file_sink.html) (sink)
+ * [RabbitMQ](rabbitmq.html) (source/sink)
+ * [Apache NiFi](nifi.html) (source/sink)
+ * [Twitter Streaming API](twitter.html) (source)
+ * [Google PubSub](pubsub.html) (source/sink)
+ * [JDBC](jdbc.html) (sink)
+
+请记住,在使用一种连接器时,通常需要额外的第三方组件,比如:数据存储服务器或者消息队列。
+要注意这些列举的连接器是 Flink 工程的一部分,包含在发布的源码中,但是不包含在二进制发行版中。
+更多说明可以参考对应的子部分。
+
+## Apache Bahir 中的连接器
+
+Flink 还有一些额外的连接器通过 [Apache Bahir](https://bahir.apache.org/) 发布,包括:
+
+ * [Apache ActiveMQ](https://bahir.apache.org/docs/flink/current/flink-streaming-activemq/) (source/sink)
+ * [Apache Flume](https://bahir.apache.org/docs/flink/current/flink-streaming-flume/) (sink)
+ * [Redis](https://bahir.apache.org/docs/flink/current/flink-streaming-redis/) (sink)
+ * [Akka](https://bahir.apache.org/docs/flink/current/flink-streaming-akka/) (sink)
+ * [Netty](https://bahir.apache.org/docs/flink/current/flink-streaming-netty/) (source)
+
+## 连接 Flink 的其他方法
+
+### 异步 I/O
+
+使用 connector 并不是唯一可以使数据进入或者流出 Flink 的方式。
+一种常见的模式是从外部数据库或者 Web 服务查询数据得到初始数据流,然后通过 `Map` 或者 `FlatMap` 对初始数据流进行丰富和增强。
+Flink 提供了[异步 I/O]({{< ref "docs/dev/datastream/operators/asyncio" >}}) API 来让这个过程更加简单、高效和稳定。
+
+### 可查询状态
+
+当 Flink 应用程序需要向外部存储推送大量数据时会导致 I/O 瓶颈问题出现。在这种场景下,如果对数据的读操作远少于写操作,那么让外部应用从 Flink 拉取所需的数据会是一种更好的方式。
+[可查询状态]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}) 接口可以实现这个功能,该接口允许被 Flink 托管的状态可以被按需查询。
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/datastream/pubsub.md b/docs/content.zh/docs/connectors/datastream/pubsub.md
new file mode 100644
index 0000000000000..f4194083aeba5
--- /dev/null
+++ b/docs/content.zh/docs/connectors/datastream/pubsub.md
@@ -0,0 +1,153 @@
+---
+title: Google Cloud PubSub
+weight: 8
+type: docs
+aliases:
+ - /zh/dev/connectors/pubsub.html
+---
+
+
+# Google Cloud PubSub
+
+这个连接器可以从 [Google Cloud PubSub](https://cloud.google.com/pubsub) 读取数据,也可以向其写入数据。添加下面的依赖来使用此连接器:
+
+{{< artifact flink-connector-pubsub withScalaVersion >}}
+
+
+| Flink SQL 类型 | JSON 类型 |
+| :------------- | :-------- |
+|  | string with format: date-time (with UTC time zone) |
+| INTERVAL | number |
+| ARRAY | array |
+| MAP / MULTISET | object |
+| ROW | object |
+
diff --git a/docs/content.zh/docs/connectors/table/formats/maxwell.md b/docs/content.zh/docs/connectors/table/formats/maxwell.md
new file mode 100644
index 0000000000000..205b1259296f2
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/formats/maxwell.md
@@ -0,0 +1,204 @@
+---
+title: Maxwell
+weight: 7
+type: docs
+aliases:
+ - /zh/dev/table/connectors/formats/maxwell.html
+---
+
+
+# Maxwell Format
+
+{{< label "Changelog-Data-Capture Format" >}}
+{{< label "Format: Serialization Schema" >}}
+{{< label "Format: Deserialization Schema" >}}
+
+[Maxwell](https://maxwells-daemon.io/) is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into Kafka, Kinesis and other streaming connectors. Maxwell provides a unified format schema for changelogs and supports serializing messages as JSON.
+
+Flink supports interpreting Maxwell JSON messages as INSERT/UPDATE/DELETE messages in the Flink SQL system. This is useful in many cases, such as
+ - synchronizing incremental data from databases to other systems
+ - auditing logs
+ - real-time materialized views on databases
+ - temporal joins against the changing history of a database table, and so on.
+
+Flink also supports encoding the INSERT/UPDATE/DELETE messages in Flink SQL as Maxwell JSON messages, and emitting them to external systems like Kafka.
+However, currently Flink can't combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, Flink encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT Maxwell messages.
+
+Dependencies
+------------
+
+{{< sql_download_table "maxwell" >}}
+
+*Note: please refer to the [Maxwell documentation](http://maxwells-daemon.io/quickstart/) on how to synchronize changelogs to Kafka topics with Maxwell JSON.*
+
+
+How to use Maxwell format
+----------------
+
+Maxwell provides a unified format for changelogs. Here is a simple example of an update operation captured from a MySQL `products` table, in JSON format:
+
+```json
+{
+ "database":"test",
+ "table":"e",
+ "type":"insert",
+ "ts":1477053217,
+ "xid":23396,
+ "commit":true,
+ "position":"master.000006:800911",
+ "server_id":23042,
+ "thread_id":108,
+ "primary_key": [1, "2016-10-21 05:33:37.523000"],
+ "primary_key_columns": ["id", "c"],
+ "data":{
+ "id":111,
+ "name":"scooter",
+ "description":"Big 2-wheel scooter",
+ "weight":5.15
+ },
+ "old":{
+ "weight":5.18,
+ }
+}
+```
+
+*Note: please refer to the [Maxwell documentation](http://maxwells-daemon.io/dataformat/) about the meaning of each field.*
+
+The MySQL `products` table has 4 columns (`id`, `name`, `description` and `weight`). The above JSON message is an update change event on the `products` table where the `weight` value of the row with `id = 111` is changed from `5.18` to `5.15`.
+Assuming these messages are synchronized to the Kafka topic `products_binlog`, we can use the following DDL to consume this topic and interpret the change events.
+
+```sql
+CREATE TABLE topic_products (
+ -- schema is exactly the same as the MySQL "products" table
+ id BIGINT,
+ name STRING,
+ description STRING,
+ weight DECIMAL(10, 2)
+) WITH (
+ 'connector' = 'kafka',
+ 'topic' = 'products_binlog',
+ 'properties.bootstrap.servers' = 'localhost:9092',
+ 'properties.group.id' = 'testGroup',
+ 'format' = 'maxwell-json'
+)
+```
+
+After registering the topic as a Flink table, you can consume the Maxwell messages as a changelog source.
+
+```sql
+-- a real-time materialized view on the MySQL "products" table
+-- which calculates the latest average weight for the same products
+SELECT name, AVG(weight) FROM topic_products GROUP BY name;
+
+-- synchronize all the data and incremental changes of MySQL "products" table to
+-- Elasticsearch "products" index for future searching
+INSERT INTO elasticsearch_products
+SELECT * FROM topic_products;
+```
+
+Format Options
+----------------
+
+| Option | Required | Default | Type | Description |
+| :----- | :------- | :------ | :--- | :---------- |
+| format | required | (none) | String | Specify what format to use, here should be 'maxwell-json'. |
+| maxwell-json.ignore-parse-errors | optional | false | Boolean | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. |
+| maxwell-json.timestamp-format.standard | optional | 'SQL' | String | Specify the input and output timestamp format. Currently supported values are 'SQL' and 'ISO-8601': option 'SQL' will parse input timestamps in "yyyy-MM-dd HH:mm:ss.s{precision}" format, e.g. '2020-12-30 12:13:14.123', and output timestamps in the same format; option 'ISO-8601' will parse input timestamps in "yyyy-MM-ddTHH:mm:ss.s{precision}" format, e.g. '2020-12-30T12:13:14.123', and output timestamps in the same format. |
+| maxwell-json.map-null-key.mode | optional | 'FAIL' | String | Specify the handling mode when serializing null keys for map data. Currently supported values are 'FAIL', 'DROP' and 'LITERAL': option 'FAIL' will throw an exception when encountering a map with a null key; option 'DROP' will drop null key entries for map data; option 'LITERAL' will replace the null key with a string literal, which is defined by the maxwell-json.map-null-key.literal option. |
+| maxwell-json.map-null-key.literal | optional | 'null' | String | Specify the string literal to replace null keys when 'maxwell-json.map-null-key.mode' is LITERAL. |
+| maxwell-json.encode.decimal-as-plain-number | optional | false | Boolean | Encode all decimals as plain numbers instead of possible scientific notation. By default, decimals may be written using scientific notation. For example, 0.000000027 is encoded as 2.7E-8 by default, and will be written as 0.000000027 if this option is set to true. |
+
+Caveats
+----------------
+
+### Duplicate change events
+
+Maxwell can be configured to deliver every change event **exactly once**, in which case Flink consumes the Maxwell-produced events without issues.
+If Maxwell is configured with **at-least-once** delivery, it may write duplicate change events to Kafka, and Flink will receive these duplicates.
+This may cause Flink queries to produce wrong results or unexpected exceptions. Thus, it is recommended to set the job configuration [`table.exec.source.cdc-events-duplicate`]({{< ref "docs/dev/table/config" >}}#table-exec-source-cdc-events-duplicate) to `true` and define a PRIMARY KEY on the source in this situation.
+The framework will then generate an additional stateful operator that uses the primary key to deduplicate the change events and produce a normalized changelog stream.
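+
+A sketch of this setup in the SQL client (the `SET` syntax follows the style used elsewhere in these docs; the table
+definition repeats the earlier example with an added primary key):
+
+```sql
+-- deduplicate possibly repeated Maxwell events using the primary key
+SET table.exec.source.cdc-events-duplicate=true;
+
+CREATE TABLE topic_products (
+  id BIGINT,
+  name STRING,
+  description STRING,
+  weight DECIMAL(10, 2),
+  PRIMARY KEY (id) NOT ENFORCED
+) WITH (
+  'connector' = 'kafka',
+  'topic' = 'products_binlog',
+  'properties.bootstrap.servers' = 'localhost:9092',
+  'properties.group.id' = 'testGroup',
+  'format' = 'maxwell-json'
+);
+```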
+
+Data Type Mapping
+----------------
+
+Currently, the Maxwell format uses JSON for serialization and deserialization. Please refer to [JSON Format documentation]({{< ref "docs/connectors/table/formats/json" >}}#data-type-mapping) for more details about the data type mapping.
diff --git a/docs/content.zh/docs/connectors/table/formats/orc.md b/docs/content.zh/docs/connectors/table/formats/orc.md
new file mode 100644
index 0000000000000..57877f0609777
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/formats/orc.md
@@ -0,0 +1,173 @@
+---
+title: Orc
+weight: 9
+type: docs
+aliases:
+ - /zh/dev/table/connectors/formats/orc.html
+---
+
+
+# Orc Format
+
+{{< label "Format: Serialization Schema" >}}
+{{< label "Format: Deserialization Schema" >}}
+
+
+[Apache Orc](https://orc.apache.org/) Format 允许读写 ORC 数据。
+
+依赖
+------------
+
+{{< sql_download_table "orc" >}}
+
+
+如何用 Orc 格式创建一个表
+----------------
+
+下面是一个用 Filesystem connector 和 Orc format 创建表的例子:
+
+```sql
+CREATE TABLE user_behavior (
+ user_id BIGINT,
+ item_id BIGINT,
+ category_id BIGINT,
+ behavior STRING,
+ ts TIMESTAMP(3),
+ dt STRING
+) PARTITIONED BY (dt) WITH (
+ 'connector' = 'filesystem',
+ 'path' = '/tmp/user_behavior',
+ 'format' = 'orc'
+)
+```
+
+Format 参数
+----------------
+
+
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/table/hive/_index.md b/docs/content.zh/docs/connectors/table/hive/_index.md
new file mode 100644
index 0000000000000..615740adb5005
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/_index.md
@@ -0,0 +1,23 @@
+---
+title: Hive
+bookCollapseSection: true
+weight: 16
+---
+
\ No newline at end of file
diff --git a/docs/content.zh/docs/connectors/table/hive/hive_catalog.md b/docs/content.zh/docs/connectors/table/hive/hive_catalog.md
new file mode 100644
index 0000000000000..0353a1be80bfc
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/hive_catalog.md
@@ -0,0 +1,397 @@
+---
+title: "Hive Catalog"
+weight: 2
+type: docs
+aliases:
+ - /zh/dev/table/connectors/hive/hive_catalog.html
+---
+
+
+# Hive Catalog
+
+Hive Metastore has evolved into the de facto metadata hub over the years in the Hadoop ecosystem. Many companies have a single
+Hive Metastore service instance in production to manage all of their metadata, both Hive and non-Hive metadata,
+as the source of truth.
+
+For users who have both Hive and Flink deployments, `HiveCatalog` enables them to use Hive Metastore to manage Flink's metadata.
+
+For users who have just a Flink deployment, `HiveCatalog` is the only persistent catalog provided out-of-the-box by Flink.
+Without a persistent catalog, users using [Flink SQL CREATE DDL]({{< ref "docs/dev/table/sql/create" >}}) have to repeatedly
+create meta-objects like a Kafka table in each session, which wastes a lot of time. `HiveCatalog` fills this gap by enabling
+users to create tables and other meta-objects only once, and later reference and manage them conveniently across sessions.
+
+
+## Set up HiveCatalog
+
+### Dependencies
+
+Setting up a `HiveCatalog` in Flink requires the same [dependencies]({{< ref "docs/connectors/table/hive/overview" >}}#dependencies)
+as those of an overall Flink-Hive integration.
+
+### Configuration
+
+Setting up a `HiveCatalog` in Flink requires the same [configuration]({{< ref "docs/connectors/table/hive/overview" >}}#connecting-to-hive)
+as those of an overall Flink-Hive integration.
+
+
+## How to use HiveCatalog
+
+Once configured properly, `HiveCatalog` should just work out of the box. Users can create Flink meta-objects with DDL, and should
+see them immediately afterwards.
+
+`HiveCatalog` can be used to handle two kinds of tables: Hive-compatible tables and generic tables. Hive-compatible tables
+are those stored in a Hive-compatible way, in terms of both metadata and data in the storage layer. Therefore, Hive-compatible tables
+created via Flink can be queried from Hive side.
+
+Generic tables, on the other hand, are specific to Flink. When creating generic tables with `HiveCatalog`, we're just using
+HMS to persist the metadata. While these tables are visible to Hive, it's unlikely that Hive is able to understand
+the metadata. Therefore, using such tables in Hive leads to undefined behavior.
+
+Flink uses the property '*is_generic*' to tell whether a table is Hive-compatible or generic. When creating a table with
+`HiveCatalog`, it's by default considered generic. If you'd like to create a Hive-compatible table, make sure to set
+`is_generic` to false in your table properties.
+
+As stated above, generic tables shouldn't be used from Hive. In Hive CLI, you can call `DESCRIBE FORMATTED` for a table and
+decide whether it's generic or not by checking the `is_generic` property. Generic tables will have `is_generic=true`.
+
+### Example
+
+We will walk through a simple example here.
+
+#### step 1: set up a Hive Metastore
+
+Have a Hive Metastore running.
+
+Here, we set up a local Hive Metastore and our `hive-site.xml` file in local path `/opt/hive-conf/hive-site.xml`.
+We have some configs like the following:
+
+```xml
+<configuration>
+   <property>
+      <name>javax.jdo.option.ConnectionURL</name>
+      <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
+      <description>metadata is stored in a MySQL server</description>
+   </property>
+
+   <property>
+      <name>javax.jdo.option.ConnectionDriverName</name>
+      <value>com.mysql.jdbc.Driver</value>
+      <description>MySQL JDBC driver class</description>
+   </property>
+
+   <property>
+      <name>javax.jdo.option.ConnectionUserName</name>
+      <value>...</value>
+      <description>user name for connecting to mysql server</description>
+   </property>
+
+   <property>
+      <name>javax.jdo.option.ConnectionPassword</name>
+      <value>...</value>
+      <description>password for connecting to mysql server</description>
+   </property>
+
+   <property>
+      <name>hive.metastore.uris</name>
+      <value>thrift://localhost:9083</value>
+      <description>IP address (or fully-qualified domain name) and port of the metastore host</description>
+   </property>
+
+   <property>
+      <name>hive.metastore.schema.verification</name>
+      <value>true</value>
+   </property>
+</configuration>
+```
+
+
+Test the connection to the HMS with the Hive CLI. Running some commands, we can see we have a database named `default` and there's no table in it.
+
+
+```bash
+
+hive> show databases;
+OK
+default
+Time taken: 0.032 seconds, Fetched: 1 row(s)
+
+hive> show tables;
+OK
+Time taken: 0.028 seconds, Fetched: 0 row(s)
+```
+
+
+#### step 2: configure Flink cluster and SQL CLI
+
+Add all Hive dependencies to the `/lib` dir of the Flink distribution, and modify the SQL CLI's yaml config file `sql-cli-defaults.yaml` as follows:
+
+```yaml
+
+execution:
+ planner: blink
+ type: streaming
+ ...
+ current-catalog: myhive # set the HiveCatalog as the current catalog of the session
+ current-database: mydatabase
+
+catalogs:
+ - name: myhive
+ type: hive
+ hive-conf-dir: /opt/hive-conf # contains hive-site.xml
+```
+
+
+#### step 3: set up a Kafka cluster
+
+Bootstrap a local Kafka 2.3.0 cluster with a topic named "test", and produce some simple data to the topic as tuples of name and age.
+
+```bash
+
+localhost$ bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test
+>tom,15
+>john,21
+
+```
+
+
+These messages can be seen by starting a Kafka console consumer.
+
+```bash
+localhost$ bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic test --from-beginning
+
+tom,15
+john,21
+
+```
+
+
+#### step 4: start SQL Client, and create a Kafka table with Flink SQL DDL
+
+Start Flink SQL Client, create a simple Kafka 2.3.0 table via DDL, and verify its schema.
+
+```bash
+
+Flink SQL> CREATE TABLE mykafka (name String, age Int) WITH (
+ 'connector.type' = 'kafka',
+ 'connector.version' = 'universal',
+ 'connector.topic' = 'test',
+ 'connector.properties.bootstrap.servers' = 'localhost:9092',
+ 'format.type' = 'csv',
+ 'update-mode' = 'append'
+);
+[INFO] Table has been created.
+
+Flink SQL> DESCRIBE mykafka;
+root
+ |-- name: STRING
+ |-- age: INT
+
+```
+
+Verify the table is also visible to Hive via the Hive CLI, and note that the table has the property `is_generic=true`:
+
+```bash
+hive> show tables;
+OK
+mykafka
+Time taken: 0.038 seconds, Fetched: 1 row(s)
+
+hive> describe formatted mykafka;
+OK
+# col_name data_type comment
+
+
+# Detailed Table Information
+Database: default
+Owner: null
+CreateTime: ......
+LastAccessTime: UNKNOWN
+Retention: 0
+Location: ......
+Table Type: MANAGED_TABLE
+Table Parameters:
+ flink.connector.properties.bootstrap.servers localhost:9092
+ flink.connector.topic test
+ flink.connector.type kafka
+ flink.connector.version universal
+ flink.format.type csv
+ flink.generic.table.schema.0.data-type VARCHAR(2147483647)
+ flink.generic.table.schema.0.name name
+ flink.generic.table.schema.1.data-type INT
+ flink.generic.table.schema.1.name age
+ flink.update-mode append
+ is_generic true
+ transient_lastDdlTime ......
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat: org.apache.hadoop.mapred.TextInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+Storage Desc Params:
+ serialization.format 1
+Time taken: 0.158 seconds, Fetched: 36 row(s)
+
+```
+
+
+#### step 5: run Flink SQL to query the Kafka table
+
+Run a simple select query from Flink SQL Client in a Flink cluster, either standalone or yarn-session.
+
+```bash
+Flink SQL> select * from mykafka;
+
+```
+
+
+Produce some more messages in the Kafka topic; the console consumer now shows:
+
+```bash
+localhost$ bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic test --from-beginning
+
+tom,15
+john,21
+kitty,30
+amy,24
+kaiky,18
+
+```
+
+
+You should now see the results produced by Flink in the SQL Client, as:
+
+
+```bash
+ SQL Query Result (Table)
+ Refresh: 1 s Page: Last of 1
+
+ name age
+ tom 15
+ john 21
+ kitty 30
+ amy 24
+ kaiky 18
+
+```
+
+## Supported Types
+
+`HiveCatalog` supports all Flink types for generic tables.
+
+For Hive-compatible tables, `HiveCatalog` needs to map Flink data types to corresponding Hive types as described in
+the following table:
+
+
+| Flink Data Type | Hive Data Type |
+| :-------------- | :------------- |
+| CHAR(p) | CHAR(p) |
+| VARCHAR(p) | VARCHAR(p) |
+| STRING | STRING |
+| BOOLEAN | BOOLEAN |
+| TINYINT | TINYINT |
+| SMALLINT | SMALLINT |
+| INT | INT |
+| BIGINT | LONG |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL(p, s) | DECIMAL(p, s) |
+| DATE | DATE |
+| TIMESTAMP(9) | TIMESTAMP |
+| BYTES | BINARY |
+| `ARRAY<T>` | `LIST<T>` |
+| MAP | MAP |
+| ROW | STRUCT |
+
+Something to note about the type mapping:
+* Hive's `CHAR(p)` has a maximum length of 255
+* Hive's `VARCHAR(p)` has a maximum length of 65535
+* Hive's `MAP` only supports primitive key types while Flink's `MAP` can be any data type
+* Hive's `UNION` type is not supported
+* Hive's `TIMESTAMP` always has precision 9 and doesn't support other precisions. Hive UDFs, on the other hand, can process `TIMESTAMP` values with a precision <= 9.
+* Hive doesn't support Flink's `TIMESTAMP_WITH_TIME_ZONE`, `TIMESTAMP_WITH_LOCAL_TIME_ZONE`, and `MULTISET`
+* Flink's `INTERVAL` type cannot be mapped to Hive `INTERVAL` type yet
+
+## Scala Shell
+
+NOTE: since the Blink planner is not well supported in the Scala Shell at the moment, it is **NOT** recommended to use the Hive connector in the Scala Shell.
diff --git a/docs/content.zh/docs/connectors/table/hive/hive_dialect.md b/docs/content.zh/docs/connectors/table/hive/hive_dialect.md
new file mode 100644
index 0000000000000..9840494a4c365
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/hive_dialect.md
@@ -0,0 +1,419 @@
+---
+title: "Hive 方言"
+weight: 3
+type: docs
+aliases:
+ - /zh/dev/table/connectors/hive/hive_dialect.html
+---
+
+
+# Hive 方言
+
+从 1.11.0 开始,在使用 Hive 方言时,Flink 允许用户用 Hive 语法来编写 SQL 语句。通过提供与 Hive 语法的兼容性,我们旨在改善与 Hive 的互操作性,并减少用户需要在 Flink 和 Hive 之间切换来执行不同语句的情况。
+
+## 使用 Hive 方言
+
+Flink 目前支持两种 SQL 方言: `default` 和 `hive`。你需要先切换到 Hive 方言,然后才能使用 Hive 语法编写。下面介绍如何使用 SQL 客户端和 Table API 设置方言。
+还要注意,你可以为执行的每个语句动态切换方言。无需重新启动会话即可使用其他方言。
+
+### SQL 客户端
+
+SQL 方言可以通过 `table.sql-dialect` 属性指定。因此你可以通过 SQL 客户端 yaml 文件中的 `configuration` 部分来设置初始方言。
+
+```yaml
+
+execution:
+ planner: blink
+ type: batch
+ result-mode: table
+
+configuration:
+ table.sql-dialect: hive
+
+```
+
+你同样可以在 SQL 客户端启动后设置方言。
+
+```bash
+
+Flink SQL> set table.sql-dialect=hive; -- to use hive dialect
+[INFO] Session property has been set.
+
+Flink SQL> set table.sql-dialect=default; -- to use default dialect
+[INFO] Session property has been set.
+
+```
+
+### Table API
+
+你可以使用 Table API 为 TableEnvironment 设置方言。
+
+{{< tabs "82a7968d-df12-4db2-83ab-16f09b263935" >}}
+{{< tab "Java" >}}
+```java
+
+EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner()...build();
+TableEnvironment tableEnv = TableEnvironment.create(settings);
+// to use hive dialect
+tableEnv.getConfig().setSqlDialect(SqlDialect.HIVE);
+// to use default dialect
+tableEnv.getConfig().setSqlDialect(SqlDialect.DEFAULT);
+
+```
+{{< /tab >}}
+{{< tab "Python" >}}
+```python
+from pyflink.table import *
+
+settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
+t_env = TableEnvironment.create(settings)
+
+# to use hive dialect
+t_env.get_config().set_sql_dialect(SqlDialect.HIVE)
+# to use default dialect
+t_env.get_config().set_sql_dialect(SqlDialect.DEFAULT)
+
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+## DDL
+
+本章节列出了 Hive 方言支持的 DDL 语句。我们主要关注语法。你可以参考 [Hive 文档](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL)
+了解每个 DDL 语句的语义。
+
+### CATALOG
+
+#### Show
+
+```sql
+SHOW CURRENT CATALOG;
+```
+
+### DATABASE
+
+#### Show
+
+```sql
+SHOW DATABASES;
+```
+
+#### Create
+
+```sql
+CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
+ [COMMENT database_comment]
+ [LOCATION fs_path]
+ [WITH DBPROPERTIES (property_name=property_value, ...)];
+```
+
+#### Alter
+
+##### Update Properties
+
+```sql
+ALTER (DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...);
+```
+
+##### Update Owner
+
+```sql
+ALTER (DATABASE|SCHEMA) database_name SET OWNER [USER|ROLE] user_or_role;
+```
+
+##### Update Location
+
+```sql
+ALTER (DATABASE|SCHEMA) database_name SET LOCATION fs_path;
+```
+
+#### Drop
+
+```sql
+DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT|CASCADE];
+```
+
+#### Use
+
+```sql
+USE database_name;
+```
+
+### TABLE
+
+#### Show
+
+```sql
+SHOW TABLES;
+```
+
+#### Create
+
+```sql
+CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
+ [(col_name data_type [column_constraint] [COMMENT col_comment], ... [table_constraint])]
+ [COMMENT table_comment]
+ [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
+ [
+ [ROW FORMAT row_format]
+ [STORED AS file_format]
+ ]
+ [LOCATION fs_path]
+ [TBLPROPERTIES (property_name=property_value, ...)]
+
+row_format:
+ : DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
+ [MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
+ [NULL DEFINED AS char]
+ | SERDE serde_name [WITH SERDEPROPERTIES (property_name=property_value, ...)]
+
+file_format:
+ : SEQUENCEFILE
+ | TEXTFILE
+ | RCFILE
+ | ORC
+ | PARQUET
+ | AVRO
+ | INPUTFORMAT input_format_classname OUTPUTFORMAT output_format_classname
+
+column_constraint:
+ : NOT NULL [[ENABLE|DISABLE] [VALIDATE|NOVALIDATE] [RELY|NORELY]]
+
+table_constraint:
+ : [CONSTRAINT constraint_name] PRIMARY KEY (col_name, ...) [[ENABLE|DISABLE] [VALIDATE|NOVALIDATE] [RELY|NORELY]]
+```
+
+#### Alter
+
+##### Rename
+
+```sql
+ALTER TABLE table_name RENAME TO new_table_name;
+```
+
+##### Update Properties
+
+```sql
+ALTER TABLE table_name SET TBLPROPERTIES (property_name = property_value, property_name = property_value, ... );
+```
+
+##### Update Location
+
+```sql
+ALTER TABLE table_name [PARTITION partition_spec] SET LOCATION fs_path;
+```
+
+如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。
+
+##### Update File Format
+
+```sql
+ALTER TABLE table_name [PARTITION partition_spec] SET FILEFORMAT file_format;
+```
+
+如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。
+
+##### Update SerDe Properties
+
+```sql
+ALTER TABLE table_name [PARTITION partition_spec] SET SERDE serde_class_name [WITH SERDEPROPERTIES serde_properties];
+
+ALTER TABLE table_name [PARTITION partition_spec] SET SERDEPROPERTIES serde_properties;
+
+serde_properties:
+ : (property_name = property_value, property_name = property_value, ... )
+```
+
+如果指定了 `partition_spec`,那么必须完整,即具有所有分区列的值。如果指定了,该操作将作用在对应分区上而不是表上。
+
+##### Add Partitions
+
+```sql
+ALTER TABLE table_name ADD [IF NOT EXISTS] (PARTITION partition_spec [LOCATION fs_path])+;
+```
+
+##### Drop Partitions
+
+```sql
+ALTER TABLE table_name DROP [IF EXISTS] PARTITION partition_spec[, PARTITION partition_spec, ...];
+```
+
+##### Add/Replace Columns
+
+```sql
+ALTER TABLE table_name
+ ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
+ [CASCADE|RESTRICT]
+```
+
+##### Change Column
+
+```sql
+ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type
+ [COMMENT col_comment] [FIRST|AFTER column_name] [CASCADE|RESTRICT];
+```
+
+#### Drop
+
+```sql
+DROP TABLE [IF EXISTS] table_name;
+```
+
+### VIEW
+
+#### Create
+
+```sql
+CREATE VIEW [IF NOT EXISTS] view_name [(column_name, ...) ]
+ [COMMENT view_comment]
+ [TBLPROPERTIES (property_name = property_value, ...)]
+ AS SELECT ...;
+```
+
+#### Alter
+
+**注意**: 变更视图只在 Table API 中有效,SQL 客户端不支持。
+
+##### Rename
+
+```sql
+ALTER VIEW view_name RENAME TO new_view_name;
+```
+
+##### Update Properties
+
+```sql
+ALTER VIEW view_name SET TBLPROPERTIES (property_name = property_value, ... );
+```
+
+##### Update As Select
+
+```sql
+ALTER VIEW view_name AS select_statement;
+```
+
+#### Drop
+
+```sql
+DROP VIEW [IF EXISTS] view_name;
+```
+
+### FUNCTION
+
+#### Show
+
+```sql
+SHOW FUNCTIONS;
+```
+
+#### Create
+
+```sql
+CREATE FUNCTION function_name AS class_name;
+```
+
+#### Drop
+
+```sql
+DROP FUNCTION [IF EXISTS] function_name;
+```
+
+## DML & DQL _`Beta`_
+
+Hive 方言支持常用的 Hive [DML](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML)
+和 [DQL](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Select) 。 下表列出了一些 Hive 方言支持的语法。
+
+- [SORT/CLUSTER/DISTRIBUTE BY](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+SortBy)
+- [Group By](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+GroupBy)
+- [Join](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Joins)
+- [Union](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Union)
+- [LATERAL VIEW](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LateralView)
+- [Window Functions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics)
+- [SubQueries](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+SubQueries)
+- [CTE](https://cwiki.apache.org/confluence/display/Hive/Common+Table+Expression)
+- [INSERT INTO dest schema](https://issues.apache.org/jira/browse/HIVE-9481)
+- [Implicit type conversions](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+Types#LanguageManualTypes-AllowedImplicitConversions)
+
+为了实现更好的语法和语义的兼容,强烈建议使用 [HiveModule]({{< ref "docs/connectors/table/hive/hive_functions" >}}#use-hive-built-in-functions-via-hivemodule)
+并将其放在 Module 列表的首位,以便在函数解析时优先使用 Hive 内置函数。
+
+Hive 方言不再支持 [Flink SQL 语法]({{< ref "docs/dev/table/sql/queries" >}}) 。 若需使用 Flink 语法,请切换到 `default` 方言。
+
+以下是一个使用 Hive 方言的示例。
+
+```bash
+Flink SQL> create catalog myhive with ('type' = 'hive', 'hive-conf-dir' = '/opt/hive-conf');
+[INFO] Execute statement succeed.
+
+Flink SQL> use catalog myhive;
+[INFO] Execute statement succeed.
+
+Flink SQL> load module hive;
+[INFO] Execute statement succeed.
+
+Flink SQL> use modules hive,core;
+[INFO] Execute statement succeed.
+
+Flink SQL> set table.sql-dialect=hive;
+[INFO] Session property has been set.
+
+Flink SQL> select explode(array(1,2,3)); -- call hive udtf
++-----+
+| col |
++-----+
+| 1 |
+| 2 |
+| 3 |
++-----+
+3 rows in set
+
+Flink SQL> create table tbl (key int,value string);
+[INFO] Execute statement succeed.
+
+Flink SQL> insert overwrite table tbl values (5,'e'),(1,'a'),(1,'a'),(3,'c'),(2,'b'),(3,'c'),(3,'c'),(4,'d');
+[INFO] Submitting SQL update statement to the cluster...
+[INFO] SQL update statement has been successfully submitted to the cluster:
+
+Flink SQL> select * from tbl cluster by key; -- run cluster by
+2021-04-22 16:13:57,005 INFO org.apache.hadoop.mapred.FileInputFormat [] - Total input paths to process : 1
++-----+-------+
+| key | value |
++-----+-------+
+| 1 | a |
+| 1 | a |
+| 5 | e |
+| 2 | b |
+| 3 | c |
+| 3 | c |
+| 3 | c |
+| 4 | d |
++-----+-------+
+8 rows in set
+```
+
+## 注意
+
+以下是使用 Hive 方言的一些注意事项。
+
+- Hive 方言只能用于操作 Hive 对象,并要求当前 Catalog 是一个 [HiveCatalog]({{< ref "docs/connectors/table/hive/hive_catalog" >}}) 。
+- Hive 方言只支持 `db.table` 这种两级的标识符,不支持带有 Catalog 名字的标识符。
+- 虽然所有 Hive 版本支持相同的语法,但是一些特定的功能是否可用仍取决于你使用的[Hive 版本]({{< ref "docs/connectors/table/hive/overview" >}}#支持的hive版本)。例如,更新数据库位置
+ 只在 Hive-2.4.0 或更高版本支持。
+- 执行 DML 和 DQL 时应该使用 [HiveModule]({{< ref "docs/connectors/table/hive/hive_functions" >}}#use-hive-built-in-functions-via-hivemodule) 。
diff --git a/docs/content.zh/docs/connectors/table/hive/hive_functions.md b/docs/content.zh/docs/connectors/table/hive/hive_functions.md
new file mode 100644
index 0000000000000..4d1d071b290ee
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/hive_functions.md
@@ -0,0 +1,211 @@
+---
+title: "Hive Functions"
+weight: 5
+type: docs
+aliases:
+ - /zh/dev/table/connectors/hive/hive_functions.html
+---
+
+
+# Hive Functions
+
+## Use Hive Built-in Functions via HiveModule
+
+The `HiveModule` provides Hive built-in functions as Flink system (built-in) functions to Flink SQL and Table API users.
+
+For detailed information, please refer to [HiveModule]({{< ref "docs/dev/table/modules" >}}#hivemodule).
+
+{{< tabs "2e76857e-17c6-45ee-9da8-0819e132e40c" >}}
+{{< tab "Java" >}}
+```java
+
+String name = "myhive";
+String version = "2.3.4";
+
+tableEnv.loadModule(name, new HiveModule(version));
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+```scala
+
+val name = "myhive"
+val version = "2.3.4"
+
+tableEnv.loadModule(name, new HiveModule(version))
+```
+{{< /tab >}}
+{{< tab "Python" >}}
+```Python
+from pyflink.table.module import HiveModule
+
+name = "myhive"
+version = "2.3.4"
+
+t_env.load_module(name, HiveModule(version))
+```
+{{< /tab >}}
+{{< tab "YAML" >}}
+```yaml
+modules:
+ - name: core
+ type: core
+ - name: myhive
+ type: hive
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+{{< hint info >}}
+Some Hive built-in functions in older versions have [thread safety issues](https://issues.apache.org/jira/browse/HIVE-16183).
+We recommend users patch their own Hive to fix them.
+{{< /hint >}}
+
+## Hive User Defined Functions
+
+Users can use their existing Hive User Defined Functions in Flink.
+
+Supported UDF types include:
+
+- UDF
+- GenericUDF
+- GenericUDTF
+- UDAF
+- GenericUDAFResolver2
+
+Upon query planning and execution, Hive's UDF and GenericUDF are automatically translated into Flink's ScalarFunction,
+Hive's GenericUDTF is automatically translated into Flink's TableFunction,
+and Hive's UDAF and GenericUDAFResolver2 are translated into Flink's AggregateFunction.
+
+To use a Hive User Defined Function, users have to
+
+- set a HiveCatalog backed by a Hive Metastore that contains that function as the current catalog of the session
+- include a jar that contains that function in Flink's classpath
+- use the Blink planner.
+
+## Using Hive User Defined Functions
+
+Assuming we have the following Hive functions registered in Hive Metastore:
+
+
+```java
+/**
+ * Test simple udf. Registered under name 'myudf'
+ */
+public class TestHiveSimpleUDF extends UDF {
+
+ public IntWritable evaluate(IntWritable i) {
+ return new IntWritable(i.get());
+ }
+
+ public Text evaluate(Text text) {
+ return new Text(text.toString());
+ }
+}
+
+/**
+ * Test generic udf. Registered under name 'mygenericudf'
+ */
+public class TestHiveGenericUDF extends GenericUDF {
+
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+ checkArgument(arguments.length == 2);
+
+ checkArgument(arguments[1] instanceof ConstantObjectInspector);
+ Object constant = ((ConstantObjectInspector) arguments[1]).getWritableConstantValue();
+ checkArgument(constant instanceof IntWritable);
+ checkArgument(((IntWritable) constant).get() == 1);
+
+ if (arguments[0] instanceof IntObjectInspector ||
+ arguments[0] instanceof StringObjectInspector) {
+ return arguments[0];
+ } else {
+ throw new RuntimeException("Not support argument: " + arguments[0]);
+ }
+ }
+
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException {
+ return arguments[0].get();
+ }
+
+ @Override
+ public String getDisplayString(String[] children) {
+ return "TestHiveGenericUDF";
+ }
+}
+
+/**
+ * Test split udtf. Registered under name 'myudtf'
+ */
+public class TestHiveUDTF extends GenericUDTF {
+
+ @Override
+ public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException {
+ checkArgument(argOIs.length == 2);
+
+ // TEST for constant arguments
+ checkArgument(argOIs[1] instanceof ConstantObjectInspector);
+ Object constant = ((ConstantObjectInspector) argOIs[1]).getWritableConstantValue();
+ checkArgument(constant instanceof IntWritable);
+ checkArgument(((IntWritable) constant).get() == 1);
+
+ return ObjectInspectorFactory.getStandardStructObjectInspector(
+ Collections.singletonList("col1"),
+ Collections.singletonList(PrimitiveObjectInspectorFactory.javaStringObjectInspector));
+ }
+
+ @Override
+ public void process(Object[] args) throws HiveException {
+ String str = (String) args[0];
+ for (String s : str.split(",")) {
+ forward(s);
+ forward(s);
+ }
+ }
+
+ @Override
+ public void close() {
+ }
+}
+
+```
+
+From Hive CLI, we can see they are registered:
+
+```bash
+hive> show functions;
+OK
+......
+mygenericudf
+myudf
+myudtf
+
+```
+
+
+Then, users can use them in SQL as:
+
+
+```bash
+
+Flink SQL> select mygenericudf(myudf(name), 1) as a, mygenericudf(myudf(age), 1) as b, s from mysourcetable, lateral table(myudtf(name, 1)) as T(s);
+
+```
diff --git a/docs/content.zh/docs/connectors/table/hive/hive_read_write.md b/docs/content.zh/docs/connectors/table/hive/hive_read_write.md
new file mode 100644
index 0000000000000..6d45f8dd298be
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/hive_read_write.md
@@ -0,0 +1,454 @@
+---
+title: "Hive Read & Write"
+weight: 4
+type: docs
+aliases:
+ - /zh/dev/table/connectors/hive/hive_read_write.html
+ - /zh/dev/table/hive/hive_streaming.html
+---
+
+
+# Hive Read & Write
+
+Using the `HiveCatalog`, Apache Flink can be used for unified `BATCH` and `STREAM` processing of Apache
+Hive Tables. This means Flink can be used as a more performant alternative to Hive’s batch engine,
+or to continuously read and write data into and out of Hive tables to power real-time data
+warehousing applications.
+
+## Reading
+
+Flink supports reading data from Hive in both `BATCH` and `STREAMING` modes. When run as a `BATCH`
+application, Flink will execute its query over the state of the table at the point in time when the
+query is executed. `STREAMING` reads will continuously monitor the table and incrementally fetch
+new data as it is made available. Flink will read tables as bounded by default.
+
+`STREAMING` reads support consuming both partitioned and non-partitioned tables.
+For partitioned tables, Flink will monitor the generation of new partitions, and read
+them incrementally when available. For non-partitioned tables, Flink will monitor the generation
+of new files in the folder and read new files incrementally.
+
+
+
+
+
+| Key | Default | Type | Description |
+| :-- | :------ | :--- | :---------- |
+| streaming-source.enable | false | Boolean | Enable streaming source or not. NOTES: Please make sure that each partition/file is written atomically, otherwise the reader may get incomplete data. |
+| streaming-source.partition.include | all | String | Option to set the partitions to read. The supported values are `all` and `latest`: `all` reads all partitions; `latest` reads the latest partition in the order defined by 'streaming-source.partition-order' and only works when the streaming Hive source table is used as a temporal table. By default the option is `all`. Flink supports the temporal join with the latest Hive partition by enabling 'streaming-source.enable' and setting 'streaming-source.partition.include' to 'latest'; in that case, users can control the partition comparison order and the data update interval with the partition-related options below. |
+| streaming-source.monitor-interval | None | Duration | Time interval for consecutively monitoring partitions/files. Notes: the default interval for Hive streaming reads is '1 m', while the default interval for the Hive streaming temporal join is '60 m'. This is due to a current framework limitation: in the current Hive streaming temporal join implementation every TM visits the Hive metastore, which may put pressure on the metastore. This will be improved in the future. |
+| streaming-source.partition-order | partition-name | String | The partition order of the streaming source; supported values are create-time, partition-time and partition-name. create-time compares the partition/file creation time; this is not the partition creation time in the Hive metastore, but the folder/file modification time in the filesystem. If the partition folder somehow gets updated, e.g. a new file is added into the folder, it can affect how the data is consumed. partition-time compares the time extracted from the partition name. partition-name compares the partition names in alphabetical order. For a non-partitioned table, this value should always be 'create-time'. By default the value is partition-name. This option is equal to the deprecated option 'streaming-source.consume-order'. |
+| streaming-source.consume-start-offset | None | String | Start offset for streaming consumption. How to parse and compare offsets depends on your order: for create-time and partition-time, it should be a timestamp string (yyyy-[m]m-[d]d [hh:mm:ss]); for partition-time, the partition time extractor is used to extract the time from the partition. For partition-name, it is the partition name string (e.g. pt_year=2020/pt_mon=10/pt_day=01). |
+
+
+
+
+[SQL Hints]({{< ref "docs/dev/table/sql/queries/hints" >}}) can be used to apply configurations to a Hive table
+without changing its definition in the Hive metastore.
+
+```sql
+
+SELECT *
+FROM hive_table
+/*+ OPTIONS('streaming-source.enable'='true', 'streaming-source.consume-start-offset'='2020-05-20') */;
+
+```
+
+**Notes**
+
+- The monitoring strategy is to scan all directories/files currently in the location path, so a large number of partitions may cause performance degradation.
+- Streaming reads for non-partitioned tables require that each file be written atomically into the target directory.
+- Streaming reads for partitioned tables require that each partition be added atomically, as seen by the Hive metastore. Otherwise, new data added to an existing partition will be consumed.
+- Streaming reads do not support watermark grammar in Flink DDL. These tables cannot be used for window operators.
+
+### Reading Hive Views
+
+Flink is able to read from Hive defined views, but some limitations apply:
+
+1) The Hive catalog must be set as the current catalog before you can query the view.
+This can be done by either `tableEnv.useCatalog(...)` in Table API or `USE CATALOG ...` in SQL Client.
+
+2) Hive and Flink SQL have different syntax, e.g. different reserved keywords and literals.
+Make sure the view’s query is compatible with Flink grammar.
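+
+For example, from the SQL Client a view could be queried as sketched below (the catalog, database, and view names `myhive`, `mydb`, and `my_hive_view` are placeholders for your own objects):
+
+```sql
+-- a minimal sketch, assuming the view already exists in the Hive metastore
+USE CATALOG myhive;
+USE mydb;
+SELECT * FROM my_hive_view;
+```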
+
+### Vectorized Optimization upon Read
+
+Flink will automatically use vectorized reads of Hive tables when the following conditions are met:
+
+- Format: ORC or Parquet.
+- No columns of complex data types (Hive types List, Map, Struct, Union).
+
+This feature is enabled by default.
+It may be disabled with the following configuration.
+
+```bash
+table.exec.hive.fallback-mapred-reader=true
+```
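+
+In the SQL Client, for example, this could be switched for the current session with a `SET` statement (a sketch; the same `SET key=value` syntax is used elsewhere on this page):
+
+```sql
+-- disable vectorized reads by falling back to the mapred record reader
+SET table.exec.hive.fallback-mapred-reader=true;
+```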
+
+### Source Parallelism Inference
+
+By default, Flink will infer the optimal parallelism for its Hive readers
+based on the number of files and the number of blocks in each file.
+
+Flink allows you to flexibly configure the policy of parallelism inference. You can configure the
+following parameters in `TableConfig` (note that these parameters affect all sources of the job):
+
+
+
+
+
+| Key | Default | Type | Description |
+| :-- | :------ | :--- | :---------- |
+| table.exec.hive.infer-source-parallelism | true | Boolean | If it is true, the source parallelism is inferred according to the number of splits. If it is false, the parallelism of the source is set by the configuration. |
+| table.exec.hive.infer-source-parallelism.max | 1000 | Integer | Sets the maximum inferred parallelism for the source operator. |
+
+
+
+
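+For example, in the SQL Client these could be adjusted for the current session as sketched below (the limit of 100 is only an illustrative value):
+
+```sql
+-- keep parallelism inference enabled but cap the inferred parallelism
+SET table.exec.hive.infer-source-parallelism=true;
+SET table.exec.hive.infer-source-parallelism.max=100;
+```
+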
+## Temporal Table Join
+
+You can use a Hive table as a temporal table, and then a stream can correlate the Hive table by temporal join.
+Please see [temporal join]({{< ref "docs/dev/table/sql/queries/joins" >}}#temporal-joins) for more information about the temporal join.
+
+Flink supports processing-time temporal joins against Hive tables; a processing-time temporal join always joins the latest version of the temporal table.
+Flink supports temporal joins against both partitioned and non-partitioned Hive tables; for partitioned tables, Flink automatically tracks the latest partition of the Hive table.
+
+**NOTE**: Flink does not yet support event-time temporal joins with Hive tables.
+
+### Temporal Join The Latest Partition
+
+For a partitioned table that changes over time, we can read it as an unbounded stream. If every partition contains the complete data of a version, the partition can act as one version of the temporal table: the version of the temporal table keeps the data of that partition.
+
+Flink supports automatically tracking the latest partition (version) of the temporal table in a processing-time temporal join; the latest partition (version) is defined by the 'streaming-source.partition-order' option.
+This is the most common use case of a Hive table as a dimension table in a Flink streaming application.
+
+**NOTE:** This feature is only supported in Flink `STREAMING` mode.
+
+The following demo shows a classic business pipeline: the dimension table comes from Hive and is updated once a day by a batch pipeline or a Flink job, while the Kafka stream carries real-time online business data or logs and needs to be joined with the dimension table to enrich the stream.
+
+```sql
+-- Assume the data in hive table is updated per day, every day contains the latest and complete dimension data
+SET table.sql-dialect=hive;
+CREATE TABLE dimension_table (
+ product_id STRING,
+ product_name STRING,
+ unit_price DECIMAL(10, 4),
+ pv_count BIGINT,
+ like_count BIGINT,
+ comment_count BIGINT,
+ update_time TIMESTAMP(3),
+ update_user STRING,
+ ...
+) PARTITIONED BY (pt_year STRING, pt_month STRING, pt_day STRING) TBLPROPERTIES (
+ -- using default partition-name order to load the latest partition every 12h (the most recommended and convenient way)
+ 'streaming-source.enable' = 'true',
+ 'streaming-source.partition.include' = 'latest',
+ 'streaming-source.monitor-interval' = '12 h',
+ 'streaming-source.partition-order' = 'partition-name', -- option with default value, can be ignored.
+
+ -- using partition file create-time order to load the latest partition every 12h
+ 'streaming-source.enable' = 'true',
+ 'streaming-source.partition.include' = 'latest',
+ 'streaming-source.partition-order' = 'create-time',
+ 'streaming-source.monitor-interval' = '12 h'
+
+ -- using partition-time order to load the latest partition every 12h
+ 'streaming-source.enable' = 'true',
+ 'streaming-source.partition.include' = 'latest',
+ 'streaming-source.monitor-interval' = '12 h',
+ 'streaming-source.partition-order' = 'partition-time',
+ 'partition.time-extractor.kind' = 'default',
+ 'partition.time-extractor.timestamp-pattern' = '$pt_year-$pt_month-$pt_day 00:00:00'
+);
+
+SET table.sql-dialect=default;
+CREATE TABLE orders_table (
+ order_id STRING,
+ order_amount DOUBLE,
+ product_id STRING,
+ log_ts TIMESTAMP(3),
+ proctime as PROCTIME()
+) WITH (...);
+
+
+-- streaming sql, kafka temporal join a hive dimension table. Flink will automatically reload data from the
+-- configured latest partition in the interval of 'streaming-source.monitor-interval'.
+
+SELECT * FROM orders_table AS o
+JOIN dimension_table FOR SYSTEM_TIME AS OF o.proctime AS dim
+ON o.product_id = dim.product_id;
+
+```
+
+### Temporal Join The Latest Table
+
+For a Hive table, we can read it out as a bounded stream. In this case, the Hive table can only track its latest version at the time when we query it.
+The latest version of the table keeps all the data of the Hive table.
+
+When performing a temporal join against the latest Hive table, the Hive table is cached in the memory of the task slot, and each record from the stream is joined against the table by key to decide whether a match is found.
+Using the latest Hive table as a temporal table does not require any additional configuration. Optionally, you can configure the TTL of the Hive table cache with the following property. After the cache expires, the Hive table will be scanned again to load the latest data.
+
+
+
+
+
+| Key | Default | Type | Description |
+| :-- | :------ | :--- | :---------- |
+| lookup.join.cache.ttl | 60 min | Duration | The cache TTL (e.g. 10min) for the build table in a lookup join. By default the TTL is 60 minutes. NOTES: The option only works when looking up a bounded Hive table source; if you're using a streaming Hive source as the temporal table, please use 'streaming-source.monitor-interval' to configure the interval of data updates. |
+
+
+
+
+
+The following demo shows how to load all the data of a Hive table as a temporal table.
+
+```sql
+-- Assume the data in the hive table is overwritten by a batch pipeline.
+SET table.sql-dialect=hive;
+CREATE TABLE dimension_table (
+ product_id STRING,
+ product_name STRING,
+ unit_price DECIMAL(10, 4),
+ pv_count BIGINT,
+ like_count BIGINT,
+ comment_count BIGINT,
+ update_time TIMESTAMP(3),
+ update_user STRING,
+ ...
+) TBLPROPERTIES (
+ 'streaming-source.enable' = 'false', -- option with default value, can be ignored.
+ 'streaming-source.partition.include' = 'all', -- option with default value, can be ignored.
+ 'lookup.join.cache.ttl' = '12 h'
+);
+
+SET table.sql-dialect=default;
+CREATE TABLE orders_table (
+ order_id STRING,
+ order_amount DOUBLE,
+ product_id STRING,
+ log_ts TIMESTAMP(3),
+ proctime as PROCTIME()
+) WITH (...);
+
+
+-- streaming sql, kafka join a hive dimension table. Flink will reload all data from dimension_table after cache ttl is expired.
+
+SELECT * FROM orders_table AS o
+JOIN dimension_table FOR SYSTEM_TIME AS OF o.proctime AS dim
+ON o.product_id = dim.product_id;
+
+```
+Note:
+
+1. Each joining subtask needs to keep its own cache of the Hive table. Please make sure the Hive table can fit into the memory of a TM task slot.
+2. It is encouraged to set a relatively large value for both `streaming-source.monitor-interval` (latest partition as temporal table) and `lookup.join.cache.ttl` (all partitions as temporal table). Otherwise, jobs are prone to performance issues as the table needs to be updated and reloaded too frequently.
+3. Currently we simply load the whole Hive table whenever the cache needs refreshing. There's no way to differentiate
+new data from the old.
+
+## Writing
+
+Flink supports writing data to Hive in both `BATCH` and `STREAMING` modes. When run as a `BATCH`
+application, Flink writes to a Hive table and only makes those records visible when the job finishes.
+`BATCH` writes support both appending to and overwriting existing tables.
+
+```sql
+-- INSERT INTO will append to the table or partition, keeping the existing data intact
+Flink SQL> INSERT INTO mytable SELECT 'Tom', 25;
+
+-- INSERT OVERWRITE will overwrite any existing data in the table or partition
+Flink SQL> INSERT OVERWRITE mytable SELECT 'Tom', 25;
+```
+
+Data can also be inserted into particular partitions.
+
+```sql
+-- Insert with static partition
+Flink SQL> INSERT OVERWRITE myparttable PARTITION (my_type='type_1', my_date='2019-08-08') SELECT 'Tom', 25;
+
+-- Insert with dynamic partition
+Flink SQL> INSERT OVERWRITE myparttable SELECT 'Tom', 25, 'type_1', '2019-08-08';
+
+-- Insert with static(my_type) and dynamic(my_date) partition
+Flink SQL> INSERT OVERWRITE myparttable PARTITION (my_type='type_1') SELECT 'Tom', 25, '2019-08-08';
+```
+
+`STREAMING` writes continuously add new data to Hive, committing records - and thereby making them
+visible - incrementally. Users control when and how to trigger commits with several properties. Insert
+overwrite is not supported for streaming writes.
+
+The example below shows how the streaming sink can be used in a streaming query that writes data from Kafka into a Hive table with partition commit,
+followed by a batch query that reads that data back out.
+
+Please see the [streaming sink]({{< ref "docs/connectors/table/filesystem" >}}#streaming-sink) for a full list of available configurations.
+
+```sql
+
+SET table.sql-dialect=hive;
+CREATE TABLE hive_table (
+ user_id STRING,
+ order_amount DOUBLE
+) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (
+ 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',
+ 'sink.partition-commit.trigger'='partition-time',
+ 'sink.partition-commit.delay'='1 h',
+ 'sink.partition-commit.policy.kind'='metastore,success-file'
+);
+
+SET table.sql-dialect=default;
+CREATE TABLE kafka_table (
+ user_id STRING,
+ order_amount DOUBLE,
+ log_ts TIMESTAMP(3),
+ WATERMARK FOR log_ts AS log_ts - INTERVAL '5' SECOND -- Define watermark on TIMESTAMP column
+) WITH (...);
+
+-- streaming sql, insert into hive table
+INSERT INTO TABLE hive_table
+SELECT user_id, order_amount, DATE_FORMAT(log_ts, 'yyyy-MM-dd'), DATE_FORMAT(log_ts, 'HH')
+FROM kafka_table;
+
+-- batch sql, select with partition pruning
+SELECT * FROM hive_table WHERE dt='2020-05-20' and hr='12';
+
+```
+
+If the watermark is defined on a TIMESTAMP_LTZ column and `partition-time` is used to commit, `sink.partition-commit.watermark-time-zone` must be set to the session time zone, otherwise partitions may be committed a few hours later than expected.
+```sql
+
+SET table.sql-dialect=hive;
+CREATE TABLE hive_table (
+ user_id STRING,
+ order_amount DOUBLE
+) PARTITIONED BY (dt STRING, hr STRING) STORED AS parquet TBLPROPERTIES (
+ 'partition.time-extractor.timestamp-pattern'='$dt $hr:00:00',
+ 'sink.partition-commit.trigger'='partition-time',
+ 'sink.partition-commit.delay'='1 h',
+ 'sink.partition-commit.watermark-time-zone'='Asia/Shanghai', -- Assume user configured time zone is 'Asia/Shanghai'
+ 'sink.partition-commit.policy.kind'='metastore,success-file'
+);
+
+SET table.sql-dialect=default;
+CREATE TABLE kafka_table (
+ user_id STRING,
+ order_amount DOUBLE,
+ ts BIGINT, -- time in epoch milliseconds
+ ts_ltz AS TO_TIMESTAMP_LTZ(ts, 3),
+ WATERMARK FOR ts_ltz AS ts_ltz - INTERVAL '5' SECOND -- Define watermark on TIMESTAMP_LTZ column
+) WITH (...);
+
+-- streaming sql, insert into hive table
+INSERT INTO TABLE hive_table
+SELECT user_id, order_amount, DATE_FORMAT(ts_ltz, 'yyyy-MM-dd'), DATE_FORMAT(ts_ltz, 'HH')
+FROM kafka_table;
+
+-- batch sql, select with partition pruning
+SELECT * FROM hive_table WHERE dt='2020-05-20' and hr='12';
+
+```
+
+By default, for streaming writes, Flink only supports renaming committers, meaning the S3 filesystem
+cannot support exactly-once streaming writes.
+Exactly-once writes to S3 can be achieved by configuring the following parameter to false.
+This will instruct the sink to use Flink's native writers but only works for
+parquet and orc file types.
+This configuration is set in the `TableConfig` and will affect all sinks of the job.
+
+
+
+
+
+| Key | Default | Type | Description |
+| :-- | :------ | :--- | :---------- |
+| table.exec.hive.fallback-mapred-writer | true | Boolean | If it is false, use Flink's native writers to write parquet and orc files; if it is true, use the Hadoop mapred record writer to write parquet and orc files. |
+
+
+
+
+
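+For example, in the SQL Client the native writers could be enabled for the current session as sketched below:
+
+```sql
+-- use Flink's native Parquet/ORC writers (required for exactly-once streaming writes to S3)
+SET table.exec.hive.fallback-mapred-writer=false;
+```
+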
+## Formats
+
+Flink's Hive integration has been tested against the following file formats:
+
+- Text
+- CSV
+- SequenceFile
+- ORC
+- Parquet
diff --git a/docs/content.zh/docs/connectors/table/hive/overview.md b/docs/content.zh/docs/connectors/table/hive/overview.md
new file mode 100644
index 0000000000000..a32a3ff4d1809
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/hive/overview.md
@@ -0,0 +1,460 @@
+---
+title: "Overview"
+weight: 1
+type: docs
+aliases:
+ - /zh/dev/table/connectors/hive/
+---
+
+
+# Hive
+
+[Apache Hive](https://hive.apache.org/) has established itself as a focal point of the data warehousing ecosystem.
+It serves not only as a SQL engine for big data analytics and ETL, but also as a data management platform where data can be discovered, defined, and evolved.
+
+Flink offers a two-fold integration with Hive.
+
+The first is to leverage Hive's Metastore as a persistent catalog: with the `HiveCatalog`, Flink metadata from different sessions can be stored in the Hive Metastore.
+For example, users can store their Kafka or Elasticsearch tables in the Hive Metastore by using the `HiveCatalog`, and reuse them later in SQL queries.
+
+The second is to use Flink to read and write Hive tables.
+
+The `HiveCatalog` is designed to be compatible with existing Hive installations, so that you can access your existing Hive warehouse "out of the box".
+You do not need to modify your existing Hive Metastore or change the data placement or partitioning of your tables.
+
+* We strongly recommend users use the [Blink planner]({{< ref "docs/dev/table/overview" >}}#dependency-structure) when integrating with Hive.
+
+## Supported Hive Versions
+
+Flink supports the following Hive versions.
+
+- 1.0
+ - 1.0.0
+ - 1.0.1
+- 1.1
+ - 1.1.0
+ - 1.1.1
+- 1.2
+ - 1.2.0
+ - 1.2.1
+ - 1.2.2
+- 2.0
+ - 2.0.0
+ - 2.0.1
+- 2.1
+ - 2.1.0
+ - 2.1.1
+- 2.2
+ - 2.2.0
+- 2.3
+ - 2.3.0
+ - 2.3.1
+ - 2.3.2
+ - 2.3.3
+ - 2.3.4
+ - 2.3.5
+ - 2.3.6
+- 3.1
+ - 3.1.0
+ - 3.1.1
+ - 3.1.2
+
+Please note that certain features may or may not be available depending on the Hive version you use; these limitations are not caused by Flink:
+
+- Hive built-in functions are supported with Hive-1.2.0 and later.
+- Column constraints, i.e. PRIMARY KEY and NOT NULL, are supported with Hive-3.1.0 and later.
+- Altering table statistics is supported with Hive-1.2.0 and later.
+- `DATE` column statistics are supported with Hive-1.2.0 and later.
+- Writing to ORC tables is not supported with Hive-2.0.x.
+
+### Dependencies
+
+To integrate with Hive, you need to add some extra dependencies to the `/lib/` directory of your Flink distribution
+so that the integration works through the Table API or SQL Client.
+Alternatively, you can put these dependencies in a dedicated folder and add them to the classpath with the `-C` or `-l` option of the Table API program or SQL Client, respectively.
+
+Apache Hive is built on Hadoop, so you need the Hadoop dependencies first; please refer to
+Providing Hadoop classes:
+```
+export HADOOP_CLASSPATH=`hadoop classpath`
+```
+
+There are two ways to add Hive dependencies. The first is to use Flink's bundled Hive jars: you can pick a bundled Hive jar according to the version of the Metastore you use. The second is to add each of the required jars separately. The second way is preferable if the Hive version you are using is not listed here.
+
+**NOTE**: the recommended way is to use Flink's bundled Hive jars. Only consider adding jars separately when the bundled jars do not meet your needs.
+
+#### Using bundled Hive jars
+
+The following table lists all available bundled Hive jars. You can pick one and place it in the `/lib/` directory of your Flink distribution.
+
+| Metastore version | Maven dependency | SQL Client JAR |
+| :---------------- | :--------------------------- | :----------------------|
+| 1.0.0 - 1.2.2 | `flink-sql-connector-hive-1.2.2` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-1.2.2{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-1.2.2{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} |
+| 2.0.0 - 2.2.0 | `flink-sql-connector-hive-2.2.0` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.2.0{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-2.2.0{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} |
+| 2.3.0 - 2.3.6 | `flink-sql-connector-hive-2.3.6` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.3.6{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-2.3.6{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} |
+| 3.0.0 - 3.1.2 | `flink-sql-connector-hive-3.1.2` | {{< stable >}}[Download](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2{{< scala_version >}}/{{< version >}}/flink-sql-connector-hive-3.1.2{{< scala_version >}}-{{< version >}}.jar) {{< /stable >}}{{< unstable >}} Only available for stable releases {{< /unstable >}} |
+
+#### User defined dependencies
+
+Below you can find the required dependencies for different Hive major versions.
+
+{{< tabs "8623cd64-8623-4922-92d2-ee82ec410d96" >}}
+{{< tab "Hive 2.3.4" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector.Contains flink-hadoop-compatibility and flink-orc jars
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-exec-2.3.4.jar
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 1.0.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-metastore-1.0.0.jar
+ hive-exec-1.0.0.jar
+ libfb303-0.9.0.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately
+
+ // Orc dependencies -- required by the ORC vectorized optimizations
+ orc-core-1.4.3-nohive.jar
+ aircompressor-0.8.jar // transitive dependency of orc-core
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 1.1.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-metastore-1.1.0.jar
+ hive-exec-1.1.0.jar
+ libfb303-0.9.2.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately
+
+ // Orc dependencies -- required by the ORC vectorized optimizations
+ orc-core-1.4.3-nohive.jar
+ aircompressor-0.8.jar // transitive dependency of orc-core
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 1.2.1" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-metastore-1.2.1.jar
+ hive-exec-1.2.1.jar
+ libfb303-0.9.2.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately
+
+ // Orc dependencies -- required by the ORC vectorized optimizations
+ orc-core-1.4.3-nohive.jar
+ aircompressor-0.8.jar // transitive dependency of orc-core
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 2.0.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-exec-2.0.0.jar
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 2.1.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-exec-2.1.0.jar
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 2.2.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-exec-2.2.0.jar
+
+ // Orc dependencies -- required by the ORC vectorized optimizations
+ orc-core-1.4.3.jar
+ aircompressor-0.8.jar // transitive dependency of orc-core
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< tab "Hive 3.1.0" >}}
+```txt
+/flink-{{< version >}}
+ /lib
+
+ // Flink's Hive connector
+ flink-connector-hive{{< scala_version >}}-{{< version >}}.jar
+
+ // Hive dependencies
+ hive-exec-3.1.0.jar
+ libfb303-0.9.3.jar // libfb303 is not packed into hive-exec in some versions, need to add it separately
+
+ // add antlr-runtime if you need to use hive dialect
+ antlr-runtime-3.5.2.jar
+
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Maven Dependency
+
+If you are building your own application, you need the following dependencies in your pom.xml.
+These dependencies should be provided at runtime, and not be included in the produced jar file.
+
+```xml
+<!-- Flink Dependency -->
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-connector-hive{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version}}</version>
+  <scope>provided</scope>
+</dependency>
+
+<dependency>
+  <groupId>org.apache.flink</groupId>
+  <artifactId>flink-table-api-java-bridge{{ site.scala_version_suffix }}</artifactId>
+  <version>{{site.version}}</version>
+  <scope>provided</scope>
+</dependency>
+
+<!-- Hive Dependency -->
+<dependency>
+  <groupId>org.apache.hive</groupId>
+  <artifactId>hive-exec</artifactId>
+  <version>${hive.version}</version>
+  <scope>provided</scope>
+</dependency>
+```
+
+## Connecting To Hive
+
+Connect to an existing Hive installation using the [catalog interface]({{< ref "docs/dev/table/catalogs" >}}) and the [HiveCatalog]({{< ref "docs/connectors/table/hive/hive_catalog" >}}) through the TableEnvironment or YAML configuration.
+
+Please note that while HiveCatalog does not require a particular planner, reading and writing Hive tables only works with the Blink planner. Therefore it is highly recommended that you use the Blink planner when connecting to your Hive warehouse.
+
+Following is an example of how to connect to Hive:
+
+{{< tabs "2ca7cad8-0b84-45db-92d9-a75abd8808e7" >}}
+{{< tab "Java" >}}
+
+```java
+
+EnvironmentSettings settings = EnvironmentSettings.newInstance().useBlinkPlanner().build();
+TableEnvironment tableEnv = TableEnvironment.create(settings);
+
+String name = "myhive";
+String defaultDatabase = "mydatabase";
+String hiveConfDir = "/opt/hive-conf";
+
+HiveCatalog hive = new HiveCatalog(name, defaultDatabase, hiveConfDir);
+tableEnv.registerCatalog("myhive", hive);
+
+// set the HiveCatalog as the current catalog of the session
+tableEnv.useCatalog("myhive");
+```
+{{< /tab >}}
+{{< tab "Scala" >}}
+
+```scala
+
+val settings = EnvironmentSettings.newInstance().useBlinkPlanner().build()
+val tableEnv = TableEnvironment.create(settings)
+
+val name = "myhive"
+val defaultDatabase = "mydatabase"
+val hiveConfDir = "/opt/hive-conf"
+
+val hive = new HiveCatalog(name, defaultDatabase, hiveConfDir)
+tableEnv.registerCatalog("myhive", hive)
+
+// set the HiveCatalog as the current catalog of the session
+tableEnv.useCatalog("myhive")
+```
+{{< /tab >}}
+{{< tab "Python" >}}
+```python
+from pyflink.table import *
+from pyflink.table.catalog import HiveCatalog
+
+settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
+t_env = TableEnvironment.create(settings)
+
+catalog_name = "myhive"
+default_database = "mydatabase"
+hive_conf_dir = "/opt/hive-conf"
+
+hive_catalog = HiveCatalog(catalog_name, default_database, hive_conf_dir)
+t_env.register_catalog("myhive", hive_catalog)
+
+# set the HiveCatalog as the current catalog of the session
+t_env.use_catalog("myhive")
+```
+{{< /tab >}}
+{{< tab "YAML" >}}
+```yaml
+
+execution:
+ planner: blink
+ ...
+ current-catalog: myhive # set the HiveCatalog as the current catalog of the session
+ current-database: mydatabase
+
+catalogs:
+ - name: myhive
+ type: hive
+ hive-conf-dir: /opt/hive-conf
+```
+{{< /tab >}}
+{{< tab "SQL" >}}
+```sql
+
+CREATE CATALOG myhive WITH (
+ 'type' = 'hive',
+ 'default-database' = 'mydatabase',
+ 'hive-conf-dir' = '/opt/hive-conf'
+);
+-- set the HiveCatalog as the current catalog of the session
+USE CATALOG myhive;
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The following options are supported when defining a `HiveCatalog` with a YAML file or DDL.
+
+
+
+
+
+| Option | Required | Default | Type | Description |
+| :----- | :------- | :------ | :--- | :---------- |
+| type | Yes | (none) | String | Type of the catalog. Must be set to 'hive' when creating a HiveCatalog. |
+| name | Yes | (none) | String | The unique name of the catalog. Only required when using a YAML file. |
+| hive-conf-dir | No | (none) | String | URI to the directory containing hive-site.xml. The URI needs to be supported by the Hadoop file system. If a relative URI is given, i.e. without a scheme, the local file system is assumed. If the option is not specified, hive-site.xml is searched for on the class path. |
The unique identifier of the record within its shard.
+
+
+
+
+The extended `CREATE TABLE` example demonstrates the syntax for exposing these metadata fields:
+
+```sql
+CREATE TABLE KinesisTable (
+ `user_id` BIGINT,
+ `item_id` BIGINT,
+ `category_id` BIGINT,
+ `behavior` STRING,
+ `ts` TIMESTAMP(3),
+ `arrival_time` TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL,
+ `shard_id` VARCHAR(128) NOT NULL METADATA FROM 'shard-id' VIRTUAL,
+ `sequence_number` VARCHAR(128) NOT NULL METADATA FROM 'sequence-number' VIRTUAL
+)
+PARTITIONED BY (user_id, item_id)
+WITH (
+ 'connector' = 'kinesis',
+ 'stream' = 'user_behavior',
+ 'aws.region' = 'us-east-2',
+ 'scan.stream.initpos' = 'LATEST',
+ 'format' = 'csv'
+);
+```
+
+
+Connector Options
+-----------------
+
+
+
+
+
+| Option | Required | Default | Type | Description |
+| :----- | :------- | :------ | :--- | :---------- |
+| **Common Options** | | | | |
+| connector | required | (none) | String | Specify what connector to use. For Kinesis use 'kinesis'. |
+| stream | required | (none) | String | Name of the Kinesis data stream backing this table. |
+| format | required | (none) | String | The format used to deserialize and serialize Kinesis data stream records. See Data Type Mapping for details. |
+| aws.region | optional | (none) | String | The AWS region where the stream is defined. Either this or aws.endpoint are required. |
+| aws.endpoint | optional | (none) | String | The AWS endpoint for Kinesis (derived from the AWS region setting if not set). Either this or aws.region are required. |
+| **Authentication Options** | | | | |
+| aws.credentials.provider | optional | AUTO | String | A credentials provider to use when authenticating against the Kinesis endpoint. See Authentication for details. |
+| aws.credentials.basic.accesskeyid | optional | (none) | String | The AWS access key ID to use when setting credentials provider type to BASIC. |
+| aws.credentials.basic.secretkey | optional | (none) | String | The AWS secret key to use when setting credentials provider type to BASIC. |
+| aws.credentials.profile.path | optional | (none) | String | Optional configuration for profile path if credential provider type is set to be PROFILE. |
+| aws.credentials.profile.name | optional | (none) | String | Optional configuration for profile name if credential provider type is set to be PROFILE. |
+| aws.credentials.role.arn | optional | (none) | String | The role ARN to use when credential provider type is set to ASSUME_ROLE or WEB_IDENTITY_TOKEN. |
+| aws.credentials.role.sessionName | optional | (none) | String | The role session name to use when credential provider type is set to ASSUME_ROLE or WEB_IDENTITY_TOKEN. |
+| aws.credentials.role.externalId | optional | (none) | String | The external ID to use when credential provider type is set to ASSUME_ROLE. |
+| aws.credentials.role.provider | optional | (none) | String | The credentials provider that provides credentials for assuming the role when credential provider type is set to ASSUME_ROLE. Roles can be nested, so this value can again be set to ASSUME_ROLE. |
+| aws.credentials.webIdentityToken.file | optional | (none) | String | The absolute path to the web identity token file that should be used if provider type is set to WEB_IDENTITY_TOKEN. |
+| **Source Options** | | | | |
+| scan.stream.initpos | optional | LATEST | String | Initial position to be used when reading from the table. See Start Reading Position for details. |
+| scan.stream.initpos-timestamp | optional | (none) | String | The initial timestamp to start reading the Kinesis stream from (when scan.stream.initpos is AT_TIMESTAMP). See Start Reading Position for details. |
+| scan.stream.initpos-timestamp-format | optional | yyyy-MM-dd'T'HH:mm:ss.SSSXXX | String | The date format of the initial timestamp to start reading the Kinesis stream from (when scan.stream.initpos is AT_TIMESTAMP). See Start Reading Position for details. |
+| scan.stream.recordpublisher | optional | POLLING | String | The RecordPublisher type to use for sources. See Enhanced Fan-Out for details. |
+| scan.stream.efo.consumername | optional | (none) | String | The name of the EFO consumer to register with KDS. See Enhanced Fan-Out for details. |
+| scan.stream.efo.registration | optional | LAZY | String | Determine how and when consumer de-/registration is performed (LAZY, EAGER or NONE). See Enhanced Fan-Out for details. |
+| scan.stream.efo.consumerarn | optional | (none) | String | The prefix of consumer ARN for a given stream. See Enhanced Fan-Out for details. |
+| scan.stream.efo.http-client.max-concurrency | optional | 10000 | Integer | Maximum number of allowed concurrent requests for the EFO client. See Enhanced Fan-Out for details. |
+| scan.stream.describe.maxretries | optional | 50 | Integer | The maximum number of describeStream attempts if we get a recoverable exception. |
+| scan.stream.describe.backoff.base | optional | 2000 | Long | The base backoff time (in milliseconds) between each describeStream attempt (for consuming from DynamoDB streams). |
+| scan.stream.describe.backoff.max | optional | 5000 | Long | The maximum backoff time (in milliseconds) between each describeStream attempt (for consuming from DynamoDB streams). |
+| scan.stream.describe.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each describeStream attempt (for consuming from DynamoDB streams). |
+| scan.list.shards.maxretries | optional | 10 | Integer | The maximum number of listShards attempts if we get a recoverable exception. |
+| scan.list.shards.backoff.base | optional | 1000 | Long | The base backoff time (in milliseconds) between each listShards attempt. |
+| scan.list.shards.backoff.max | optional | 5000 | Long | The maximum backoff time (in milliseconds) between each listShards attempt. |
+| scan.list.shards.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each listShards attempt. |
+| scan.stream.describestreamconsumer.maxretries | optional | 50 | Integer | The maximum number of describeStreamConsumer attempts if we get a recoverable exception. |
+| scan.stream.describestreamconsumer.backoff.base | optional | 2000 | Long | The base backoff time (in milliseconds) between each describeStreamConsumer attempt. |
+| scan.stream.describestreamconsumer.backoff.max | optional | 5000 | Long | The maximum backoff time (in milliseconds) between each describeStreamConsumer attempt. |
+| scan.stream.deregisterstreamconsumer.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each deregisterStream attempt. |
+| scan.shard.subscribetoshard.maxretries | optional | 10 | Integer | The maximum number of subscribeToShard attempts if we get a recoverable exception. |
+| scan.shard.subscribetoshard.backoff.base | optional | 1000 | Long | The base backoff time (in milliseconds) between each subscribeToShard attempt. |
+| scan.shard.subscribetoshard.backoff.max | optional | 2000 | Long | The maximum backoff time (in milliseconds) between each subscribeToShard attempt. |
+| scan.shard.subscribetoshard.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each subscribeToShard attempt. |
+| scan.shard.getrecords.maxrecordcount | optional | 10000 | Integer | The maximum number of records to try to get each time we fetch records from an AWS Kinesis shard. |
+| scan.shard.getrecords.maxretries | optional | 3 | Integer | The maximum number of getRecords attempts if we get a recoverable exception. |
+| scan.shard.getrecords.backoff.base | optional | 300 | Long | The base backoff time (in milliseconds) between getRecords attempts if we get a ProvisionedThroughputExceededException. |
+| scan.shard.getrecords.backoff.max | optional | 1000 | Long | The maximum backoff time (in milliseconds) between getRecords attempts if we get a ProvisionedThroughputExceededException. |
+| scan.shard.getrecords.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each getRecords attempt. |
+| scan.shard.getrecords.intervalmillis | optional | 200 | Long | The interval (in milliseconds) between each getRecords request to an AWS Kinesis shard. |
+| scan.shard.getiterator.maxretries | optional | 3 | Integer | The maximum number of getShardIterator attempts if we get ProvisionedThroughputExceededException. |
+| scan.shard.getiterator.backoff.base | optional | 300 | Long | The base backoff time (in milliseconds) between getShardIterator attempts if we get a ProvisionedThroughputExceededException. |
+| scan.shard.getiterator.backoff.max | optional | 1000 | Long | The maximum backoff time (in milliseconds) between getShardIterator attempts if we get a ProvisionedThroughputExceededException. |
+| scan.shard.getiterator.backoff.expconst | optional | 1.5 | Double | The power constant for exponential backoff between each getShardIterator attempt. |
+| scan.shard.discovery.intervalmillis | optional | 10000 | Integer | The interval between each attempt to discover new shards. |
+| scan.shard.adaptivereads | optional | false | Boolean | The config to turn on adaptive reads from a shard. See the AdaptivePollingRecordPublisher documentation for details. |
+| scan.shard.idle.interval | optional | -1 | Long | The interval (in milliseconds) after which to consider a shard idle for purposes of watermark generation. A positive value will allow the watermark to progress even when some shards don't receive new records. |
+| scan.watermark.sync.interval | optional | 30000 | Long | The interval (in milliseconds) for periodically synchronizing the shared watermark state. |
+| scan.watermark.lookahead.millis | optional | 0 | Long | The maximum delta (in milliseconds) allowed for the reader to advance ahead of the shared global watermark. |
+| scan.watermark.sync.queue.capacity | optional | 100 | Integer | The maximum number of records that will be buffered before suspending consumption of a shard. |
+| **Sink Options** | | | | |
+| sink.partitioner | optional | random or row-based | String | Optional output partitioning from Flink's partitions into Kinesis shards. See Sink Partitioning for details. |
+| sink.partitioner-field-delimiter | optional | \| | String | Optional field delimiter for a fields-based partitioner derived from a PARTITION BY clause. See Sink Partitioning for details. |
+| sink.producer.* | optional | (none) | | Sink options for the KinesisProducer. Suffix names must match the KinesisProducerConfiguration setters in lower-case hyphenated style (for example, sink.producer.collection-max-count or sink.producer.aggregation-max-count). The transformed keys of sink.producer.* options are passed to KinesisProducerConfigurations#fromProperties. Note that some of the defaults are overwritten by KinesisConfigUtil. |
+
+
+
+
+
+Features
+--------
+
+### Authorization
+
+Make sure to [create an appropriate IAM policy](https://docs.aws.amazon.com/streams/latest/dev/controlling-access.html) to allow reading from / writing to the Kinesis data streams.
+
+### Authentication
+
+Depending on your deployment you would choose a different Credentials Provider to allow access to Kinesis.
+By default, the `AUTO` Credentials Provider is used.
+If the access key ID and secret key are set in the deployment configuration, this results in using the `BASIC` provider.
+
+A specific [AWSCredentialsProvider](https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/index.html?com/amazonaws/auth/AWSCredentialsProvider.html) can be **optionally** set using the `aws.credentials.provider` setting.
+Supported values are:
+
+* `AUTO` - Use the default AWS Credentials Provider chain that searches for credentials in the following order: `ENV_VARS`, `SYS_PROPS`, `WEB_IDENTITY_TOKEN`, `PROFILE`, and EC2/ECS credentials provider.
+* `BASIC` - Use access key ID and secret key supplied as configuration.
+* `ENV_VAR` - Use `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` environment variables.
+* `SYS_PROP` - Use Java system properties `aws.accessKeyId` and `aws.secretKey`.
+* `PROFILE` - Use an AWS credentials profile to create the AWS credentials.
+* `ASSUME_ROLE` - Create AWS credentials by assuming a role. The credentials for assuming the role must be supplied.
+* `WEB_IDENTITY_TOKEN` - Create AWS credentials by assuming a role using Web Identity Token.
+
+### Start Reading Position
+
+You can configure table sources to start reading a table-backing Kinesis data stream from a specific position through the `scan.stream.initpos` option.
+Available values are:
+
+* `LATEST`: read shards starting from the latest record.
+* `TRIM_HORIZON`: read shards starting from the earliest record possible (data may be trimmed by Kinesis depending on the current retention settings of the backing stream).
+* `AT_TIMESTAMP`: read shards starting from a specified timestamp. The timestamp value should be specified through the `scan.stream.initpos-timestamp` in one of the following formats:
+ * A non-negative double value representing the number of seconds that has elapsed since the Unix epoch (for example, `1459799926.480`).
+ * A value conforming to a user-defined `SimpleDateFormat` specified at `scan.stream.initpos-timestamp-format`.
+ If a user does not define a format, the default pattern will be `yyyy-MM-dd'T'HH:mm:ss.SSSXXX`.
+ For example, a timestamp value of `2016-04-04` with the user-defined format `yyyy-MM-dd`, or a timestamp value of `2016-04-04T19:58:46.480-00:00` without a user-defined format; a configuration sketch follows this list.
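+
+A sketch of the `AT_TIMESTAMP` configuration (the table name, column list, and stream are placeholders reusing the earlier example):
+
+```sql
+CREATE TABLE KinesisTable (
+  `user_id` BIGINT,
+  `behavior` STRING
+) WITH (
+  'connector' = 'kinesis',
+  'stream' = 'user_behavior',
+  'aws.region' = 'us-east-2',
+  'format' = 'csv',
+  'scan.stream.initpos' = 'AT_TIMESTAMP',
+  'scan.stream.initpos-timestamp-format' = 'yyyy-MM-dd',
+  'scan.stream.initpos-timestamp' = '2016-04-04'
+);
+```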
+
+### Sink Partitioning
+
+Kinesis data streams consist of one or more shards, and the `sink.partitioner` option allows you to control how records written into a multi-shard Kinesis-backed table will be partitioned between its shards.
+Valid values are:
+
+* `fixed`: Kinesis `PartitionKey` values derived from the Flink subtask index, so each Flink partition ends up in at most one Kinesis partition (assuming that no re-sharding takes place at runtime).
+* `random`: Kinesis `PartitionKey` values are assigned randomly. This is the default value for tables not defined with a `PARTITION BY` clause.
+* Custom `FixedKinesisPartitioner` subclass: e.g. `'org.mycompany.MyPartitioner'`.
+
+{{< hint info >}}
+Records written into tables defining a `PARTITION BY` clause will always be partitioned based on a concatenated projection of the `PARTITION BY` fields.
+In this case, the `sink.partitioner` field cannot be used to modify this behavior (attempting to do this results in a configuration error).
+You can, however, use the `sink.partitioner-field-delimiter` option to set the delimiter of field values in the concatenated [PartitionKey](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_PutRecord.html#Streams-PutRecord-request-PartitionKey) string (an empty string is also a valid delimiter).
+{{< /hint >}}
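+
+A sketch of a sink table using the `fixed` partitioner together with a pass-through producer option (the table and stream names are placeholders; `sink.producer.collection-max-count` is only one example of such an option):
+
+```sql
+CREATE TABLE KinesisOutputTable (
+  `user_id` BIGINT,
+  `message` STRING
+) WITH (
+  'connector' = 'kinesis',
+  'stream' = 'output_stream',
+  'aws.region' = 'us-east-2',
+  'format' = 'csv',
+  'sink.partitioner' = 'fixed',
+  'sink.producer.collection-max-count' = '100'
+);
+```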
+
+### Enhanced Fan-Out
+
+[Enhanced Fan-Out (EFO)](https://aws.amazon.com/blogs/aws/kds-enhanced-fanout/) increases the maximum number of concurrent consumers per Kinesis data stream.
+Without EFO, all concurrent Kinesis consumers share a single read quota per shard.
+Using EFO, each consumer gets a distinct dedicated read quota per shard, allowing read throughput to scale with the number of consumers.
+
+**Note:** Using EFO will [incur additional cost](https://aws.amazon.com/kinesis/data-streams/pricing/).
+
+You can enable and configure EFO with the following properties; a configuration sketch follows the list:
+
+* `scan.stream.recordpublisher`: Determines whether to use `EFO` or `POLLING`.
+* `scan.stream.efo.consumername`: A name to identify the consumer when the above value is `EFO`.
+* `scan.stream.efo.registration`: Strategy for (de-)registration of `EFO` consumers with the name given by the `scan.stream.efo.consumername` value. Valid strategies are:
+ * `LAZY` (default): Stream consumers are registered when the Flink job starts running.
+ If the stream consumer already exists, it will be reused.
+ This is the preferred strategy for the majority of applications.
+ However, jobs with parallelism greater than 1 will result in tasks competing to register and acquire the stream consumer ARN.
+ For jobs with very large parallelism this can result in an increased start-up time.
+ The describe operation has a limit of 20 [transactions per second](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_DescribeStreamConsumer.html),
+ this means application startup time will increase by roughly `parallelism/20 seconds`.
+ * `EAGER`: Stream consumers are registered in the `FlinkKinesisConsumer` constructor.
+ If the stream consumer already exists, it will be reused.
+ This will result in registration occurring when the job is constructed,
+ either on the Flink Job Manager or client environment submitting the job.
+ Using this strategy results in a single thread registering and retrieving the stream consumer ARN,
+ reducing startup time over `LAZY` (with large parallelism).
+ However, consider that the client environment will require access to the AWS services.
+ * `NONE`: Stream consumer registration is not performed by `FlinkKinesisConsumer`.
+ Registration must be performed externally using the [AWS CLI or SDK](https://aws.amazon.com/tools/)
+ to invoke [RegisterStreamConsumer](https://docs.aws.amazon.com/kinesis/latest/APIReference/API_RegisterStreamConsumer.html).
+ Stream consumer ARNs should be provided to the job via the consumer configuration.
+* `scan.stream.efo.consumerarn.<stream-name>`: ARNs identifying externally registered ARN-consumers (substitute `<stream-name>` with the name of your stream in the parameter name).
+  Use this if you choose to use `NONE` as a `scan.stream.efo.registration` strategy.
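+
+A sketch of a table with EFO enabled (the table, stream, and consumer names are placeholders):
+
+```sql
+CREATE TABLE KinesisTable (
+  `user_id` BIGINT,
+  `behavior` STRING
+) WITH (
+  'connector' = 'kinesis',
+  'stream' = 'user_behavior',
+  'aws.region' = 'us-east-2',
+  'format' = 'csv',
+  'scan.stream.recordpublisher' = 'EFO',
+  'scan.stream.efo.consumername' = 'my-flink-efo-consumer'
+);
+```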
+
+**Note:** For a given Kinesis data stream, each EFO consumer must have a unique name.
+However, consumer names do not have to be unique across data streams.
+Reusing a consumer name will result in existing subscriptions being terminated.
+
+**Note:** With the `LAZY` and `EAGER` strategies, stream consumers are de-registered when the job is shut down gracefully.
+In the event that a job terminates without executing the shutdown hooks, stream consumers will remain active.
+In this situation the stream consumers will be gracefully reused when the application restarts.
+With the `NONE` strategy, stream consumer de-registration is not performed by `FlinkKinesisConsumer`.
+
+Data Type Mapping
+----------------
+
+Kinesis stores records as Base64-encoded binary data objects, so it doesn't have a notion of internal record structure.
+Instead, Kinesis records are deserialized and serialized by formats, e.g. 'avro', 'csv', or 'json'.
+To determine the data type of the messages in your Kinesis-backed tables, pick a suitable Flink format with the `format` keyword.
+Please refer to the [Formats]({{< ref "docs/connectors/table/formats/overview" >}}) pages for more details.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/table/overview.md b/docs/content.zh/docs/connectors/table/overview.md
new file mode 100644
index 0000000000000..51fce6ddcfbf6
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/overview.md
@@ -0,0 +1,356 @@
+---
+title: "Overview"
+weight: 1
+type: docs
+aliases:
+ - /zh/dev/table/connectors/
+---
+
+
+# Table & SQL Connectors
+
+
+Flink's Table API & SQL programs can be connected to other external systems for reading and writing both batch and streaming tables. A table source provides access to data which is stored in external systems (such as a database, key-value store, message queue, or file system). A table sink emits a table to an external storage system. Depending on the type of source and sink, they support different formats such as CSV, Avro, Parquet, or ORC.
+
+This page describes how to register table sources and table sinks in Flink using the natively supported connectors. After a source or sink has been registered, it can be accessed by Table API & SQL statements.
+
+If you want to implement your own *custom* table source or sink, have a look at the [user-defined sources & sinks page]({{< ref "docs/dev/table/sourcessinks" >}}).
+
+Supported Connectors
+------------
+
+Flink natively supports various connectors. The following tables list all available connectors.
+
+
+
+{{< top >}}
+
+How to use connectors
+--------
+
+Flink supports using SQL `CREATE TABLE` statements to register tables. One can define the table name,
+the table schema, and the table options for connecting to an external system.
+
+See the [SQL section for more information about creating a table]({{< ref "docs/dev/table/sql/create" >}}#create-table).
+
+The following code shows a full example of how to connect to Kafka for reading and writing JSON records.
+
+{{< tabs "6d4f00e3-0a94-4ebd-b6b5-c5171851b500" >}}
+{{< tab "SQL" >}}
+```sql
+CREATE TABLE MyUserTable (
+ -- declare the schema of the table
+ `user` BIGINT,
+ `message` STRING,
+ `rowtime` TIMESTAMP(3) METADATA FROM 'timestamp', -- use a metadata column to access Kafka's record timestamp
+ `proctime` AS PROCTIME(), -- use a computed column to define a proctime attribute
+ WATERMARK FOR `rowtime` AS `rowtime` - INTERVAL '5' SECOND -- use a WATERMARK statement to define a rowtime attribute
+) WITH (
+ -- declare the external system to connect to
+ 'connector' = 'kafka',
+ 'topic' = 'topic_name',
+ 'scan.startup.mode' = 'earliest-offset',
+ 'properties.bootstrap.servers' = 'localhost:9092',
+ 'format' = 'json' -- declare a format for this system
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The desired connection properties are converted into string-based key-value pairs. [Factories]({{< ref "docs/dev/table/sourcessinks" >}})
+will create configured table sources, table sinks, and corresponding formats from the key-value pairs
+based on factory identifiers (`kafka` and `json` in this example). All factories that can be found via
+Java's [Service Provider Interfaces (SPI)](https://docs.oracle.com/javase/tutorial/sound/SPI-intro.html)
+are taken into account when searching for exactly one matching factory for each component.
+
+If no factory can be found or multiple factories match for the given properties, an exception will be
+thrown with additional information about considered factories and supported properties.
+
+
+Transform table connector/format resources
+--------
+
+Flink uses Java's [Service Provider Interfaces (SPI)](https://docs.oracle.com/javase/tutorial/sound/SPI-intro.html) to load the table connector/format factories by their identifiers. Since the SPI resource file named `org.apache.flink.table.factories.Factory` for every table connector/format is under the same directory `META-INF/services`, these resource files will override each other when building the uber-jar of a project that uses more than one table connector/format, which will cause Flink to fail to load the table connector/format factories.
+
+In this situation, the recommended way is to merge these resource files under the directory `META-INF/services` with the [ServicesResourceTransformer](https://maven.apache.org/plugins/maven-shade-plugin/examples/resource-transformers.html) of the Maven Shade plugin. Consider the following example pom.xml of a project that contains the connector `flink-sql-connector-hive-3.1.2` and the format `flink-parquet`.
+
+```xml
+<modelVersion>4.0.0</modelVersion>
+<groupId>org.example</groupId>
+<artifactId>myProject</artifactId>
+<version>1.0-SNAPSHOT</version>
+
+<dependencies>
+    <!-- other project dependencies ... -->
+    <dependency>
+        <groupId>org.apache.flink</groupId>
+        <artifactId>flink-sql-connector-hive-3.1.2_{{< scala_version >}}</artifactId>
+        <version>{{< version >}}</version>
+    </dependency>
+
+    <dependency>
+        <groupId>org.apache.flink</groupId>
+        <artifactId>flink-parquet_{{< scala_version >}}</artifactId>
+        <version>{{< version >}}</version>
+    </dependency>
+</dependencies>
+
+<build>
+    <plugins>
+        <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-shade-plugin</artifactId>
+            <executions>
+                <execution>
+                    <id>shade</id>
+                    <phase>package</phase>
+                    <goals>
+                        <goal>shade</goal>
+                    </goals>
+                    <configuration>
+                        <transformers combine.children="append">
+                            <!-- The ServicesResourceTransformer merges META-INF/services resource files -->
+                            <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                        </transformers>
+                    </configuration>
+                </execution>
+            </executions>
+        </plugin>
+    </plugins>
+</build>
+```
+
+After configuring the `ServicesResourceTransformer`, the table connector/format resource files under the directory `META-INF/services` are merged rather than overwritten when building the uber-jar of the above project.
+
+{{< top >}}
+
+Schema Mapping
+------------
+
+The body clause of a SQL `CREATE TABLE` statement defines the names and types of physical columns,
+constraints and watermarks. Flink doesn't hold the data, thus the schema definition only declares how
+to map physical columns from an external system to Flink's representation. The mapping is not necessarily
+done by field name; it depends on the implementation of the format and connector. For example, a MySQL database
+table is mapped by field name (not case sensitive), while a CSV filesystem is mapped by field order
+(field names can be arbitrary). This is explained for every connector.
+
+The following example shows a simple schema without time attributes and one-to-one field mapping
+of input/output to table columns.
+
+{{< tabs "0c267c40-32ef-4a00-b4eb-fa39bfe3f14d" >}}
+{{< tab "SQL" >}}
+```sql
+CREATE TABLE MyTable (
+ MyField1 INT,
+ MyField2 STRING,
+ MyField3 BOOLEAN
+) WITH (
+ ...
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Metadata
+
+Some connectors and formats expose additional metadata fields that can be accessed in metadata columns
+next to the physical payload columns. See the [`CREATE TABLE` section]({{< ref "docs/dev/table/sql/create" >}}#columns)
+for more information about metadata columns.
+
+### Primary Key
+
+Primary key constraints declare that a column or a set of columns of a table is unique and does not contain nulls. A primary key uniquely identifies a row in a table.
+
+The primary key of a source table is metadata information used for optimization. The primary key of a sink table is usually used by the sink implementation for upserting.
+
+The SQL standard specifies that a constraint can either be ENFORCED or NOT ENFORCED, which controls whether the constraint checks are performed on the incoming/outgoing data. Since Flink does not own the data, the only mode we support is NOT ENFORCED. It is up to the user to ensure that the query enforces key integrity.
+
+{{< tabs "9e32660c-868b-4b6a-9632-3b3ea482fe7d" >}}
+{{< tab "SQL" >}}
+```sql
+CREATE TABLE MyTable (
+ MyField1 INT,
+ MyField2 STRING,
+ MyField3 BOOLEAN,
+ PRIMARY KEY (MyField1, MyField2) NOT ENFORCED -- defines a primary key on columns
+) WITH (
+ ...
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+### Time Attributes
+
+Time attributes are essential when working with unbounded streaming tables. Therefore both proctime and rowtime attributes can be defined as part of the schema.
+
+For more information about time handling in Flink and especially event-time, we recommend the general [event-time section]({{< ref "docs/dev/table/concepts/time_attributes" >}}).
+
+#### Proctime Attributes
+
+In order to declare a proctime attribute in the schema, you can use [Computed Column syntax]({{< ref "docs/dev/table/sql/create" >}}#create-table) to declare a computed column which is generated from `PROCTIME()` builtin function.
+The computed column is a virtual column which is not stored in the physical data.
+
+{{< tabs "5d1f475b-a002-4e85-84f4-00ab0a55a548" >}}
+{{< tab "SQL" >}}
+```sql
+CREATE TABLE MyTable (
+ MyField1 INT,
+ MyField2 STRING,
+ MyField3 BOOLEAN,
+ MyField4 AS PROCTIME() -- declares a proctime attribute
+) WITH (
+ ...
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+#### Rowtime Attributes
+
+In order to control the event-time behavior for tables, Flink provides predefined timestamp extractors and watermark strategies.
+
+Please refer to [CREATE TABLE statements]({{< ref "docs/dev/table/sql/create" >}}#create-table) for more information about defining time attributes in DDL.
+
+The following timestamp extractors are supported:
+
+{{< tabs "b40272ba-b259-4a26-9651-815006b283e7" >}}
+{{< tab "DDL" >}}
+```sql
+-- use the existing TIMESTAMP(3) field in schema as the rowtime attribute
+CREATE TABLE MyTable (
+ ts_field TIMESTAMP(3),
+ WATERMARK FOR ts_field AS ...
+) WITH (
+ ...
+)
+
+-- use system functions or UDFs or expressions to extract the expected TIMESTAMP(3) rowtime field
+CREATE TABLE MyTable (
+ log_ts STRING,
+ ts_field AS TO_TIMESTAMP(log_ts),
+ WATERMARK FOR ts_field AS ...
+) WITH (
+ ...
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+The following watermark strategies are supported:
+
+{{< tabs "e004ebfb-75b1-4d81-80ff-ac5420744b75" >}}
+{{< tab "DDL" >}}
+```sql
+-- Sets a watermark strategy for strictly ascending rowtime attributes. Emits a watermark of the
+-- maximum observed timestamp so far. Rows that have a timestamp bigger than the max timestamp
+-- are not late.
+CREATE TABLE MyTable (
+ ts_field TIMESTAMP(3),
+ WATERMARK FOR ts_field AS ts_field
+) WITH (
+ ...
+)
+
+-- Sets a watermark strategy for ascending rowtime attributes. Emits a watermark of the maximum
+-- observed timestamp so far minus 1. Rows that have a timestamp bigger or equal to the max timestamp
+-- are not late.
+CREATE TABLE MyTable (
+ ts_field TIMESTAMP(3),
+ WATERMARK FOR ts_field AS ts_field - INTERVAL '0.001' SECOND
+) WITH (
+ ...
+)
+
+-- Sets a watermark strategy for rowtime attributes which are out-of-order by a bounded time interval.
+-- Emits watermarks which are the maximum observed timestamp minus the specified delay, e.g. 2 seconds.
+CREATE TABLE MyTable (
+ ts_field TIMESTAMP(3),
+ WATERMARK FOR ts_field AS ts_field - INTERVAL '2' SECOND
+) WITH (
+ ...
+)
+```
+{{< /tab >}}
+{{< /tabs >}}
+
+Make sure to always declare both timestamps and watermarks. Watermarks are required for triggering time-based operations.
+
+### SQL Types
+
+Please see the [Data Types]({{< ref "docs/dev/table/types" >}}) page about how to declare a type in SQL.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/connectors/table/print.md b/docs/content.zh/docs/connectors/table/print.md
new file mode 100644
index 0000000000000..6354389d70705
--- /dev/null
+++ b/docs/content.zh/docs/connectors/table/print.md
@@ -0,0 +1,146 @@
+---
+title: Print
+weight: 14
+type: docs
+aliases:
+ - /zh/dev/table/connectors/print.html
+---
+
+
+# Print SQL 连接器
+
+{{< label "Sink" >}}
+
+Print 连接器允许将每一行写入标准输出流或者标准错误流。
+
+设计目的:
+
+- 简单的流作业测试。
+- 对生产调试带来极大便利。
+
+四种 format 选项:
+
+
+| 参数 | 是否必选 | 默认值 | 类型 | 描述 |
+| :--- | :------ | :----- | :--- | :--- |
+| key.fields-prefix | 可选 | (none) | String | Defines a custom prefix for all fields of the key format to avoid name clashes with fields of the value format. By default, the prefix is empty. If a custom prefix is defined, both the table schema and 'key.fields' will work with prefixed names. When constructing the data type of the key format, the prefix will be removed and the non-prefixed names will be used within the key format. Please note that this option requires that 'value.fields-include' must be set to 'EXCEPT_KEY'. |
+| value.format | 必选 | (none) | String | 用于对 Kafka 消息中 value 部分序列化和反序列化的格式。支持的格式包括 'csv'、'json'、'avro'。请参考[格式]({{< ref "docs/connectors/table/formats/overview" >}})页面以获取更多详细信息和格式参数。 |
+| value.fields-include | 必选 | 'ALL' | String | 控制哪些字段应该出现在 value 中。可取值:ALL 表示消息的 value 部分将包含 schema 中所有的字段,包括定义为主键的字段;EXCEPT_KEY 表示记录的 value 部分包含 schema 的所有字段,定义为主键的字段除外。 |
+
+#### 自定义脚本
+
+你可以提供一个自定义的发现脚本来满足你的特殊需求,例如使用 AMD GPU。请确保自定义脚本的路径配置正确(`external-resource.<resource_name>.param.discovery-script.path`)并且 Flink 可以访问。自定义的发现脚本需要满足以下约定(本节末尾给出一个示意脚本):
+
+  - `GPUDriver` 将 GPU 数量(由 `external-resource.<resource_name>.amount` 定义)作为第一个参数传递到脚本中。
+    `external-resource.<resource_name>.param.discovery-script.args` 中自定义的参数会被附加在后面。
+
+  - 脚本需返回可用 GPU 索引的列表,用逗号分隔。空白的索引将被忽略。
+
+  - 脚本可以通过以非零值退出来表示其未正确执行。在这种情况下,算子将不会得到 GPU 资源相关信息。
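+
+下面是一个满足上述约定的示意脚本(非官方实现;其中的 `my-gpu-tool` 是假设存在的命令,实际环境中请替换为 `rocm-smi` 等真实工具):
+
+```bash
+#!/usr/bin/env bash
+# 第一个参数:external-resource.<resource_name>.amount 指定的 GPU 数量
+AMOUNT=$1
+
+# 用假设的 my-gpu-tool 列出可用 GPU 索引,例如输出 "0,1,2,3"
+AVAILABLE=$(my-gpu-tool --list-indexes)
+COUNT=$(echo "$AVAILABLE" | tr ',' '\n' | grep -c .)
+
+# 可用数量不足时以非零值退出,算子将得不到 GPU 信息
+if [ "$COUNT" -lt "$AMOUNT" ]; then
+  exit 1
+fi
+
+# 输出前 AMOUNT 个索引,逗号分隔
+echo "$AVAILABLE" | cut -d',' -f1-"$AMOUNT"
+```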
diff --git a/docs/content.zh/docs/deployment/advanced/historyserver.md b/docs/content.zh/docs/deployment/advanced/historyserver.md
new file mode 100644
index 0000000000000..27944bd4d9441
--- /dev/null
+++ b/docs/content.zh/docs/deployment/advanced/historyserver.md
@@ -0,0 +1,109 @@
+---
+title: "History Server"
+weight: 3
+type: docs
+aliases:
+ - /zh/deployment/advanced/historyserver.html
+ - /zh/monitoring/historyserver.html
+---
+
+
+# History Server
+
+Flink 提供了 history server,可以在相应的 Flink 集群关闭之后查询已完成作业的统计信息。
+
+此外,它暴露了一套 REST API,该 API 接受 HTTP 请求并返回 JSON 格式的数据。
+
+
+
+
+
+## 概览
+
+HistoryServer 允许查询 JobManager 存档的已完成作业的状态和统计信息。
+
+在配置 HistoryServer *和* JobManager 之后,你可以使用相应的脚本来启动和停止 HistoryServer:
+
+```shell
+# 启动或者停止 HistoryServer
+bin/historyserver.sh (start|start-foreground|stop)
+```
+
+默认情况下,此服务器绑定到 `localhost` 的 `8082` 端口。
+
+目前,只能将 HistoryServer 作为独立的进程运行。
+
+
+
+## 配置参数
+
+配置项 `jobmanager.archive.fs.dir` 和 `historyserver.archive.fs.refresh-interval` 需要根据 `作业存档目录` 和 `刷新作业存档目录的时间间隔` 进行调整。
+
+**JobManager**
+
+已完成作业的存档在 JobManager 上进行,将已存档的作业信息上传到文件系统目录中。你可以在 `flink-conf.yaml` 文件中通过 `jobmanager.archive.fs.dir` 设置一个目录存档已完成的作业。
+
+```yaml
+# 上传已完成作业信息的目录
+jobmanager.archive.fs.dir: hdfs:///completed-jobs
+```
+
+**HistoryServer**
+
+可以通过 `historyserver.archive.fs.dir` 设置 HistoryServer 监视以逗号分隔的目录列表。定期轮询已配置的目录以查找新的存档;轮询间隔可以通过 `historyserver.archive.fs.refresh-interval` 来配置。
+
+```yaml
+# 监视以下目录中已完成的作业
+historyserver.archive.fs.dir: hdfs:///completed-jobs
+
+# 每 10 秒刷新一次
+historyserver.archive.fs.refresh-interval: 10000
+```
+
+所包含的存档被下载缓存在本地文件系统中。本地目录通过 `historyserver.web.tmpdir` 配置。
+
+请查看配置页面以获取[配置选项的完整列表]({{< ref "docs/deployment/config" >}}#history-server)。
+
+
+
+## 可用的请求
+
+以下是可用且带有示例 JSON 响应的请求列表。所有请求格式样例均为 `http://hostname:8082/jobs`,下面我们仅列出了 URLs 的 *path* 部分。
+尖括号中的值为变量,例如作业 `7684be6004e4e955c2a558a9bc463f65` 的
+`http://hostname:port/jobs/<jobid>/exceptions` 请求须写为 `http://hostname:port/jobs/7684be6004e4e955c2a558a9bc463f65/exceptions`。列表之后给出了一个使用 curl 调用这些接口的示例。
+
+ - `/config`
+ - `/jobs/overview`
+ - `/jobs/<jobid>`
+ - `/jobs/<jobid>/vertices`
+ - `/jobs/<jobid>/config`
+ - `/jobs/<jobid>/exceptions`
+ - `/jobs/<jobid>/accumulators`
+ - `/jobs/<jobid>/vertices/<vertexid>`
+ - `/jobs/<jobid>/vertices/<vertexid>/subtasktimes`
+ - `/jobs/<jobid>/vertices/<vertexid>/taskmanagers`
+ - `/jobs/<jobid>/vertices/<vertexid>/accumulators`
+ - `/jobs/<jobid>/vertices/<vertexid>/subtasks/accumulators`
+ - `/jobs/<jobid>/vertices/<vertexid>/subtasks/<subtasknum>`
+ - `/jobs/<jobid>/vertices/<vertexid>/subtasks/<subtasknum>/attempts/<attempt>`
+ - `/jobs/<jobid>/vertices/<vertexid>/subtasks/<subtasknum>/attempts/<attempt>/accumulators`
+ - `/jobs/<jobid>/plan`
+
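+例如,可以用 curl 直接访问这些接口(主机名与端口仅为示例,以实际配置为准):
+
+```bash
+# 查询所有已存档作业的概览
+curl http://localhost:8082/jobs/overview
+
+# 查询某个已完成作业的异常信息
+curl http://localhost:8082/jobs/7684be6004e4e955c2a558a9bc463f65/exceptions
+```
+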
+{{< top >}}
diff --git a/docs/content.zh/docs/deployment/advanced/logging.md b/docs/content.zh/docs/deployment/advanced/logging.md
new file mode 100644
index 0000000000000..e276ffca8fd72
--- /dev/null
+++ b/docs/content.zh/docs/deployment/advanced/logging.md
@@ -0,0 +1,126 @@
+---
+title: 日志
+weight: 4
+type: docs
+aliases:
+ - /zh/deployment/advanced/logging.html
+ - /zh/monitoring/logging.html
+---
+
+
+# 如何使用日志记录
+
+Flink 中的日志记录是使用 slf4j 日志接口实现的。使用 log4j2 作为底层日志框架。我们也支持了 logback 日志配置,只要将其配置文件作为参数传递给 JVM 即可。愿意使用 logback 而不是 log4j2 的用户只需排除 log4j2 的依赖(或从 lib/ 文件夹中删除它)即可。
+
+
+
+
+
+## 配置 Log4j2
+
+Log4j2 是使用配置文件指定的。在 Flink 的使用中,该文件通常命名为 `log4j.properties`。我们使用 `-Dlog4j.configurationFile=<file>` 参数将该文件的文件名和位置传递给 JVM。
+
+Flink 附带以下默认日志配置文件:
+
+- `log4j-cli.properties`:由 Flink 命令行客户端使用(例如 `flink run`)(不包括在集群上执行的代码)
+- `log4j-session.properties`:Flink 命令行客户端在启动 YARN 或 Kubernetes session 时使用(`yarn-session.sh`,`kubernetes-session.sh`)
+- `log4j.properties`:作为 JobManager/TaskManager 日志配置使用(standalone 和 YARN 两种模式下皆使用)
+
+
+
+### 与 Log4j1 的兼容性
+
+Flink 附带了 [Log4j API bridge](https://logging.apache.org/log4j/log4j-2.2/log4j-1.2-api/index.html),使得现有作业能够继续使用 log4j1 的接口。
+
+如果你有基于 Log4j 的自定义配置文件或代码,请查看官方 Log4j [兼容性](https://logging.apache.org/log4j/2.x/manual/compatibility.html)和[迁移](https://logging.apache.org/log4j/2.x/manual/migration.html)指南。
+
+
+
+## 配置 Log4j1
+
+要将 Flink 与 Log4j1 一起使用,必须确保:
+- Classpath 中不存在 `org.apache.logging.log4j:log4j-core`,`org.apache.logging.log4j:log4j-slf4j-impl` 和 `org.apache.logging.log4j:log4j-1.2-api`,
+- 且 Classpath 中存在 `log4j:log4j`,`org.slf4j:slf4j-log4j12`,`org.apache.logging.log4j:log4j-to-slf4j` 和 `org.apache.logging.log4j:log4j-api`。
+
+在 IDE 中使用 log4j1,你必须在 pom 文件中使用上述 `Classpath 中存在的 jars` 依赖项替换 `Classpath 中不存在的 jars` 依赖项,并尽可能在传递依赖于 `Classpath 中不存在的 jars` 的依赖项上添加排除 `Classpath 中不存在的 jars` 配置。
+
+对于 Flink 发行版,这意味着你必须
+- 从 `lib` 目录中移除 `log4j-core`,`log4j-slf4j-impl` 和 `log4j-1.2-api` jars,
+- 向 `lib` 目录中添加 `log4j`,`slf4j-log4j12` 和 `log4j-to-slf4j` jars,
+- 用兼容的 Log4j1 版本替换 `conf` 目录中的所有 log4j 配置文件。
+
+
+
+## 配置 logback
+
+对于用户和开发人员来说,控制日志框架非常重要。日志框架的配置完全由配置文件完成。必须通过设置环境参数 `-Dlogback.configurationFile=<file>` 或将 `logback.xml` 放在 classpath 中来指定配置文件。`conf` 目录包含一个 `logback.xml` 文件,该文件可以修改,如果使用附带的启动脚本在 IDE 之外启动 Flink 则会使用该日志配置文件。提供的 `logback.xml` 具有以下格式:
+
+```xml
+<configuration>
+    <appender name="file" class="ch.qos.logback.core.FileAppender">
+        <file>${log.file}</file>
+        <append>false</append>
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <root level="INFO">
+        <appender-ref ref="file"/>
+    </root>
+</configuration>
+```
+
+例如,为了控制 `org.apache.flink.runtime.jobgraph.JobGraph` 的日志记录级别,必须将以下行添加到配置文件中。
+
+```xml
+<logger name="org.apache.flink.runtime.jobgraph.JobGraph" level="DEBUG"/>
+```
+
+有关配置日志的更多信息,请参见 [LOGback 手册](http://logback.qos.ch/manual/configuration.html)。
+
+
+
+## 开发人员的最佳实践
+
+Slf4j 的 loggers 通过调用 `LoggerFactory` 的 `getLogger()` 方法创建,例如:
+
+```java
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+Logger LOG = LoggerFactory.getLogger(Foobar.class);
+```
+
+为了最大限度地利用 slf4j,建议使用其占位符机制。使用占位符可以避免不必要的字符串构造,以防日志级别设置得太高而不会记录消息。占位符的语法如下:
+
+```java
+LOG.info("This message contains {} placeholders. {}", 2, "Yippie");
+```
+
+占位符也可以和要记录的异常一起使用。
+
+```java
+catch (Exception exception) {
+ LOG.error("An {} occurred.", "error", exception);
+}
+```
+
+{{< top >}}
diff --git a/docs/content.zh/docs/deployment/cli.md b/docs/content.zh/docs/deployment/cli.md
new file mode 100644
index 0000000000000..13d8bb603ad46
--- /dev/null
+++ b/docs/content.zh/docs/deployment/cli.md
@@ -0,0 +1,503 @@
+---
+title: 命令行界面
+weight: 5
+type: docs
+aliases:
+ - /zh/deployment/cli.html
+ - /zh/apis/cli.html
+---
+
+
+# 命令行界面
+
+Flink provides a Command-Line Interface (CLI) `bin/flink` to run programs that
+are packaged as JAR files and to control their execution. The CLI is part of any
+Flink setup, available in local single node setups and in distributed setups.
+It connects to the running JobManager specified in `conf/flink-conf.yaml`.
+
+
+
+## Job Lifecycle Management
+
+A prerequisite for the commands listed in this section to work is to have a running Flink deployment
+like [Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}),
+[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) or any other option available. Feel free to
+[start a Flink cluster locally]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}#starting-a-standalone-cluster-session-mode)
+to try the commands on your own machine.
+
+### Submitting a Job
+
+Submitting a job means uploading the job's JAR and related dependencies to the Flink cluster and
+initiating the job execution. For the sake of this example, we select a long-running job like
+`examples/streaming/StateMachineExample.jar`. Feel free to select any other JAR archive from the
+`examples/` folder or deploy your own job.
+```bash
+$ ./bin/flink run \
+ --detached \
+ ./examples/streaming/StateMachineExample.jar
+```
+Submitting the job using `--detached` will make the command return after the submission is done.
+The output contains (besides other things) the ID of the newly submitted job.
+```
+Usage with built-in data generator: StateMachineExample [--error-rate <probability-of-an-error>] [--sleep <sleep-per-record-in-ms>]
+Usage with Kafka: StateMachineExample --kafka-topic <topic> [--brokers <brokers>]
+Options for both the above setups:
+ [--backend <file|rocks>]
+ [--checkpoint-dir <filepath>]
+ [--async-checkpoints <true|false>]
+ [--incremental-checkpoints <true|false>]
+ [--output <filepath> OR null for stdout]
+
+Using standalone source with error rate 0.000000 and sleep delay 1 millis
+
+Job has been submitted with JobID cca7bc1061d61cf15238e92312c2fc20
+```
+The usage information printed lists job-related parameters that can be added to the end of the job
+submission command if necessary. For the purpose of readability, we assume that the returned JobID is
+stored in a variable `JOB_ID` for the commands below:
+```bash
+$ export JOB_ID="cca7bc1061d61cf15238e92312c2fc20"
+```
+
+There is another action called `run-application` available to run the job in
+[Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode). This documentation does not address
+this action individually as it works similarly to the `run` action in terms of the CLI frontend.
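+
+A minimal sketch of such a submission, assuming a native Kubernetes cluster and a job JAR that is
+already available inside the container image (the cluster ID and JAR path below are placeholders):
+
+```bash
+$ ./bin/flink run-application \
+    --target kubernetes-application \
+    -Dkubernetes.cluster-id=<ClusterId> \
+    local:///opt/flink/examples/streaming/StateMachineExample.jar
+```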
+
+### Job Monitoring
+
+You can monitor any running jobs using the `list` action:
+```bash
+$ ./bin/flink list
+```
+```
+Waiting for response...
+------------------ Running/Restarting Jobs -------------------
+30.11.2020 16:02:29 : cca7bc1061d61cf15238e92312c2fc20 : State machine job (RUNNING)
+--------------------------------------------------------------
+No scheduled jobs.
+```
+Jobs that have been submitted but not yet started would be listed under "Scheduled Jobs".
+
+### Creating a Savepoint
+
+[Savepoints]({{< ref "docs/ops/state/savepoints" >}}) can be created to save the current state a job is
+in. All that's needed is the JobID:
+```bash
+$ ./bin/flink savepoint \
+ $JOB_ID \
+ /tmp/flink-savepoints
+```
+```
+Triggering savepoint for job cca7bc1061d61cf15238e92312c2fc20.
+Waiting for response...
+Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab
+You can resume your program from this savepoint with the run command.
+```
+The savepoint folder is optional and needs to be specified if
+[state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) isn't set.
+
+The path to the savepoint can be used later on to [restart the Flink job](#starting-a-job-from-a-savepoint).
+
+#### Disposing a Savepoint
+
+The `savepoint` action can be also used to remove savepoints. `--dispose` with the corresponding
+savepoint path needs to be added:
+```bash
+$ ./bin/flink savepoint \
+ --dispose \
+ /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \
+ $JOB_ID
+```
+```
+Disposing savepoint '/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab'.
+Waiting for response...
+Savepoint '/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab' disposed.
+```
+
+If you use custom state instances (for example custom reducing state or RocksDB state), you have to
+specify the path to the program JAR with which the savepoint was triggered. Otherwise, you will run
+into a `ClassNotFoundException`:
+```bash
+$ ./bin/flink savepoint \
+ --dispose <savepointPath> \
+ --jarfile <jarFile>
+```
+
+Triggering the savepoint disposal through the `savepoint` action does not only remove the data from
+the storage but makes Flink clean up the savepoint-related metadata as well.
+
+### Terminating a Job
+
+#### Stopping a Job Gracefully Creating a Final Savepoint
+
+Another action for stopping a job is `stop`. It is a more graceful way of stopping a running streaming
+job, as the stop request flows from source to sink. When the user requests to stop a job, all sources will
+be requested to send the last checkpoint barrier that will trigger a savepoint, and after the successful
+completion of that savepoint, they will finish by calling their `cancel()` method.
+
+```bash
+$ ./bin/flink stop \
+ --savepointPath /tmp/flink-savepoints \
+ $JOB_ID
+```
+```
+Suspending job "cca7bc1061d61cf15238e92312c2fc20" with a savepoint.
+Savepoint completed. Path: file:/tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab
+```
+We have to use `--savepointPath` to specify the savepoint folder if
+[state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) isn't set.
+
+If the `--drain` flag is specified, then a `MAX_WATERMARK` will be emitted before the last checkpoint
+barrier. This will make all registered event-time timers fire, thus flushing out any state that
+is waiting for a specific watermark, e.g. windows. The job will keep running until all sources have properly
+shut down. This allows the job to finish processing all in-flight data, which may produce some
+records to process after the savepoint that was taken while stopping.
+
+{{< hint danger >}}
+Use the `--drain` flag if you want to terminate the job permanently.
+If you want to resume the job at a later point in time, then do not drain the pipeline because it could lead to incorrect results when the job is resumed.
+{{< /hint >}}
+
+#### Cancelling a Job Ungracefully
+
+Cancelling a job can be achieved through the `cancel` action:
+```bash
+$ ./bin/flink cancel $JOB_ID
+```
+```
+Cancelling job cca7bc1061d61cf15238e92312c2fc20.
+Cancelled job cca7bc1061d61cf15238e92312c2fc20.
+```
+The corresponding job's state will be transitioned from `Running` to `Cancelled`. Any computations
+will be stopped.
+
+{{< hint danger >}}
+The `--withSavepoint` flag allows creating a savepoint as part of the job cancellation.
+This feature is deprecated.
+Use the [stop](#stopping-a-job-gracefully-creating-a-final-savepoint) action instead.
+{{< /hint >}}
+
+### Starting a Job from a Savepoint
+
+Starting a job from a savepoint can be achieved using the `run` (and `run-application`) action.
+```bash
+$ ./bin/flink run \
+ --detached \
+ --fromSavepoint /tmp/flink-savepoints/savepoint-cca7bc-bb1e257f0dab \
+ ./examples/streaming/StateMachineExample.jar
+```
+```
+Usage with built-in data generator: StateMachineExample [--error-rate <probability-of-an-error>] [--sleep <sleep-per-record-in-ms>]
+Usage with Kafka: StateMachineExample --kafka-topic <topic> [--brokers <brokers>]
+Options for both the above setups:
+ [--backend <file|rocks>]
+ [--checkpoint-dir <filepath>]
+ [--async-checkpoints <true|false>]
+ [--incremental-checkpoints <true|false>]
+ [--output <filepath> OR null for stdout]
+
+Using standalone source with error rate 0.000000 and sleep delay 1 millis
+
+Job has been submitted with JobID 97b20a0a8ffd5c1d656328b0cd6436a6
+```
+
+Note that the command is identical to the [initial run command](#submitting-a-job) except for the
+`--fromSavepoint` parameter, which is used to refer to the state of the
+[previously stopped job](#stopping-a-job-gracefully-creating-a-final-savepoint). A new JobID is
+generated that can be used to maintain the job.
+
+By default, we try to match the whole savepoint state to the job being submitted. If you want to
+allow skipping savepoint state that cannot be restored with the new job, you can set the
+`--allowNonRestoredState` flag. You need to set this flag if you removed an operator from your program
+that was part of the program when the savepoint was triggered and you still want to use the savepoint.
+
+```bash
+$ ./bin/flink run \
+ --fromSavepoint <savepointPath> \
+ --allowNonRestoredState ...
+```
+This is useful if your program dropped an operator that was part of the savepoint.
+
+{{< top >}}
+
+## CLI Actions
+
+Here's an overview of actions supported by Flink's CLI tool:
+
+| Action | Purpose |
+| :----- | :------ |
+| `run` | This action executes jobs. It requires at least the jar containing the job. Flink- or job-related arguments can be passed if necessary. |
+| `info` | This action can be used to print an optimized execution graph of the passed job. Again, the jar containing the job needs to be passed. |
+| `list` | This action lists all running or scheduled jobs. |
+| `savepoint` | This action can be used to create or dispose savepoints for a given job. It might be necessary to specify a savepoint directory besides the JobID, if the [state.savepoints.dir]({{< ref "docs/deployment/config" >}}#state-savepoints-dir) parameter was not specified in `conf/flink-conf.yaml`. |
+| `cancel` | This action can be used to cancel running jobs based on their JobID. |
+| `stop` | This action combines the `cancel` and `savepoint` actions to stop a running job but also create a savepoint to start from again. |
+
+A more fine-grained description of all actions and their parameters can be accessed through `bin/flink --help`
+or the usage information of each individual action via `bin/flink <action> --help`.
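+
+For example:
+
+```bash
+# General usage information covering all actions
+$ ./bin/flink --help
+
+# Usage information of a single action, e.g. savepoint
+$ ./bin/flink savepoint --help
+```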
+
+{{< top >}}
+
+## Advanced CLI
+
+### REST API
+
+The Flink cluster can be also managed using the [REST API]({{< ref "docs/ops/rest_api" >}}). The commands
+described in previous sections are a subset of what is offered by Flink's REST endpoints. Therefore,
+tools like `curl` can be used to get even more out of Flink.
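+
+A small sketch of what this can look like against a local session cluster (host, port, job ID and
+target directory are assumptions; see the REST API page for the full set of endpoints):
+
+```bash
+# List all jobs known to the cluster
+$ curl http://localhost:8081/jobs/overview
+
+# Trigger a savepoint for a specific job
+$ curl -X POST http://localhost:8081/jobs/cca7bc1061d61cf15238e92312c2fc20/savepoints \
+    -H "Content-Type: application/json" \
+    -d '{"target-directory": "/tmp/flink-savepoints", "cancel-job": false}'
+```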
+
+### Selecting Deployment Targets
+
+Flink is compatible with multiple cluster management frameworks like
+[Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}) or
+[YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) which are described in more detail in the
+Resource Provider section. Jobs can be submitted in different [Deployment Modes]({{< ref "docs/deployment/overview" >}}#deployment-modes).
+The parameterization of a job submission differs based on the underlying framework and Deployment Mode.
+
+`bin/flink` offers a parameter `--target` to handle the different options. In addition to that, jobs
+have to be submitted using either `run` (for [Session]({{< ref "docs/deployment/overview" >}}#session-mode)
+and [Per-Job Mode]({{< ref "docs/deployment/overview" >}}#per-job-mode)) or `run-application` (for
+[Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode)). See the following summary of
+parameter combinations:
+* YARN
+ * `./bin/flink run --target yarn-session`: Submission to an already running Flink on YARN cluster
+ * `./bin/flink run --target yarn-per-job`: Submission spinning up a Flink on YARN cluster in Per-Job Mode
+ * `./bin/flink run-application --target yarn-application`: Submission spinning up Flink on YARN cluster in Application Mode
+* Kubernetes
+ * `./bin/flink run --target kubernetes-session`: Submission to an already running Flink on Kubernetes cluster
+ * `./bin/flink run-application --target kubernetes-application`: Submission spinning up a Flink on Kubernetes cluster in Application Mode
+* Mesos
+ * `./bin/flink run --target remote`: Submission to an already running Flink on Mesos cluster
+* Standalone:
+ * `./bin/flink run --target local`: Local submission using a MiniCluster in Session Mode
+ * `./bin/flink run --target remote`: Submission to an already running Flink cluster
+
+The `--target` will overwrite the [execution.target]({{< ref "docs/deployment/config" >}}#execution-target)
+specified in the `conf/flink-conf.yaml`.
+
+For more details on the commands and the available options, please refer to the Resource Provider-specific
+pages of the documentation.
+
+### Submitting PyFlink Jobs
+
+Currently, users are able to submit a PyFlink job via the CLI. It does not require specifying the
+JAR file path or the entry main class, which is different from the Java job submission.
+
+{{< hint info >}}
+When submitting a Python job via `flink run`, Flink will run the command `python`. Please run the following command to confirm that the `python` executable in the current environment points to a supported Python version (3.6+).
+{{< /hint >}}
+```bash
+$ python --version
+# the version printed here must be 3.6+
+```
+
+The following commands show different PyFlink job submission use-cases:
+
+- Run a PyFlink job:
+```bash
+$ ./bin/flink run --python examples/python/table/batch/word_count.py
+```
+
+- Run a PyFlink job with additional source and resource files. Files specified in `--pyFiles` will be
+added to the `PYTHONPATH` and, therefore, available in the Python code.
+```bash
+$ ./bin/flink run \
+ --python examples/python/table/batch/word_count.py \
+ --pyFiles file:///user.txt,hdfs:///$namenode_address/username.txt
+```
+
+- Run a PyFlink job which will reference Java UDF or external connectors. JAR file specified in `--jarfile` will be uploaded
+to the cluster.
+```bash
+$ ./bin/flink run \
+ --python examples/python/table/batch/word_count.py \
+ --jarfile <jarFile>
+```
+
+- Run a PyFlink job with pyFiles and the main entry module specified in `--pyModule`:
+```bash
+$ ./bin/flink run \
+ --pyModule batch.word_count \
+ --pyFiles examples/python/table/batch
+```
+
+- Submit a PyFlink job on a specific JobManager running on host `<jobmanagerHost>` (adapt the command accordingly):
+```bash
+$ ./bin/flink run \
+ --jobmanager <jobmanagerHost>:8081 \
+ --python examples/python/table/batch/word_count.py
+```
+
+- Run a PyFlink job using a [YARN cluster in Per-Job Mode]({{< ref "docs/deployment/resource-providers/yarn" >}}#per-job-cluster-mode):
+```bash
+$ ./bin/flink run \
+ --target yarn-per-job \
+ --python examples/python/table/batch/word_count.py
+```
+
+- Run a PyFlink application on a native Kubernetes cluster having the cluster ID `<ClusterId>`. This requires a docker image with PyFlink installed; please refer to [Enabling PyFlink in docker]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#enabling-python):
+```bash
+$ ./bin/flink run-application \
+ --target kubernetes-application \
+ --parallelism 8 \
+ -Dkubernetes.cluster-id=<ClusterId> \
+ -Dtaskmanager.memory.process.size=4096m \
+ -Dkubernetes.taskmanager.cpu=2 \
+ -Dtaskmanager.numberOfTaskSlots=4 \
+ -Dkubernetes.container.image=<PyFlinkImageName> \
+ --pyModule word_count \
+ --pyFiles /opt/flink/examples/python/table/batch/word_count.py
+```
+
+To learn more available options, please refer to [Kubernetes]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}})
+or [YARN]({{< ref "docs/deployment/resource-providers/yarn" >}}) which are described in more detail in the
+Resource Provider section.
+
+Besides `--pyFiles`, `--pyModule` and `--python` mentioned above, there are also some other Python
+related options. Here's an overview of all the Python related options for the actions
+`run` and `run-application` supported by Flink's CLI tool:
+
+| Option | Description |
+| :----- | :---------- |
+| `-py,--python` | Python script with the program entry. The dependent resources can be configured with the `--pyFiles` option. |
+| `-pym,--pyModule` | Python module with the program entry point. This option must be used in conjunction with `--pyFiles`. |
+| `-pyfs,--pyFiles` | Attach custom files for the job. The standard resource file suffixes such as .py/.egg/.zip/.whl or directory are all supported. These files will be added to the PYTHONPATH of both the local client and the remote python UDF worker. Files suffixed with .zip will be extracted and added to PYTHONPATH. Comma (',') can be used as the separator to specify multiple files (e.g., `--pyFiles file:///tmp/myresource.zip,hdfs:///$namenode_address/myresource2.zip`). |
+| `-pyarch,--pyArchives` | Add python archive files for the job. The archive files will be extracted to the working directory of the python UDF worker. Currently only zip-format is supported. For each archive file, a target directory can be specified. If the target directory name is specified, the archive file will be extracted to a directory with the specified name. Otherwise, the archive file will be extracted to a directory with the same name as the archive file. The files uploaded via this option are accessible via relative path. '#' can be used as the separator of the archive file path and the target directory name. Comma (',') can be used as the separator to specify multiple archive files. This option can be used to upload the virtual environment and the data files used in Python UDFs (e.g., `--pyArchives file:///tmp/py37.zip,file:///tmp/data.zip#data --pyExecutable py37.zip/py37/bin/python`). The data files can be accessed in Python UDFs, e.g.: `f = open('data/data.txt', 'r')`. |
+| `-pyexec,--pyExecutable` | Specify the path of the python interpreter used to execute the python UDF worker (e.g.: `--pyExecutable /usr/local/bin/python3`). The python UDF worker depends on Python 3.6+, Apache Beam (version == 2.27.0), Pip (version >= 7.1.0) and SetupTools (version >= 37.0.0). Please ensure that the specified environment meets the above requirements. |
+| `-pyreq,--pyRequirements` | Specify the requirements.txt file which defines the third-party dependencies. These dependencies will be installed and added to the PYTHONPATH of the python UDF worker. A directory which contains the installation packages of these dependencies can optionally be specified. Use '#' as the separator if the optional parameter exists (e.g., `--pyRequirements file:///tmp/requirements.txt#file:///tmp/cached_dir`). |
+
+In addition to the command line options used when submitting the job, Flink also supports specifying the
+dependencies via configuration or the Python API inside the code. Please refer to the
+[dependency management]({{< ref "docs/dev/python/dependency_management" >}}) for more details.
+
+{{< top >}}
diff --git a/docs/content.zh/docs/deployment/config.md b/docs/content.zh/docs/deployment/config.md
new file mode 100644
index 0000000000000..cf1c24bede36c
--- /dev/null
+++ b/docs/content.zh/docs/deployment/config.md
@@ -0,0 +1,474 @@
+---
+title: "配置参数"
+weight: 3
+type: docs
+bookToc: false
+aliases:
+ - /zh/deployment/config.html
+ - /zh/ops/config.html
+---
+
+
+# 配置参数
+
+All configuration is done in `conf/flink-conf.yaml`, which is expected to be a flat collection of [YAML key value pairs](http://www.yaml.org/spec/1.2/spec.html) with format `key: value`.
+
+The configuration is parsed and evaluated when the Flink processes are started. Changes to the configuration file require restarting the relevant processes.
+
+The out of the box configuration will use your default Java installation. You can manually set the environment variable `JAVA_HOME` or the configuration key `env.java.home` in `conf/flink-conf.yaml` if you want to manually override the Java runtime to use.
+
+You can specify a different configuration directory location by defining the `FLINK_CONF_DIR` environment variable. For resource providers which provide non-session deployments, you can specify per-job configurations this way. Make a copy of the `conf` directory from the Flink distribution and modify the settings on a per-job basis. Note that this is not supported in Docker or standalone Kubernetes deployments. On Docker-based deployments, you can use the `FLINK_PROPERTIES` environment variable for passing configuration values.
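+
+A short sketch of both mechanisms (the paths and values are examples only):
+
+```bash
+# Point the CLI and the cluster scripts to a job-specific copy of the conf directory
+export FLINK_CONF_DIR=/path/to/job-specific-conf
+./bin/flink run ./examples/streaming/TopSpeedWindowing.jar
+
+# On Docker-based deployments, pass configuration values via FLINK_PROPERTIES instead
+docker run --env FLINK_PROPERTIES="jobmanager.rpc.address: jobmanager" flink:latest jobmanager
+```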
+
+On session clusters, the provided configuration will only be used for configuring [execution](#execution) parameters, e.g. configuration parameters affecting the job, not the underlying cluster.
+
+# Basic Setup
+
+The default configuration supports starting a single-node Flink session cluster without any changes.
+The options in this section are the ones most commonly needed for a basic distributed Flink setup.
+
+**Hostnames / Ports**
+
+These options are only necessary for *standalone* application- or session deployments ([simple standalone]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}) or [Kubernetes]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}})).
+
+If you use Flink with [Yarn]({{< ref "docs/deployment/resource-providers/yarn" >}}), [Mesos]({{< ref "docs/deployment/resource-providers/mesos" >}}), or the [*active* Kubernetes integration]({{< ref "docs/deployment/resource-providers/native_kubernetes" >}}), the hostnames and ports are automatically discovered.
+
+ - `rest.address`, `rest.port`: These are used by the client to connect to Flink. Set this to the hostname where the JobManager runs, or to the hostname of the (Kubernetes) service in front of the JobManager's REST interface.
+
+ - The `jobmanager.rpc.address` (defaults to *"localhost"*) and `jobmanager.rpc.port` (defaults to *6123*) config entries are used by the TaskManager to connect to the JobManager/ResourceManager. Set this to the hostname where the JobManager runs, or to the hostname of the (Kubernetes internal) service for the JobManager. This option is ignored on [setups with high-availability]({{< ref "docs/deployment/ha/overview" >}}) where the leader election mechanism is used to discover this automatically.
+
+**Memory Sizes**
+
+The default memory sizes support simple streaming/batch applications, but are too low to yield good performance for more complex applications.
+
+ - `jobmanager.memory.process.size`: Total size of the *JobManager* (JobMaster / ResourceManager / Dispatcher) process.
+ - `taskmanager.memory.process.size`: Total size of the TaskManager process.
+
+The total sizes include everything. Flink will subtract some memory for the JVM's own memory requirements (metaspace and others), and divide and configure the rest automatically between its components (JVM Heap, Off-Heap, for Task Managers also network, managed memory etc.).
+
+These values are configured as memory sizes, for example *1536m* or *2g*.
+
+**Parallelism**
+
+ - `taskmanager.numberOfTaskSlots`: The number of slots that a TaskManager offers *(default: 1)*. Each slot can take one task or pipeline.
+ Having multiple slots in a TaskManager can help amortize certain constant overheads (of the JVM, application libraries, or network connections) across parallel tasks or pipelines. See the [Task Slots and Resources]({{< ref "docs/concepts/flink-architecture" >}}#task-slots-and-resources) concepts section for details.
+
+  Running more, smaller TaskManagers with one slot each is a good starting point and leads to the best isolation between tasks. Dedicating the same resources to fewer, larger TaskManagers with more slots can help to increase resource utilization, at the cost of weaker isolation between the tasks (more tasks share the same JVM).
+
+ - `parallelism.default`: The default parallelism used when no parallelism is specified anywhere *(default: 1)*.
+
+**Checkpointing**
+
+You can configure checkpointing directly in code within your Flink job or application. Putting these values here in the configuration defines them as defaults in case the application does not configure anything.
+
+ - `state.backend`: The state backend to use. This defines the data structure mechanism for taking snapshots. Common values are `filesystem` or `rocksdb`.
+ - `state.checkpoints.dir`: The directory to write checkpoints to. This takes a path URI like *s3://mybucket/flink-app/checkpoints* or *hdfs://namenode:port/flink/checkpoints*.
+ - `state.savepoints.dir`: The default directory for savepoints. Takes a path URI, similar to `state.checkpoints.dir`.
+
+**Web UI**
+
+ - `web.submit.enable`: Enables uploading and starting jobs through the Flink UI *(true by default)*. Please note that even when this is disabled, session clusters still accept jobs through REST requests (HTTP calls). This flag only guards the feature to upload jobs in the UI.
+ - `web.cancel.enable`: Enables canceling jobs through the Flink UI *(true by default)*. Please note that even when this is disabled, session clusters still cancel jobs through REST requests (HTTP calls). This flag only guards the feature to cancel jobs in the UI.
+ - `web.upload.dir`: The directory where to store uploaded jobs. Only used when `web.submit.enable` is true.
+
+**Other**
+
+ - `io.tmp.dirs`: The directories where Flink puts local data, defaults to the system temp directory (`java.io.tmpdir` property). If a list of directories is configured, Flink will rotate files across the directories.
+
+ The data put in these directories include by default the files created by RocksDB, spilled intermediate results (batch algorithms), and cached jar files.
+
+ This data is NOT relied upon for persistence/recovery, but if this data gets deleted, it typically causes a heavyweight recovery operation. It is hence recommended to set this to a directory that is not automatically periodically purged.
+
+ Yarn, Mesos, and Kubernetes setups automatically configure this value to the local working directories by default.
+
+----
+----
+
+# Common Setup Options
+
+*Common options to configure your Flink application or cluster.*
+
+### Hosts and Ports
+
+Options to configure hostnames and ports for the different Flink components.
+
+The JobManager hostname and port are only relevant for standalone setups without high-availability.
+In that setup, the config values are used by the TaskManagers to find (and connect to) the JobManager.
+In all highly-available setups, the TaskManagers discover the JobManager via the High-Availability-Service (for example ZooKeeper).
+
+Setups using resource orchestration frameworks (K8s, Yarn, Mesos) typically use the framework's service discovery facilities.
+
+You do not need to configure any TaskManager hosts and ports, unless the setup requires the use of specific port ranges or specific network interfaces to bind to.
+
+{{< generated/common_host_port_section >}}
+
+### Fault Tolerance
+
+These configuration options control Flink's restart behaviour in case of failures during the execution.
+By configuring these options in your `flink-conf.yaml`, you define the cluster's default restart strategy.
+
+The default restart strategy will only take effect if no job specific restart strategy has been configured via the `ExecutionConfig`.
+
+{{< generated/restart_strategy_configuration >}}
+
+**Fixed Delay Restart Strategy**
+
+{{< generated/fixed_delay_restart_strategy_configuration >}}
+
+**Failure Rate Restart Strategy**
+
+{{< generated/failure_rate_restart_strategy_configuration >}}
+
+### Checkpoints and State Backends
+
+These options control the basic setup of state backends and checkpointing behavior.
+
+The options are only relevant for jobs/applications executing in a continuous streaming fashion.
+Jobs/applications executing in a batch fashion do not use state backends and checkpoints, but different internal data structures that are optimized for batch processing.
+
+{{< generated/common_state_backends_section >}}
+
+### High Availability
+
+High-availability here refers to the ability of the JobManager process to recover from failures.
+
+The JobManager ensures consistency during recovery across TaskManagers. For the JobManager itself to recover consistently, an external service must store a minimal amount of recovery metadata (like "ID of last committed checkpoint"), as well as help to elect and lock which JobManager is the leader (to avoid split-brain situations).
+
+{{< generated/common_high_availability_section >}}
+
+**Options for high-availability setups with ZooKeeper**
+
+{{< generated/common_high_availability_zk_section >}}
+
+### Memory Configuration
+
+These configuration values control the way that TaskManagers and JobManagers use memory.
+
+Flink tries to shield users as much as possible from the complexity of configuring the JVM for data-intensive processing.
+In most cases, users should only need to set the values `taskmanager.memory.process.size` or `taskmanager.memory.flink.size` (depending on the setup), and possibly adjust the ratio of JVM heap and Managed Memory via `taskmanager.memory.managed.fraction`. The other options below can be used for performance tuning and fixing memory related errors.
+
+For a detailed explanation of how these options interact,
+see the documentation on [TaskManager]({{< ref "docs/deployment/memory/mem_setup_tm" >}}) and
+[JobManager]({{< ref "docs/deployment/memory/mem_setup_jobmanager" >}} ) memory configurations.
+
+{{< generated/common_memory_section >}}
+
+### Miscellaneous Options
+
+{{< generated/common_miscellaneous_section >}}
+
+----
+----
+
+# Security
+
+Options for configuring Flink's security and secure interaction with external systems.
+
+### SSL
+
+Flink's network connections can be secured via SSL. Please refer to the [SSL Setup Docs]({{< ref "docs/deployment/security/security-ssl" >}}) for detailed setup guide and background.
+
+{{< generated/security_ssl_section >}}
+
+
+### Auth with External Systems
+
+**ZooKeeper Authentication / Authorization**
+
+These options are necessary when connecting to a secured ZooKeeper quorum.
+
+{{< generated/security_auth_zk_section >}}
+
+**Kerberos-based Authentication / Authorization**
+
+Please refer to the [Flink and Kerberos Docs]({{< ref "docs/deployment/security/security-kerberos" >}}) for a setup guide and a list of external system to which Flink can authenticate itself via Kerberos.
+
+{{< generated/security_auth_kerberos_section >}}
+
+----
+----
+
+# Resource Orchestration Frameworks
+
+This section contains options related to integrating Flink with resource orchestration frameworks, like Kubernetes, Yarn, Mesos, etc.
+
+Note that it is not always necessary to integrate Flink with the resource orchestration framework.
+For example, you can easily deploy Flink applications on Kubernetes without Flink knowing that it runs on Kubernetes (and without specifying any of the Kubernetes config options here). See [this setup guide]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}) for an example.
+
+The options in this section are necessary for setups where Flink itself actively requests and releases resources from the orchestrators.
+
+### YARN
+
+{{< generated/yarn_config_configuration >}}
+
+### Kubernetes
+
+{{< generated/kubernetes_config_configuration >}}
+
+### Mesos
+
+{{< hint warning >}}
+Apache Mesos support was deprecated in Flink 1.13 and is subject to removal in the future (see
+[FLINK-22352](https://issues.apache.org/jira/browse/FLINK-22352) for further details).
+{{< /hint >}}
+
+{{< generated/mesos_configuration >}}
+
+**Mesos TaskManager**
+
+{{< generated/mesos_task_manager_configuration >}}
+
+----
+----
+
+# State Backends
+
+Please refer to the [State Backend Documentation]({{< ref "docs/ops/state/state_backends" >}}) for background on State Backends.
+
+### RocksDB State Backend
+
+These are the options commonly needed to configure the RocksDB state backend. See the [Advanced RocksDB Backend Section](#advanced-rocksdb-state-backends-options) for options necessary for advanced low level configurations and trouble-shooting.
+
+{{< generated/state_backend_rocksdb_section >}}
+
+----
+----
+
+# Metrics
+
+Please refer to the [metrics system documentation]({{< ref "docs/ops/metrics" >}}) for background on Flink's metrics infrastructure.
+
+{{< generated/metric_configuration >}}
+
+### RocksDB Native Metrics
+
+Flink can report metrics from RocksDB's native code, for applications using the RocksDB state backend.
+The metrics here are scoped to the operators and then further broken down by column family; values are reported as unsigned longs.
+
+{{< hint warning >}}
+Enabling RocksDB's native metrics may cause degraded performance and should be set carefully.
+{{< /hint >}}
+
+{{< generated/rocksdb_native_metric_configuration >}}
+
+----
+----
+
+# History Server
+
+The history server keeps the information of completed jobs (graphs, runtimes, statistics). To enable it, you have to enable "job archiving" in the JobManager (`jobmanager.archive.fs.dir`).
+
+See the [History Server Docs]({{< ref "docs/deployment/advanced/historyserver" >}}) for details.
+
+{{< generated/history_server_configuration >}}
+
+----
+----
+
+# Experimental
+
+*Options for experimental features in Flink.*
+
+### Queryable State
+
+*Queryable State* is an experimental feature that lets you access Flink's internal state like a key/value store.
+See the [Queryable State Docs]({{< ref "docs/dev/datastream/fault-tolerance/queryable_state" >}}) for details.
+
+{{< generated/queryable_state_configuration >}}
+
+----
+----
+
+# Client
+
+{{< generated/client_configuration >}}
+
+----
+----
+
+# Execution
+
+{{< generated/deployment_configuration >}}
+{{< generated/savepoint_config_configuration >}}
+{{< generated/execution_configuration >}}
+
+### Pipeline
+
+{{< generated/pipeline_configuration >}}
+{{< generated/stream_pipeline_configuration >}}
+
+### Checkpointing
+
+{{< generated/execution_checkpointing_configuration >}}
+
+----
+----
+
+# Debugging & Expert Tuning
+
+
+ The options below here are meant for expert users and for fixing/debugging problems. Most setups should not need to configure these options.
+
+
+### Class Loading
+
+Flink dynamically loads the code for jobs submitted to a session cluster. In addition, Flink tries to hide many dependencies in the classpath from the application. This helps to reduce dependency conflicts between the application code and the dependencies in the classpath.
+
+Please refer to the [Debugging Classloading Docs]({{< ref "docs/ops/debugging/debugging_classloading" >}}) for details.
+
+{{< generated/expert_class_loading_section >}}
+
+### Advanced Options for the debugging
+
+{{< generated/expert_debugging_and_tuning_section >}}
+
+### Advanced State Backends Options
+
+{{< generated/expert_state_backends_section >}}
+
+### State Backends Latency Tracking Options
+
+{{< generated/state_backend_latency_tracking_section >}}
+
+### Advanced RocksDB State Backends Options
+
+Advanced options to tune RocksDB and RocksDB checkpoints.
+
+{{< generated/expert_rocksdb_section >}}
+
+**RocksDB Configurable Options**
+
+These options give fine-grained control over the behavior and resources of ColumnFamilies.
+With the introduction of `state.backend.rocksdb.memory.managed` and `state.backend.rocksdb.memory.fixed-per-slot` (Apache Flink 1.10), it should only be necessary to use the options here for advanced performance tuning. The options here can also be specified in the application program via `RocksDBStateBackend.setRocksDBOptions(RocksDBOptionsFactory)`.
+
+{{< generated/rocksdb_configurable_configuration >}}
+
+### Advanced Fault Tolerance Options
+
+*These parameters can help with problems related to failover and to components erroneously considering each other as failed.*
+
+{{< generated/expert_fault_tolerance_section >}}
+
+### Advanced Cluster Options
+
+{{< generated/expert_cluster_section >}}
+
+### Advanced Scheduling Options
+
+*These parameters can help with fine-tuning scheduling for specific situations.*
+
+{{< generated/expert_scheduling_section >}}
+
+### Advanced High-availability Options
+
+{{< generated/expert_high_availability_section >}}
+
+### Advanced High-availability ZooKeeper Options
+
+{{< generated/expert_high_availability_zk_section >}}
+
+### Advanced High-availability Kubernetes Options
+
+{{< generated/expert_high_availability_k8s_section >}}
+
+### Advanced SSL Security Options
+
+{{< generated/expert_security_ssl_section >}}
+
+### Advanced Options for the REST endpoint and Client
+
+{{< generated/expert_rest_section >}}
+
+### Advanced Options for Flink Web UI
+
+{{< generated/web_configuration >}}
+
+### Full JobManager Options
+
+**JobManager**
+
+{{< generated/all_jobmanager_section >}}
+
+**Blob Server**
+
+The Blob Server is a component in the JobManager. It is used for distribution of objects that are too large to be attached to a RPC message and that benefit from caching (like Jar files or large serialized code objects).
+
+{{< generated/blob_server_configuration >}}
+
+**ResourceManager**
+
+These configuration keys control basic Resource Manager behavior, independent of the resource orchestration framework being used (YARN, Mesos, etc.).
+
+{{< generated/resource_manager_configuration >}}
+
+### Full TaskManagerOptions
+
+{{< generated/all_taskmanager_section >}}
+
+**Data Transport Network Stack**
+
+These options are for the network stack that handles the streaming and batch data exchanges between TaskManagers.
+
+{{< generated/all_taskmanager_network_section >}}
+
+### RPC / Akka
+
+Flink uses Akka for RPC between components (JobManager/TaskManager/ResourceManager).
+Flink does not use Akka for data transport.
+
+{{< generated/akka_configuration >}}
+
+----
+----
+
+# JVM and Logging Options
+
+{{< generated/environment_configuration >}}
+
+# Forwarding Environment Variables
+
+You can configure environment variables to be set on the JobManager and TaskManager processes started on Yarn/Mesos, as shown in the sketch after the list below.
+
+ - `containerized.master.env.`: Prefix for passing custom environment variables to Flink's JobManager process.
+   For example, for passing LD_LIBRARY_PATH as an env variable to the JobManager, set `containerized.master.env.LD_LIBRARY_PATH: "/usr/lib/native"`
+   in the `flink-conf.yaml`.
+
+ - `containerized.taskmanager.env.`: Similar to the above, this configuration prefix allows setting custom environment variables for the workers (TaskManagers).
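+
+A sketch of passing the same variables as dynamic properties on submission instead of editing `flink-conf.yaml` (the library path is just an example):
+
+```bash
+$ ./bin/flink run --target yarn-per-job \
+    -Dcontainerized.master.env.LD_LIBRARY_PATH="/usr/lib/native" \
+    -Dcontainerized.taskmanager.env.LD_LIBRARY_PATH="/usr/lib/native" \
+    ./examples/streaming/TopSpeedWindowing.jar
+```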
+
+----
+----
+
+# Deprecated Options
+
+These options relate to parts of Flink that are not actively developed any more.
+These options may be removed in a future release.
+
+**DataSet API Optimizer**
+
+{{< generated/optimizer_configuration >}}
+
+**DataSet API Runtime Algorithms**
+
+{{< generated/algorithm_configuration >}}
+
+**DataSet File Sinks**
+
+{{< generated/deprecated_file_sinks_section >}}
+
+{{< top >}}
diff --git a/docs/content.zh/docs/deployment/elastic_scaling.md b/docs/content.zh/docs/deployment/elastic_scaling.md
new file mode 100644
index 0000000000000..e4046ce38d752
--- /dev/null
+++ b/docs/content.zh/docs/deployment/elastic_scaling.md
@@ -0,0 +1,156 @@
+---
+title: Elastic Scaling
+weight: 5
+type: docs
+
+---
+
+
+# Elastic Scaling
+
+Apache Flink allows you to rescale your jobs. You can do this manually by stopping the job and restarting from the savepoint created during shutdown with a different parallelism.
+
+This page describes options where Flink automatically adjusts the parallelism instead.
+
+## Reactive Mode
+
+{{< hint info >}}
+Reactive mode is an MVP ("minimum viable product") feature. The Flink community is actively looking for feedback by users through our mailing lists. Please check the limitations listed on this page.
+{{< /hint >}}
+
+Reactive Mode configures a job so that it always uses all resources available in the cluster. Adding a TaskManager will scale up your job, removing resources will scale it down. Flink will manage the parallelism of the job, always setting it to the highest possible value.
+
+Reactive Mode restarts a job on a rescaling event, restoring it from the latest completed checkpoint. This means that there is no overhead of creating a savepoint (which is needed for manually rescaling a job). Also, the amount of data that is reprocessed after rescaling depends on the checkpointing interval, and the restore time depends on the state size.
+
+The Reactive Mode allows Flink users to implement a powerful autoscaling mechanism, by having an external service monitor certain metrics, such as consumer lag, aggregate CPU utilization, throughput or latency. As soon as these metrics are above or below a certain threshold, additional TaskManagers can be added or removed from the Flink cluster. This could be implemented through changing the [replica factor](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#replicas) of a Kubernetes deployment, or an [autoscaling group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/AutoScalingGroup.html) on AWS. This external service only needs to handle the resource allocation and deallocation. Flink will take care of keeping the job running with the resources available.
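+
+For example, on Kubernetes such an external service (or a human operator) only needs to change the replica count of the TaskManager deployment; the deployment name below is an assumption based on the standalone Kubernetes setup:
+
+```bash
+# Scale the job up to four TaskManagers; Reactive Mode will pick up the new slots automatically
+kubectl scale deployment/flink-taskmanager --replicas=4
+```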
+
+### Getting started
+
+If you just want to try out Reactive Mode, follow these instructions. They assume that you are deploying Flink on a single machine.
+
+```bash
+
+# these instructions assume you are in the root directory of a Flink distribution.
+
+# Put Job into lib/ directory
+cp ./examples/streaming/TopSpeedWindowing.jar lib/
+# Submit Job in Reactive Mode
+./bin/standalone-job.sh start -Dscheduler-mode=reactive -Dexecution.checkpointing.interval="10s" -j org.apache.flink.streaming.examples.windowing.TopSpeedWindowing
+# Start first TaskManager
+./bin/taskmanager.sh start
+```
+
+Let's quickly examine the used submission command:
+- `./bin/standalone-job.sh start` deploys Flink in [Application Mode]({{< ref "docs/deployment/overview" >}}#application-mode)
+- `-Dscheduler-mode=reactive` enables Reactive Mode.
+- `-Dexecution.checkpointing.interval="10s"` configures checkpointing and the restart strategy.
+- the last argument passes the job's main class name.
+
+You have now started a Flink job in Reactive Mode. The [web interface](http://localhost:8081) shows that the job is running on one TaskManager. If you want to scale up the job, simply add another TaskManager to the cluster:
+```bash
+# Start additional TaskManager
+./bin/taskmanager.sh start
+```
+
+To scale down, remove a TaskManager instance.
+```bash
+# Remove a TaskManager
+./bin/taskmanager.sh stop
+```
+
+### Usage
+
+#### Configuration
+
+To enable Reactive Mode, you need to configure `scheduler-mode` to `reactive`.
+
+The **parallelism of individual operators in a job will be determined by the scheduler**. It is not configurable
+and will be ignored if explicitly set, either on individual operators or the entire job.
+
+The only way of influencing the parallelism is by setting a max parallelism for an operator
+(which will be respected by the scheduler). The maxParallelism is bounded by 2^15 (32768).
+If you do not set a max parallelism for individual operators or the entire job, the
+[default parallelism rules]({{< ref "docs/dev/execution/parallel" >}}#setting-the-maximum-parallelism) will be applied,
+potentially applying lower bounds than the max possible value. As with the default scheduling mode, please take
+the [best practices for parallelism]({{< ref "docs/ops/production_ready" >}}#set-an-explicit-max-parallelism) into consideration.
+
+Note that such a high max parallelism might affect the performance of the job, since Flink needs to maintain [more internal structures](https://flink.apache.org/features/2017/07/04/flink-rescalable-state.html) to support it.
+
+When enabling Reactive Mode, the [`jobmanager.adaptive-scheduler.resource-wait-timeout`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-resource-wait-timeout) configuration key will default to `-1`. This means that the JobManager will run forever waiting for sufficient resources.
+If you want the JobManager to stop after a certain time without enough TaskManagers to run the job, configure `jobmanager.adaptive-scheduler.resource-wait-timeout`.
+
+With Reactive Mode enabled, the [`jobmanager.adaptive-scheduler.resource-stabilization-timeout`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-resource-stabilization-timeout) configuration key will default to `0`: Flink will start running the job as soon as there are sufficient resources available.
+In scenarios where TaskManagers are not connecting at the same time, but slowly one after another, this behavior leads to a job restart whenever a TaskManager connects. Increase this configuration value if you want to wait for the resources to stabilize before scheduling the job.
+Additionally, one can configure [`jobmanager.adaptive-scheduler.min-parallelism-increase`]({{< ref "docs/deployment/config">}}#jobmanager-adaptive-scheduler-min-parallelism-increase): This configuration option specifies the minimum amount of additional, aggregate parallelism increase before triggering a scale-up. For example, if you have a job with a source (parallelism=2) and a sink (parallelism=2), the aggregate parallelism is 4. By default, the configuration key is set to 1, so any increase in the aggregate parallelism will trigger a restart.
+
+#### Recommendations
+
+- **Configure periodic checkpointing for stateful jobs**: Reactive mode restores from the latest completed checkpoint on a rescale event. If no periodic checkpointing is enabled, your program will lose its state. Checkpointing also configures a **restart strategy**. Reactive Mode will respect the configured restarting strategy: If no restarting strategy is configured, reactive mode will fail your job, instead of scaling it.
+
+- Downscaling in Reactive Mode might cause longer stalls in your processing because Flink waits for the heartbeat between JobManager and the stopped TaskManager(s) to time out. You will see that your Flink job is stuck for roughly 50 seconds before redeploying your job with a lower parallelism.
+
+ The default timeout is configured to 50 seconds. Adjust the [`heartbeat.timeout`]({{< ref "docs/deployment/config">}}#heartbeat-timeout) configuration to a lower value if your infrastructure permits this, as sketched below. Setting a low heartbeat timeout can lead to failures if a TaskManager fails to respond to a heartbeat, for example due to network congestion or a long garbage collection pause. Note that the [`heartbeat.interval`]({{< ref "docs/deployment/config">}}#heartbeat-interval) always needs to be lower than the timeout.
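+
+A minimal sketch, reusing the standalone application cluster command from the [getting started](#getting-started) section (the timeout values are assumptions and need to fit your environment):
+
+```bash
+# Lower heartbeat.timeout for faster downscaling; heartbeat.interval must stay below it
+./bin/standalone-job.sh start \
+  -Dscheduler-mode=reactive \
+  -Dexecution.checkpointing.interval="10s" \
+  -Dheartbeat.timeout=10000 \
+  -Dheartbeat.interval=5000 \
+  -j org.apache.flink.streaming.examples.windowing.TopSpeedWindowing
+```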
+
+
+### Limitations
+
+Since Reactive Mode is a new, experimental feature, not all features supported by the default scheduler are also available with Reactive Mode (and its adaptive scheduler). The Flink community is working on addressing these limitations.
+
+- **Deployment is only supported as a standalone application deployment**. Active resource providers (such as native Kubernetes, YARN or Mesos) are explicitly not supported. Standalone session clusters are not supported either. The application deployment is limited to single job applications.
+
+ The only supported deployment options are [Standalone in Application Mode]({{< ref "docs/deployment/resource-providers/standalone/overview" >}}#application-mode) ([described](#getting-started) on this page), [Docker in Application Mode]({{< ref "docs/deployment/resource-providers/standalone/docker" >}}#application-mode-on-docker) and [Standalone Kubernetes Application Cluster]({{< ref "docs/deployment/resource-providers/standalone/kubernetes" >}}#deploy-application-cluster).
+
+The [limitations of Adaptive Scheduler](#limitations-1) also apply to Reactive Mode.
+
+
+## Adaptive Scheduler
+
+{{< hint warning >}}
+Using Adaptive Scheduler directly (not through Reactive Mode) is only advised for advanced users because slot allocation on a session cluster with multiple jobs is not defined.
+{{< /hint >}}
+
+The Adaptive Scheduler can adjust the parallelism of a job based on available slots. It will automatically reduce the parallelism if not enough slots are available to run the job with the originally configured parallelism; be it due to not enough resources being available at the time of submission, or TaskManager outages during the job execution. If new slots become available the job will be scaled up again, up to the configured parallelism.
+In Reactive Mode (see above) the configured parallelism is ignored and treated as if it was set to infinity, letting the job always use as many resources as possible.
+You can also use Adaptive Scheduler without Reactive Mode, but there are some practical limitations:
+- If you are using Adaptive Scheduler on a session cluster, there are no guarantees regarding the distribution of slots between multiple running jobs in the same session.
+
+One benefit of the Adaptive Scheduler over the default scheduler is that it can handle TaskManager losses gracefully, since it would just scale down in these cases.
+
+### Usage
+
+The following configuration parameters need to be set:
+
+- `jobmanager.scheduler: adaptive`: Change from the default scheduler to adaptive scheduler
+- `cluster.declarative-resource-management.enabled` Declarative resource management must be enabled (enabled by default).
+
+The behavior of Adaptive Scheduler is configured by [all configuration options containing `adaptive-scheduler`]({{< ref "docs/deployment/config">}}#advanced-scheduling-options) in their name.
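+
+A sketch of enabling it on a standalone application cluster via dynamic properties, mirroring the Reactive Mode example above (the job class is a placeholder):
+
+```bash
+./bin/standalone-job.sh start \
+  -Djobmanager.scheduler=adaptive \
+  -j <JobMainClass>
+```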
+
+### Limitations
+
+- **Streaming jobs only**: The first version of Adaptive Scheduler runs with streaming jobs only. When submitting a batch job, we will automatically fall back to the default scheduler.
+- **No support for [local recovery]({{< ref "docs/ops/state/large_state_tuning">}}#task-local-recovery)**: Local recovery is a feature that schedules tasks to machines so that the state on that machine gets re-used if possible. The lack of this feature means that Adaptive Scheduler will always need to download the entire state from the checkpoint storage.
+- **No support for partial failover**: Partial failover means that the scheduler is able to restart parts ("regions" in Flink's internals) of a failed job, instead of the entire job. This limitation impacts only recovery time of embarrassingly parallel jobs: Flink's default scheduler can restart failed parts, while Adaptive Scheduler will restart the entire job.
+- **Limited integration with Flink's Web UI**: Adaptive Scheduler allows a job's parallelism to change over its lifetime. The web UI only shows the current parallelism of the job.
+- **Limited Job metrics**: With the exception of `numRestarts` all [availability]({{< ref "docs/ops/metrics" >}}#availability) and [checkpointing]({{< ref "docs/ops/metrics" >}}#checkpointing) metrics with the `Job` scope are not working correctly.
+- **Unused slots**: If the max parallelism for slot sharing groups is not equal, slots offered to Adaptive Scheduler might be unused.
+- Scaling events trigger job and task restarts, which will increase the number of Task attempts.
+
+
+{{< top >}}
diff --git a/docs/content.zh/docs/deployment/filesystems/_index.md b/docs/content.zh/docs/deployment/filesystems/_index.md
new file mode 100644
index 0000000000000..e77dee7bead36
--- /dev/null
+++ b/docs/content.zh/docs/deployment/filesystems/_index.md
@@ -0,0 +1,23 @@
+---
+title: File Systems
+bookCollapseSection: true
+weight: 6
+---
+
\ No newline at end of file
diff --git a/docs/content.zh/docs/deployment/filesystems/azure.md b/docs/content.zh/docs/deployment/filesystems/azure.md
new file mode 100644
index 0000000000000..728882126023c
--- /dev/null
+++ b/docs/content.zh/docs/deployment/filesystems/azure.md
@@ -0,0 +1,82 @@
+---
+title: Azure Blob 存储
+weight: 4
+type: docs
+aliases:
+ - /zh/deployment/filesystems/azure.html
+ - /zh/ops/filesystems/azure
+---
+
+
+# Azure Blob 存储
+
+[Azure Blob 存储](https://docs.microsoft.com/en-us/azure/storage/) 是一项由 Microsoft 管理的服务,能提供多种应用场景下的云存储。
+Azure Blob 存储可与 Flink 一起使用以**读取**和**写入数据**,以及与[流 State Backend]({{< ref "docs/ops/state/state_backends" >}}) 结合使用。
+
+
+
+通过以下格式指定路径,Azure Blob 存储对象可类似于普通文件使用:
+
+```plain
+wasb://<your-container>@$<your-azure-account>.blob.core.windows.net/<object-path>
+
+// SSL 加密访问
+wasbs://<your-container>@$<your-azure-account>.blob.core.windows.net/<object-path>
+```
+
+参见以下代码了解如何在 Flink 作业中使用 Azure Blob 存储:
+
+```java
+// 读取 Azure Blob 存储
+env.readTextFile("wasb://<your-container>@$<your-azure-account>.blob.core.windows.net/<object-path>");
+
+// 写入 Azure Blob 存储
+stream.writeAsText("wasb://@$.blob.core.windows.net/