From 9324ba9ef674aec857f6172c50a7290e3df5be71 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Mon, 20 Apr 2020 19:38:03 +0800
Subject: [PATCH 1/8] [SPARK-31498][SQL] Dump public static sql configurations
 through doc generation

---
 .../spark/sql/api/python/PythonSQLUtils.scala |  4 +-
 .../sql/api/python/PythonSQLUtilsSuite.scala  | 44 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
index 03f5a60aec438..634e3efcefea7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
@@ -23,7 +23,7 @@ import java.nio.channels.Channels
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.python.PythonRDDServer
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset, SQLContext}
+import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
 import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
@@ -42,6 +42,8 @@ private[sql] object PythonSQLUtils {
 
   def listSQLConfigs(): Array[(String, String, String, String)] = {
     val conf = new SQLConf()
+    // Force to build StaticSQLConf, which is a little bit hacky here
+    conf.warehousePath
     // Py4J doesn't seem to translate Seq well, so we convert to an Array.
     conf.getAllDefinedConfs.toArray
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala
new file mode 100644
index 0000000000000..eb85241b7839f
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.api.python
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
+
+class PythonSQLUtilsSuite extends SparkFunSuite {
+
+  test("list sql configurations should include all public one") {
+    val configs = PythonSQLUtils.listSQLConfigs()
+
+    // static sql configurations
+    assert(configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key),
+      "listSQLConfigs should contain public static sql configuration")
+    assert(!configs.exists(entry => entry._1 == StaticSQLConf.DEBUG_MODE.key),
+      "listSQLConfigs should not contain internal static sql configuration")
+
+    // dynamic sql configurations
+    assert(configs.exists(entry => entry._1 == SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key),
+      "listSQLConfigs should contain public dynamic sql configuration")
+    assert(!configs.exists(entry => entry._1 == SQLConf.ANALYZER_MAX_ITERATIONS.key),
+      "listSQLConfigs should not contain internal dynamic sql configuration")
+
+    // spark core configurations
+    assert(!configs.exists(entry => entry._1 == "spark.master"),
+      "listSQLConfigs should not contain internal dynamic sql configuration")
+  }
+}

From 991a7ecd8de3339feacede9af1e053e8f1673ce6 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 00:31:27 +0800
Subject: [PATCH 2/8] separate sql confs

---
 docs/.gitignore                               |  2 +-
 docs/configuration.md                         | 21 ++++++++++++---
 .../spark/sql/api/python/PythonSQLUtils.scala | 11 +++++---
 .../sql/api/python/PythonSQLUtilsSuite.scala  | 26 ++++++++++++++++---
 sql/create-docs.sh                            |  7 +++--
 sql/gen-sql-config-docs.py                    | 19 +++++++++++---
 6 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/docs/.gitignore b/docs/.gitignore
index 2260493b46ab3..4511dd55f7ec6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1 +1 @@
-sql-configs.html
+*sql-configs.html
diff --git a/docs/configuration.md b/docs/configuration.md
index 676ecf5a82d48..a6170d9fb52f3 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2622,16 +2622,31 @@ Please refer to the [Security](security.html) page for available options on how
 Spark subsystems.
 
-{% for static_file in site.static_files %}
-  {% if static_file.name == 'sql-configs.html' %}
 ### Spark SQL
-  {% include_relative sql-configs.html %}
+
+#### Runtime SQL Configuration
+
+Runtime SQL configurations inner-session, mutable Spark SQL configurations. They can be set and queried by SET commands and rest by RESET command, or by `SparkSession.conf`'s setter and getter methods.
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'runtime-sql-configs.html' %}
+  {% include_relative runtime-sql-configs.html %}
     {% break %}
   {% endif %}
 {% endfor %}
 
+#### Static SQL Configuration
+
+Static SQL configurations are cross-session, immutable Spark SQL configurations. External users can query the static sql configs value via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.
+
+{% for static_file in site.static_files %}
+  {% if static_file.name == 'static-sql-configs.html' %}
+  {% include_relative static-sql-configs.html %}
+  {% break %}
+  {% endif %}
+{% endfor %}
+
 ### Spark Streaming
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
index 634e3efcefea7..eef86cbdce41f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
 import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.execution.{ExplainMode, QueryExecution}
 import org.apache.spark.sql.execution.arrow.ArrowConverters
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 import org.apache.spark.sql.types.DataType
 
 private[sql] object PythonSQLUtils {
@@ -42,12 +42,17 @@ private[sql] object PythonSQLUtils {
 
   def listSQLConfigs(): Array[(String, String, String, String)] = {
     val conf = new SQLConf()
-    // Force to build StaticSQLConf, which is a little bit hacky here
-    conf.warehousePath
     // Py4J doesn't seem to translate Seq well, so we convert to an Array.
     conf.getAllDefinedConfs.toArray
   }
 
+  def listStaticSQLConfigs(): Array[(String, String, String, String)] = {
+    val conf = new SQLConf()
+    // Force to build static SQL configurations
+    StaticSQLConf.WAREHOUSE_PATH.key -> ()
+    conf.getAllDefinedConfs.filter(p => SQLConf.staticConfKeys.contains(p._1)).toArray
+  }
+
   /**
    * Python callable function to read a file in Arrow stream format and create a [[RDD]]
    * using each serialized ArrowRecordBatch as a partition.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala
index eb85241b7839f..0d18d123e328a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/api/python/PythonSQLUtilsSuite.scala
@@ -22,11 +22,11 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 
 class PythonSQLUtilsSuite extends SparkFunSuite {
 
-  test("list sql configurations should include all public one") {
+  test("listing sql configurations contains runtime ones only") {
     val configs = PythonSQLUtils.listSQLConfigs()
 
     // static sql configurations
-    assert(configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key),
-      "listSQLConfigs should contain public static sql configuration")
+    assert(!configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key),
+      "listSQLConfigs should not contain public static sql configuration")
     assert(!configs.exists(entry => entry._1 == StaticSQLConf.DEBUG_MODE.key),
       "listSQLConfigs should not contain internal static sql configuration")
@@ -39,6 +39,26 @@ class PythonSQLUtilsSuite extends SparkFunSuite {
 
     // spark core configurations
     assert(!configs.exists(entry => entry._1 == "spark.master"),
-      "listSQLConfigs should not contain internal dynamic sql configuration")
+      "listSQLConfigs should not contain core configuration")
+  }
+
+  test("listing static sql configurations contains public static ones only") {
+    val configs = PythonSQLUtils.listStaticSQLConfigs()
+
+    // static sql configurations
+    assert(configs.exists(entry => entry._1 == StaticSQLConf.SPARK_SESSION_EXTENSIONS.key),
+      "listStaticSQLConfigs should contain public static sql configuration")
+    assert(!configs.exists(entry => entry._1 == StaticSQLConf.DEBUG_MODE.key),
+      "listStaticSQLConfigs should not contain internal static sql configuration")
+
+    // dynamic sql configurations
+    assert(!configs.exists(entry => entry._1 == SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key),
+      "listStaticSQLConfigs should not contain dynamic sql configuration")
+    assert(!configs.exists(entry => entry._1 == SQLConf.ANALYZER_MAX_ITERATIONS.key),
+      "listStaticSQLConfigs should not contain internal dynamic sql configuration")
+
+    // spark core configurations
+    assert(!configs.exists(entry => entry._1 == "spark.master"),
+      "listStaticSQLConfigs should not contain core configuration")
   }
 }
diff --git a/sql/create-docs.sh b/sql/create-docs.sh
index 44aa877332fd5..d4ed96f396a2f 100755
--- a/sql/create-docs.sh
+++ b/sql/create-docs.sh
@@ -45,8 +45,11 @@ mkdir docs
 echo "Generating SQL API Markdown files."
 "$SPARK_HOME/bin/spark-submit" gen-sql-api-docs.py
 
-echo "Generating SQL configuration table HTML file."
-"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py
+echo "Generating runtime SQL configuration table HTML file."
+"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py runtime
+
+echo "Generating static SQL configuration table HTML file."
+"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py static
 
 echo "Generating HTML files for SQL API documentation."
 mkdocs build --clean
diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py
index 98212ad373370..02832d1d58c9f 100644
--- a/sql/gen-sql-config-docs.py
+++ b/sql/gen-sql-config-docs.py
@@ -17,6 +17,7 @@
 
 import os
 import re
+import sys
 from collections import namedtuple
 from textwrap import dedent
@@ -28,7 +29,11 @@
     "SQLConfEntry", ["name", "default", "description", "version"])
 
 
-def get_public_sql_configs(jvm):
+def get_public_sql_configs(jvm, group):
+    if group == "static":
+        config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listStaticSQLConfigs()
+    else:
+        config_set = jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listSQLConfigs()
     sql_configs = [
         SQLConfEntry(
             name=_sql_config._1(),
@@ -36,7 +41,7 @@ def get_public_sql_configs(jvm):
             description=_sql_config._3(),
             version=_sql_config._4()
         )
-        for _sql_config in jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listSQLConfigs()
+        for _sql_config in config_set
     ]
     return sql_configs
@@ -112,10 +117,16 @@ def generate_sql_configs_table(sql_configs, path):
 
 
 if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: ./bin/spark-submit sql/gen-sql-config-docs.py <group>")
+        sys.exit(-1)
+    else:
+        group = sys.argv[1]
+
     jvm = launch_gateway().jvm
-    sql_configs = get_public_sql_configs(jvm)
+    sql_configs = get_public_sql_configs(jvm, group)
 
     spark_root_dir = os.path.dirname(os.path.dirname(__file__))
-    sql_configs_table_path = os.path.join(spark_root_dir, "docs/sql-configs.html")
+    sql_configs_table_path = os.path.join(spark_root_dir, "docs", group + "-sql-configs.html")
 
     generate_sql_configs_table(sql_configs, path=sql_configs_table_path)

From 284ab7008a8cbb5f9df1d27652b4ad0b622c2395 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 09:53:58 +0800
Subject: [PATCH 3/8] fix test

---
 .../scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
index eef86cbdce41f..31c5f301738b8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
@@ -43,7 +43,7 @@ private[sql] object PythonSQLUtils {
   def listSQLConfigs(): Array[(String, String, String, String)] = {
     val conf = new SQLConf()
     // Py4J doesn't seem to translate Seq well, so we convert to an Array.
-    conf.getAllDefinedConfs.toArray
+    conf.getAllDefinedConfs.filterNot(p => SQLConf.staticConfKeys.contains(p._1)).toArray
   }
 
   def listStaticSQLConfigs(): Array[(String, String, String, String)] = {

From b61b46cc06dfc8f13a9164f5f3fd9584dcdbeb99 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 10:16:56 +0800
Subject: [PATCH 4/8] nit

---
 docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index ff7b2a933f824..4240bfcc64f0b 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2638,7 +2638,7 @@ Runtime SQL configurations inner-session, mutable Spark SQL configurations. They
 
 #### Static SQL Configuration
 
-Static SQL configurations are cross-session, immutable Spark SQL configurations. External users can query the static sql configs value via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.
+Static SQL configurations are cross-session, immutable Spark SQL configurations. External users can query the static sql config values via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.

From 4fca3fdb0205cf572c56967567b9492d6db84a37 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 12:46:51 +0800
Subject: [PATCH 5/8] fix

---
 docs/configuration.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 4240bfcc64f0b..3b9acc6d6b92b 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2641,8 +2641,8 @@ Runtime SQL configurations inner-session, mutable Spark SQL configurations. They
 Static SQL configurations are cross-session, immutable Spark SQL configurations. External users can query the static sql config values via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.
 
 {% for static_file in site.static_files %}
-  {% if static_file.name == 'generated-runtime-sql-config-table.html' %}
-  {% include_relative generated-runtime-sql-config-table.html %}
+  {% if static_file.name == 'generated-static-sql-config-table.html' %}
+  {% include_relative generated-static-sql-config-table.html %}
   {% break %}
   {% endif %}
 {% endfor %}

From 5a42b0df3ce64b72572a69a4a5a8f667518f487e Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 17:22:49 +0800
Subject: [PATCH 6/8] doc

---
 docs/configuration.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 3b9acc6d6b92b..36b967016d549 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2626,7 +2626,10 @@ Spark subsystems.
 
 #### Runtime SQL Configuration
 
-Runtime SQL configurations inner-session, mutable Spark SQL configurations. They can be set and queried by SET commands and rest by RESET command, or by `SparkSession.conf`'s setter and getter methods.
+Runtime SQL configurations per-session, mutable Spark SQL configurations. They can be set with initial values by the config file
+and command-line options prefixed with `--conf/-c`, or by setting the `SparkConf` that is used to create `SparkSession`.
+Also, they can be set and queried by SET commands and reset to their initial values by the RESET command,
+or by `SparkSession.conf`'s setter and getter methods at runtime.
 
 {% for static_file in site.static_files %}
   {% if static_file.name == 'generated-runtime-sql-config-table.html' %}
   {% include_relative generated-runtime-sql-config-table.html %}
     {% break %}
   {% endif %}
 {% endfor %}
@@ -2638,7 +2641,9 @@ Runtime SQL configurations inner-session, mutable Spark SQL configurations. They
 
 #### Static SQL Configuration
 
-Static SQL configurations are cross-session, immutable Spark SQL configurations. External users can query the static sql config values via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.
+Static SQL configurations are cross-session, immutable Spark SQL configurations. They can be set with final values by the config file
+and command-line options prefixed with `--conf/-c`, or by setting the `SparkConf` that is used to create `SparkSession`.
+External users can query the static sql config values via `SparkSession.conf` or via set command, e.g. `SET spark.sql.extensions;`, but cannot set/unset them.
 
 {% for static_file in site.static_files %}
   {% if static_file.name == 'generated-static-sql-config-table.html' %}
   {% include_relative generated-static-sql-config-table.html %}
   {% break %}
   {% endif %}
 {% endfor %}

From 1de696e81f23361693519acc50a65f9870e6dea7 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 17:24:40 +0800
Subject: [PATCH 7/8] doc

---
 docs/configuration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/configuration.md b/docs/configuration.md
index 36b967016d549..6faa5e749bfad 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -2626,7 +2626,7 @@ Spark subsystems.
 
 #### Runtime SQL Configuration
 
-Runtime SQL configurations per-session, mutable Spark SQL configurations. They can be set with initial values by the config file
+Runtime SQL configurations are per-session, mutable Spark SQL configurations. They can be set with initial values by the config file
 and command-line options prefixed with `--conf/-c`, or by setting the `SparkConf` that is used to create `SparkSession`.
 Also, they can be set and queried by SET commands and reset to their initial values by the RESET command,
 or by `SparkSession.conf`'s setter and getter methods at runtime.

From 81e043f592da25d03fe7674f00eeab9333a473a5 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Tue, 21 Apr 2020 19:11:17 +0800
Subject: [PATCH 8/8] simplify

---
 .../scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
index 31c5f301738b8..2e5f59edcf1da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala
@@ -49,7 +49,7 @@ private[sql] object PythonSQLUtils {
   def listStaticSQLConfigs(): Array[(String, String, String, String)] = {
     val conf = new SQLConf()
     // Force to build static SQL configurations
-    StaticSQLConf.WAREHOUSE_PATH.key -> ()
+    StaticSQLConf
     conf.getAllDefinedConfs.filter(p => SQLConf.staticConfKeys.contains(p._1)).toArray
   }
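
Usage sketch for the runtime/static split documented above. This is an editorial illustration, not part of the change set; the local master, warehouse path, and config values are placeholders:

    import org.apache.spark.sql.SparkSession

    // Static SQL configurations are fixed when the session is created, e.g.
    // via the builder/SparkConf or `--conf` on the command line.
    val spark = SparkSession.builder()
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "/tmp/sketch-warehouse") // static conf
      .getOrCreate()

    // Runtime SQL configurations are mutable per session.
    spark.conf.set("spark.sql.shuffle.partitions", "64")
    spark.sql("SET spark.sql.shuffle.partitions").show(truncate = false)
    spark.sql("RESET") // restores runtime configs to their initial values

    // Static SQL configurations can be queried but not modified at runtime.
    println(spark.conf.get("spark.sql.warehouse.dir"))
    // spark.conf.set("spark.sql.extensions", "...") would fail with
    // "Cannot modify the value of a static config".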
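The bare `StaticSQLConf` reference in patch 8 works because Scala objects initialize on first reference, and each config entry is a `val` whose construction registers the entry in a shared registry that `getAllDefinedConfs` reads. A toy model of that registration pattern follows; the names are illustrative, not Spark's actual internals:

    import scala.collection.concurrent.TrieMap

    object ToyRegistry {
      val entries = TrieMap.empty[String, String]
      def register(key: String, default: String): String = {
        entries.put(key, default)
        key
      }
    }

    object ToyStaticConf {
      // Runs once, the first time ToyStaticConf is referenced from anywhere.
      val WAREHOUSE_PATH: String =
        ToyRegistry.register("toy.sql.warehouse.dir", "spark-warehouse")
    }

    object InitDemo extends App {
      assert(ToyRegistry.entries.isEmpty) // nothing registered yet
      ToyStaticConf                       // bare reference forces initialization
      assert(ToyRegistry.entries.contains("toy.sql.warehouse.dir"))
    }

The earlier `conf.warehousePath` (patch 1) and `StaticSQLConf.WAREHOUSE_PATH.key -> ()` (patch 2) forced the same initialization indirectly; patch 8 states the intent directly.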
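After patch 3, the two listing methods are complementary partitions of `getAllDefinedConfs`, keyed on `SQLConf.staticConfKeys`. A sketch of the same split done in one pass, assuming it is compiled inside an `org.apache.spark.sql` package since these internals are `private[sql]`:

    import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}

    StaticSQLConf // force static config entries to register, as patch 8 does

    val conf = new SQLConf()
    // The two halves are exactly what listStaticSQLConfigs() and
    // listSQLConfigs() return after this series.
    val (staticConfs, runtimeConfs) =
      conf.getAllDefinedConfs.partition(p => SQLConf.staticConfKeys.contains(p._1))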
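Each listed entry is a 4-tuple of (name, default, description, version), which gen-sql-config-docs.py renders into an HTML table per group. A rough Scala equivalent of the table emitter, shown only to document the tuple layout; the real script's HTML output is richer than this:

    object ConfigTableSketch {
      def toHtmlTable(configs: Array[(String, String, String, String)]): String = {
        val rows = configs.sortBy(_._1).map { case (name, default, doc, version) =>
          s"<tr><td><code>$name</code></td><td>$default</td><td>$doc</td><td>$version</td></tr>"
        }
        rows.mkString("<table>\n", "\n", "\n</table>")
      }
    }

For example, `ConfigTableSketch.toHtmlTable(PythonSQLUtils.listStaticSQLConfigs())` would produce rows for the static-config table.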