|
| 1 | +<!--- |
| 2 | + Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + or more contributor license agreements. See the NOTICE file |
| 4 | + distributed with this work for additional information |
| 5 | + regarding copyright ownership. The ASF licenses this file |
| 6 | + to you under the Apache License, Version 2.0 (the |
| 7 | + "License"); you may not use this file except in compliance |
| 8 | + with the License. You may obtain a copy of the License at |
| 9 | +
|
| 10 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +
|
| 12 | + Unless required by applicable law or agreed to in writing, |
| 13 | + software distributed under the License is distributed on an |
| 14 | + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + KIND, either express or implied. See the License for the |
| 16 | + specific language governing permissions and limitations |
| 17 | + under the License. |
| 18 | +--> |
| 19 | + |
| 20 | +# Datafusion output field name semantic |
| 21 | + |
| 22 | +This specification documents how field names in output record batches should be |
| 23 | +generated based on given user queries. The filed name rules apply to |
| 24 | +Datafusion queries planned from both SQL queries and Dataframe APIs. |
| 25 | + |
| 26 | +## Field name rules |
| 27 | + |
| 28 | +* All field names MUST not contain relation/table qualifier. |
| 29 | + * Both `SELECT t1.id`, `SELECT id` and `df.select_columns(&["id"])` SHOULD result in field name: `id` |
| 30 | +* Function names MUST be converted to lowercase. |
| 31 | + * `SELECT AVG(c1)` SHOULD result in field name: `avg(c1)` |
| 32 | +* Literal string MUST not be wrapped with quotes or double quotes. |
| 33 | + * `SELECT 'foo'` SHOULD result in field name: `foo` |
| 34 | +* Operator expressions MUST be wrapped with parentheses. |
| 35 | + * `SELECT -2` SHOULD result in field name: `(- 2)` |
| 36 | +* Operator and operand MUST be separated by spaces. |
| 37 | + * `SELECT 1+2` SHOULD result in field name: `(1 + 2)` |
| 38 | +* Function arguments MUST be separated by a comma `,` and a space. |
| 39 | + * `SELECT f(c1,c2)` and `df.select(vec![f.udf("f")?.call(vec![col("c1"), col("c2")])])` SHOULD result in field name: `f(c1, c2)` |
| 40 | + |
| 41 | +## Appendices |
| 42 | + |
| 43 | +### Examples and comparison with other systems |
| 44 | + |
| 45 | +Data schema for test sample queries: |
| 46 | + |
| 47 | +``` |
| 48 | +CREATE TABLE t1 (id INT, a VARCHAR(5)); |
| 49 | +INSERT INTO t1 (id, a) VALUES (1, 'foo'); |
| 50 | +INSERT INTO t1 (id, a) VALUES (2, 'bar'); |
| 51 | +
|
| 52 | +CREATE TABLE t2 (id INT, b VARCHAR(5)); |
| 53 | +INSERT INTO t2 (id, b) VALUES (1, 'hello'); |
| 54 | +INSERT INTO t2 (id, b) VALUES (2, 'world'); |
| 55 | +``` |
| 56 | + |
| 57 | +#### Projected columns |
| 58 | + |
| 59 | +Query: |
| 60 | + |
| 61 | +``` |
| 62 | +SELECT t1.id, a, t2.id, b |
| 63 | +FROM t1 |
| 64 | +JOIN t2 ON t1.id = t2.id |
| 65 | +``` |
| 66 | + |
| 67 | +Datafusion Arrow record batches output: |
| 68 | + |
| 69 | +| id | a | id | b | |
| 70 | +|----|-----|----|-------| |
| 71 | +| 1 | foo | 1 | hello | |
| 72 | +| 2 | bar | 2 | world | |
| 73 | + |
| 74 | + |
| 75 | +Spark, MySQL 8 and PostgreSQL 13 output: |
| 76 | + |
| 77 | +| id | a | id | b | |
| 78 | +|----|-----|----|-------| |
| 79 | +| 1 | foo | 1 | hello | |
| 80 | +| 2 | bar | 2 | world | |
| 81 | + |
| 82 | +SQLite 3 output: |
| 83 | + |
| 84 | +| id | a | b | |
| 85 | +|----|-----|-------| |
| 86 | +| 1 | foo | hello | |
| 87 | +| 2 | bar | world | |
| 88 | + |
| 89 | + |
| 90 | +#### Function transformed columns |
| 91 | + |
| 92 | +Query: |
| 93 | + |
| 94 | +``` |
| 95 | +SELECT ABS(t1.id), abs(-id) FROM t1; |
| 96 | +``` |
| 97 | + |
| 98 | +Datafusion Arrow record batches output: |
| 99 | + |
| 100 | +| abs(id) | abs((- id)) | |
| 101 | +|---------|-------------| |
| 102 | +| 1 | 1 | |
| 103 | +| 2 | 2 | |
| 104 | + |
| 105 | + |
| 106 | +Spark output: |
| 107 | + |
| 108 | +| abs(id) | abs((- id)) | |
| 109 | +|---------|-------------| |
| 110 | +| 1 | 1 | |
| 111 | +| 2 | 2 | |
| 112 | + |
| 113 | + |
| 114 | +MySQL 8 output: |
| 115 | + |
| 116 | +| ABS(t1.id) | abs(-id) | |
| 117 | +|------------|----------| |
| 118 | +| 1 | 1 | |
| 119 | +| 2 | 2 | |
| 120 | + |
| 121 | +PostgreSQL 13 output: |
| 122 | + |
| 123 | +| abs | abs | |
| 124 | +|-----|-----| |
| 125 | +| 1 | 1 | |
| 126 | +| 2 | 2 | |
| 127 | + |
| 128 | +SQlite 3 output: |
| 129 | + |
| 130 | +| ABS(t1.id) | abs(-id) | |
| 131 | +|------------|----------| |
| 132 | +| 1 | 1 | |
| 133 | +| 2 | 2 | |
| 134 | + |
| 135 | + |
| 136 | +#### Function with operators |
| 137 | + |
| 138 | +Query: |
| 139 | + |
| 140 | +``` |
| 141 | +SELECT t1.id + ABS(id), ABS(id * t1.id) FROM t1; |
| 142 | +``` |
| 143 | + |
| 144 | +Datafusion Arrow record batches output: |
| 145 | + |
| 146 | +| id + abs(id) | abs(id * id) | |
| 147 | +|--------------|--------------| |
| 148 | +| 2 | 1 | |
| 149 | +| 4 | 4 | |
| 150 | + |
| 151 | + |
| 152 | +Spark output: |
| 153 | + |
| 154 | +| id + abs(id) | abs(id * id) | |
| 155 | +|--------------|--------------| |
| 156 | +| 2 | 1 | |
| 157 | +| 4 | 4 | |
| 158 | + |
| 159 | +MySQL 8 output: |
| 160 | + |
| 161 | +| t1.id + ABS(id) | ABS(id * t1.id) | |
| 162 | +|-----------------|-----------------| |
| 163 | +| 2 | 1 | |
| 164 | +| 4 | 4 | |
| 165 | + |
| 166 | +PostgreSQL output: |
| 167 | + |
| 168 | +| ?column? | abs | |
| 169 | +|----------|-----| |
| 170 | +| 2 | 1 | |
| 171 | +| 4 | 4 | |
| 172 | + |
| 173 | +SQLite output: |
| 174 | + |
| 175 | +| t1.id + ABS(id) | ABS(id * t1.id) | |
| 176 | +|-----------------|-----------------| |
| 177 | +| 2 | 1 | |
| 178 | +| 4 | 4 | |
| 179 | + |
| 180 | + |
| 181 | +#### Project literals |
| 182 | + |
| 183 | +Query: |
| 184 | + |
| 185 | +``` |
| 186 | +SELECT 1, 2+5, 'foo_bar'; |
| 187 | +``` |
| 188 | + |
| 189 | +Datafusion Arrow record batches output: |
| 190 | + |
| 191 | +| 1 | (2 + 5) | foo_bar | |
| 192 | +|---|---------|---------| |
| 193 | +| 1 | 7 | foo_bar | |
| 194 | + |
| 195 | + |
| 196 | +Spark output: |
| 197 | + |
| 198 | +| 1 | (2 + 5) | foo_bar | |
| 199 | +|---|---------|---------| |
| 200 | +| 1 | 7 | foo_bar | |
| 201 | + |
| 202 | +MySQL output: |
| 203 | + |
| 204 | +| 1 | 2+5 | foo_bar | |
| 205 | +|---|-----|---------| |
| 206 | +| 1 | 7 | foo_bar | |
| 207 | + |
| 208 | + |
| 209 | +PostgreSQL output: |
| 210 | + |
| 211 | +| ?column? | ?column? | ?column? | |
| 212 | +|----------|----------|----------| |
| 213 | +| 1 | 7 | foo_bar | |
| 214 | + |
| 215 | + |
| 216 | +SQLite 3 output: |
| 217 | + |
| 218 | +| 1 | 2+5 | 'foo_bar' | |
| 219 | +|---|-----|-----------| |
| 220 | +| 1 | 7 | foo_bar | |
0 commit comments