Skip to content

Commit c6b09c0

Browse files
[SPARK-49894][PYTHON][CONNECT] Refine the string representation of column field operations
### What changes were proposed in this pull request?

Refine the string representation of column field operations: `GetField`, `WithField`, and `DropFields`.

### Why are the changes needed?

To make the string representations consistent between PySpark Classic and Connect.

### Does this PR introduce _any_ user-facing change?

Yes.

Before:

```
In [1]: from pyspark.sql import functions as sf

In [2]: c = sf.col("c")

In [3]: c.x
Out[3]: Column<'UnresolvedExtractValue(c, x)'>
```

After:

```
In [1]: from pyspark.sql import functions as sf

In [2]: c = sf.col("c")

In [3]: c.x
Out[3]: Column<'c['x']'>
```

### How was this patch tested?

Added unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48369 from zhengruifeng/py_connect_col_str.

Lead-authored-by: Ruifeng Zheng <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent d8aca18 commit c6b09c0

File tree

2 files changed

+74
-3
lines changed

2 files changed

+74
-3
lines changed

python/pyspark/sql/connect/expressions.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -809,7 +809,7 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression:
809809
return expr
810810

811811
def __repr__(self) -> str:
812-
return f"WithField({self._structExpr}, {self._fieldName}, {self._valueExpr})"
812+
return f"update_field({self._structExpr}, {self._fieldName}, {self._valueExpr})"
813813

814814

815815
class DropField(Expression):
@@ -833,7 +833,7 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression:
833833
return expr
834834

835835
def __repr__(self) -> str:
836-
return f"DropField({self._structExpr}, {self._fieldName})"
836+
return f"drop_field({self._structExpr}, {self._fieldName})"
837837

838838

839839
class UnresolvedExtractValue(Expression):
@@ -857,7 +857,7 @@ def to_plan(self, session: "SparkConnectClient") -> proto.Expression:
857857
return expr
858858

859859
def __repr__(self) -> str:
860-
return f"UnresolvedExtractValue({str(self._child)}, {str(self._extraction)})"
860+
return f"{self._child}['{self._extraction}']"
861861

862862

863863
class UnresolvedRegex(Expression):

python/pyspark/sql/tests/test_column.py

Lines changed: 71 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -283,6 +283,77 @@ def test_expr_str_representation(self):
283283
when_cond = sf.when(expression, sf.lit(None))
284284
self.assertEqual(str(when_cond), "Column<'CASE WHEN foo THEN NULL END'>")
285285

286+
def test_col_field_ops_representation(self):
287+
# SPARK-49894: Test string representation of columns
288+
c = sf.col("c")
289+
290+
# getField
291+
self.assertEqual(str(c.x), "Column<'c['x']'>")
292+
self.assertEqual(str(c.x.y), "Column<'c['x']['y']'>")
293+
self.assertEqual(str(c.x.y.z), "Column<'c['x']['y']['z']'>")
294+
295+
self.assertEqual(str(c["x"]), "Column<'c['x']'>")
296+
self.assertEqual(str(c["x"]["y"]), "Column<'c['x']['y']'>")
297+
self.assertEqual(str(c["x"]["y"]["z"]), "Column<'c['x']['y']['z']'>")
298+
299+
self.assertEqual(str(c.getField("x")), "Column<'c['x']'>")
300+
self.assertEqual(
301+
str(c.getField("x").getField("y")),
302+
"Column<'c['x']['y']'>",
303+
)
304+
self.assertEqual(
305+
str(c.getField("x").getField("y").getField("z")),
306+
"Column<'c['x']['y']['z']'>",
307+
)
308+
309+
self.assertEqual(str(c.getItem("x")), "Column<'c['x']'>")
310+
self.assertEqual(
311+
str(c.getItem("x").getItem("y")),
312+
"Column<'c['x']['y']'>",
313+
)
314+
self.assertEqual(
315+
str(c.getItem("x").getItem("y").getItem("z")),
316+
"Column<'c['x']['y']['z']'>",
317+
)
318+
319+
self.assertEqual(
320+
str(c.x["y"].getItem("z")),
321+
"Column<'c['x']['y']['z']'>",
322+
)
323+
self.assertEqual(
324+
str(c["x"].getField("y").getItem("z")),
325+
"Column<'c['x']['y']['z']'>",
326+
)
327+
self.assertEqual(
328+
str(c.getField("x").getItem("y").z),
329+
"Column<'c['x']['y']['z']'>",
330+
)
331+
self.assertEqual(
332+
str(c["x"].y.getField("z")),
333+
"Column<'c['x']['y']['z']'>",
334+
)
335+
336+
# WithField
337+
self.assertEqual(
338+
str(c.withField("x", sf.col("y"))),
339+
"Column<'update_field(c, x, y)'>",
340+
)
341+
self.assertEqual(
342+
str(c.withField("x", sf.col("y")).withField("x", sf.col("z"))),
343+
"Column<'update_field(update_field(c, x, y), x, z)'>",
344+
)
345+
346+
# DropFields
347+
self.assertEqual(str(c.dropFields("x")), "Column<'drop_field(c, x)'>")
348+
self.assertEqual(
349+
str(c.dropFields("x", "y")),
350+
"Column<'drop_field(drop_field(c, x), y)'>",
351+
)
352+
self.assertEqual(
353+
str(c.dropFields("x", "y", "z")),
354+
"Column<'drop_field(drop_field(drop_field(c, x), y), z)'>",
355+
)
356+
286357
def test_lit_time_representation(self):
287358
dt = datetime.date(2021, 3, 4)
288359
self.assertEqual(str(sf.lit(dt)), "Column<'2021-03-04'>")

0 commit comments

Comments
 (0)