From be7ada4309c509152e8488f5c098cda9ef052cd3 Mon Sep 17 00:00:00 2001
From: Charles Yu
Date: Fri, 10 Oct 2025 16:18:37 -0400
Subject: [PATCH 01/21] Extract Spark Plan product with keys for Scala 2.13

---
 .../spark/Spark213Instrumentation.java        |  42 +++++++
 .../instrumentation/spark/SparkPlanUtils.java | 111 ++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java

diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java
index 1b9f19e7178..33dc7a0f155 100644
--- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java
+++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java
@@ -8,8 +8,12 @@ import datadog.trace.api.Config;
 import net.bytebuddy.asm.Advice;
 import org.apache.spark.SparkContext;
+import org.apache.spark.sql.execution.SparkPlan;
+import org.apache.spark.sql.execution.SparkPlanInfo;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import scala.collection.JavaConverters;
+import scala.collection.immutable.HashMap;
 
 @AutoService(InstrumenterModule.class)
 public class Spark213Instrumentation extends AbstractSparkInstrumentation {
@@ -27,9 +31,22 @@ public String[] helperClassNames() {
       packageName + ".SparkSQLUtils",
       packageName + ".SparkSQLUtils$SparkPlanInfoForStage",
       packageName + ".SparkSQLUtils$AccumulatorWithStage",
+      packageName + ".SparkPlanUtils",
     };
   }
 
+  @Override
+  public String[] knownMatchingTypes() {
+    String[] res = new String[super.knownMatchingTypes().length + 1];
+    int idx = 0;
+    for (String match : super.knownMatchingTypes()) {
+      res[idx] = match;
+      idx++;
+    }
+    res[idx] = "org.apache.spark.sql.execution.SparkPlanInfo$";
+    return res;
+  }
+
   @Override
   public void methodAdvice(MethodTransformer transformer) {
     super.methodAdvice(transformer);
@@ -40,6 +57,13 @@ public void methodAdvice(MethodTransformer transformer) {
             .and(isDeclaredBy(named("org.apache.spark.SparkContext")))
             .and(takesNoArguments()),
         Spark213Instrumentation.class.getName() + "$InjectListener");
+
+    transformer.applyAdvice(
+        isMethod()
+            .and(named("fromSparkPlan"))
+            .and(takesArgument(0, named("org.apache.spark.sql.execution.SparkPlan")))
+            .and(isDeclaredBy(named("org.apache.spark.sql.execution.SparkPlanInfo$"))),
+        Spark213Instrumentation.class.getName() + "$SparkPlanInfoAdvice");
   }
 
   public static class InjectListener {
@@ -79,4 +103,22 @@ public static void enter(@Advice.This SparkContext sparkContext) {
       sparkContext.listenerBus().addToSharedQueue(AbstractDatadogSparkListener.listener);
     }
   }
+
+  public static class SparkPlanInfoAdvice {
+    @Advice.OnMethodExit(suppress = Throwable.class, onThrowable = Throwable.class)
+    public static void exit(
+        @Advice.Return(readOnly = false) SparkPlanInfo planInfo,
+        @Advice.Argument(0) SparkPlan plan) {
+      if (planInfo.metadata().size() == 0) {
+        planInfo =
+            new SparkPlanInfo(
+                planInfo.nodeName(),
+                planInfo.simpleString(),
+                planInfo.children(),
+                HashMap.from(
+                    JavaConverters.asScala(SparkPlanUtils.extractPlanProduct(plan)).toList()),
+                planInfo.metrics());
+      }
+    }
+  }
 }
diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java
new file mode 100644
index 00000000000..a3227e75b6d
--- /dev/null
+++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java
@@ -0,0 +1,111 @@
+package datadog.trace.instrumentation.spark;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import org.apache.spark.sql.catalyst.trees.TreeNode;
+import org.apache.spark.sql.execution.ReusedSubqueryExec;
+import org.apache.spark.sql.execution.SparkPlan;
+import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec;
+import org.apache.spark.sql.execution.adaptive.QueryStageExec;
+import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec;
+import org.apache.spark.sql.execution.exchange.ReusedExchangeExec;
+import scala.None$;
+import scala.collection.JavaConverters;
+import scala.collection.immutable.$colon$colon;
+import scala.collection.immutable.Iterable;
+import scala.collection.immutable.Nil$;
+
+// An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here:
+// https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54
+public class SparkPlanUtils {
+  public static ArrayList extractChildren(SparkPlan plan) {
+    /*
+    Get children of this node. Logic in Spark:
+
+    val children = plan match {
+      case ReusedExchangeExec(_, child) => child :: Nil
+      case ReusedSubqueryExec(child) => child :: Nil
+      case a: AdaptiveSparkPlanExec => a.executedPlan :: Nil
+      case stage: QueryStageExec => stage.plan :: Nil
+      case inMemTab: InMemoryTableScanExec => inMemTab.relation.cachedPlan :: Nil
+      case EmptyRelationExec(logical) => (logical :: Nil)
+      case _ => plan.children ++ plan.subqueries
+    }
+    */
+    ArrayList children = new ArrayList<>();
+    if (plan instanceof ReusedExchangeExec) {
+      children.add(((ReusedExchangeExec) plan).child());
+    } else if (plan instanceof ReusedSubqueryExec) {
+      children.add(((ReusedSubqueryExec) plan).child());
+    } else if (plan instanceof AdaptiveSparkPlanExec) {
+      children.add(((AdaptiveSparkPlanExec) plan).executedPlan());
+    } else if (plan instanceof QueryStageExec) {
+      children.add(((QueryStageExec) plan).plan());
+    } else if (plan instanceof InMemoryTableScanExec) {
+      children.add(((InMemoryTableScanExec) plan).relation().cachedPlan());
+      // New as of Spark 4.0.0
+      // } else if (plan instanceof EmptyRelationExec) {
+      //   children.add(((EmptyRelationExec) plan).logical);
+    }
+
+    for (Iterator it = JavaConverters.asJavaIterator(plan.subqueries().iterator());
+        it.hasNext(); ) {
+      children.add(it.next());
+    }
+    for (Iterator it = JavaConverters.asJavaIterator(plan.children().iterator());
+        it.hasNext(); ) {
+      children.add(it.next());
+    }
+
+    return children;
+  }
+
+  public static Map extractPlanProduct(SparkPlan plan) {
+    HashMap args = new HashMap<>();
+    HashMap unparsed = new HashMap<>();
+
+    int i = 0;
+    for (Iterator it = JavaConverters.asJavaIterator(plan.productIterator());
+        it.hasNext(); ) {
+      Object obj = it.next();
+
+      // TODO: improve parsing of certain types
+      // 1. Some() should be unwrapped
+      // 2. requiredSchema on Scan * (currently showing StructType)
+
+      // TODO: support a few more common types?
+ // condition=org.apache.spark.sql.catalyst.expressions.objects.Invoke + // joinType=org.apache.spark.sql.catalyst.plans.Inner$ + // buildSide=org.apache.spark.sql.catalyst.optimizer.BuildRight$ + // shuffleOrigin=org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS$ + // outputPartitioning=org.apache.spark.sql.catalyst.plans.physical.SinglePartition$ + if (obj instanceof String + || obj instanceof Boolean + || obj instanceof Collection + || obj instanceof None$ + || obj instanceof Integer) { + args.put(plan.productElementName(i), obj.toString()); + } else if (obj instanceof $colon$colon || obj instanceof Nil$) { + args.put(plan.productElementName(i), JavaConverters.asJava(((Iterable) obj)).toString()); + } else if (obj instanceof TreeNode) { + // Filter out any potential child nodes + // TODO: Exempt conditions from this branch + // e.g. condition=class org.apache.spark.sql.catalyst.expressions.objects.Invoke + unparsed.put(plan.productElementName(i), obj.getClass().getName()); + } else { + args.put(plan.productElementName(i), obj.toString()); + } + + i++; + } + + if (unparsed.size() > 0) { + // For now, place what we can't parse here with the types so we're aware of them + args.put("_dd.unparsed", unparsed.toString()); + } + return args; + } +} From 9c19e11f7c2486e887dd16fec94c4c138be49dd6 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Fri, 10 Oct 2025 17:10:33 -0400 Subject: [PATCH 02/21] Extract Spark Plan product values for Spark 2.12 --- .../spark/Spark212Instrumentation.java | 43 ++++++++++++ .../spark/Spark212PlanUtils.java | 66 +++++++++++++++++++ .../spark/Spark213Instrumentation.java | 5 +- ...kPlanUtils.java => Spark213PlanUtils.java} | 39 +++-------- .../spark/CommonSparkPlanUtils.java | 40 +++++++++++ 5 files changed, 160 insertions(+), 33 deletions(-) create mode 100644 dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java rename dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/{SparkPlanUtils.java => Spark213PlanUtils.java} (65%) create mode 100644 dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 1e5dba63bfb..2d7757bbdac 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -8,8 +8,12 @@ import datadog.trace.api.Config; import net.bytebuddy.asm.Advice; import org.apache.spark.SparkContext; +import org.apache.spark.sql.execution.SparkPlan; +import org.apache.spark.sql.execution.SparkPlanInfo; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import scala.collection.JavaConverters; +import scala.collection.immutable.HashMap; @AutoService(InstrumenterModule.class) public class Spark212Instrumentation extends AbstractSparkInstrumentation { @@ -27,9 +31,22 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", + packageName + ".Spark212PlanUtils", }; } + @Override + public String[] 
knownMatchingTypes() { + String[] res = new String[super.knownMatchingTypes().length + 1]; + int idx = 0; + for (String match : super.knownMatchingTypes()) { + res[idx] = match; + idx++; + } + res[idx] = "org.apache.spark.sql.execution.SparkPlanInfo$"; + return res; + } + @Override public void methodAdvice(MethodTransformer transformer) { super.methodAdvice(transformer); @@ -40,6 +57,13 @@ public void methodAdvice(MethodTransformer transformer) { .and(isDeclaredBy(named("org.apache.spark.SparkContext"))) .and(takesNoArguments()), Spark212Instrumentation.class.getName() + "$InjectListener"); + + transformer.applyAdvice( + isMethod() + .and(named("fromSparkPlan")) + .and(takesArgument(0, named("org.apache.spark.sql.execution.SparkPlan"))) + .and(isDeclaredBy(named("org.apache.spark.sql.execution.SparkPlanInfo$"))), + Spark212Instrumentation.class.getName() + "$SparkPlanInfoAdvice"); } public static class InjectListener { @@ -78,4 +102,23 @@ public static void enter(@Advice.This SparkContext sparkContext) { sparkContext.listenerBus().addToSharedQueue(AbstractDatadogSparkListener.listener); } } + + public static class SparkPlanInfoAdvice { + @Advice.OnMethodExit(suppress = Throwable.class, onThrowable = Throwable.class) + public static void exit( + @Advice.Return(readOnly = false) SparkPlanInfo planInfo, + @Advice.Argument(0) SparkPlan plan) { + if (planInfo.metadata().size() == 0) { + HashMap args = new HashMap<>(); + planInfo = + new SparkPlanInfo( + planInfo.nodeName(), + planInfo.simpleString(), + planInfo.children(), + args.$plus$plus( + JavaConverters.mapAsScalaMap(Spark212PlanUtils.extractPlanProduct(plan))), + planInfo.metrics()); + } + } + } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java new file mode 100644 index 00000000000..c02671968a5 --- /dev/null +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java @@ -0,0 +1,66 @@ +package datadog.trace.instrumentation.spark; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import org.apache.spark.sql.execution.SparkPlan; +import org.apache.spark.sql.execution.exchange.ReusedExchangeExec; +import scala.collection.JavaConverters; + +// An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: +// https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 +public class Spark212PlanUtils { + public static ArrayList extractChildren(SparkPlan plan) { + /* + Get children of this node. 
Logic in Spark: + + val children = plan match { + case ReusedExchangeExec(_, child) => child :: Nil + case _ => plan.children ++ plan.subqueries + } + */ + ArrayList children = new ArrayList<>(); + if (plan instanceof ReusedExchangeExec) { + children.add(((ReusedExchangeExec) plan).child()); + } + + for (Iterator it = JavaConverters.asJavaIterator(plan.subqueries().iterator()); + it.hasNext(); ) { + children.add(it.next()); + } + for (Iterator it = JavaConverters.asJavaIterator(plan.children().iterator()); + it.hasNext(); ) { + children.add(it.next()); + } + + return children; + } + + public static Map extractPlanProduct(SparkPlan plan) { + HashMap args = new HashMap<>(); + HashMap unparsed = new HashMap<>(); + + int i = 0; + for (Iterator it = JavaConverters.asJavaIterator(plan.productIterator()); + it.hasNext(); ) { + Object obj = it.next(); + String key = String.format("_dd.unknown_key.%d", i); + + String val = CommonSparkPlanUtils.parsePlanProduct(obj); + if (val != null) { + args.put(key, val); + } else { + unparsed.put(key, obj.getClass().getName()); + } + + i++; + } + + if (unparsed.size() > 0) { + // For now, place what we can't parse here with the types so we're aware of them + args.put("_dd.unparsed", unparsed.toString()); + } + return args; + } +} diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 33dc7a0f155..37675cfa375 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -31,7 +31,8 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", - packageName + ".SparkPlanUtils", + packageName + ".Spark213PlanUtils", + packageName + ".CommonSparkPlanUtils", }; } @@ -116,7 +117,7 @@ public static void exit( planInfo.simpleString(), planInfo.children(), HashMap.from( - JavaConverters.asScala(SparkPlanUtils.extractPlanProduct(plan)).toList()), + JavaConverters.asScala(Spark213PlanUtils.extractPlanProduct(plan)).toList()), planInfo.metrics()); } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java similarity index 65% rename from dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java rename to dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java index a3227e75b6d..1f9d9d50402 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/SparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java @@ -1,26 +1,20 @@ package datadog.trace.instrumentation.spark; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import org.apache.spark.sql.catalyst.trees.TreeNode; import 
org.apache.spark.sql.execution.ReusedSubqueryExec; import org.apache.spark.sql.execution.SparkPlan; import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec; import org.apache.spark.sql.execution.adaptive.QueryStageExec; import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec; import org.apache.spark.sql.execution.exchange.ReusedExchangeExec; -import scala.None$; import scala.collection.JavaConverters; -import scala.collection.immutable.$colon$colon; -import scala.collection.immutable.Iterable; -import scala.collection.immutable.Nil$; // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 -public class SparkPlanUtils { +public class Spark213PlanUtils { public static ArrayList extractChildren(SparkPlan plan) { /* Get children of this node. Logic in Spark: @@ -35,6 +29,8 @@ case EmptyRelationExec(logical) => (logical :: Nil) case _ => plan.children ++ plan.subqueries } */ + // TODO: How does this interact with different versions of Spark? (specifically an older version + // that does not have those types) ArrayList children = new ArrayList<>(); if (plan instanceof ReusedExchangeExec) { children.add(((ReusedExchangeExec) plan).child()); @@ -71,32 +67,13 @@ public static Map extractPlanProduct(SparkPlan plan) { for (Iterator it = JavaConverters.asJavaIterator(plan.productIterator()); it.hasNext(); ) { Object obj = it.next(); + String key = plan.productElementName(i); - // TODO: improve parsing of certain types - // 1. Some() should be unwrapped - // 2. requiredSchema on Scan * (currently showing StructType) - - // TODO: support a few more common types? - // condition=org.apache.spark.sql.catalyst.expressions.objects.Invoke - // joinType=org.apache.spark.sql.catalyst.plans.Inner$ - // buildSide=org.apache.spark.sql.catalyst.optimizer.BuildRight$ - // shuffleOrigin=org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS$ - // outputPartitioning=org.apache.spark.sql.catalyst.plans.physical.SinglePartition$ - if (obj instanceof String - || obj instanceof Boolean - || obj instanceof Collection - || obj instanceof None$ - || obj instanceof Integer) { - args.put(plan.productElementName(i), obj.toString()); - } else if (obj instanceof $colon$colon || obj instanceof Nil$) { - args.put(plan.productElementName(i), JavaConverters.asJava(((Iterable) obj)).toString()); - } else if (obj instanceof TreeNode) { - // Filter out any potential child nodes - // TODO: Exempt conditions from this branch - // e.g. 
condition=class org.apache.spark.sql.catalyst.expressions.objects.Invoke - unparsed.put(plan.productElementName(i), obj.getClass().getName()); + String val = CommonSparkPlanUtils.parsePlanProduct(obj); + if (val != null) { + args.put(key, val); } else { - args.put(plan.productElementName(i), obj.toString()); + unparsed.put(key, obj.getClass().getName()); } i++; diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java new file mode 100644 index 00000000000..4c5cd8ec146 --- /dev/null +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java @@ -0,0 +1,40 @@ +package datadog.trace.instrumentation.spark; + +import java.util.Collection; +import org.apache.spark.sql.catalyst.trees.TreeNode; +import scala.None$; +import scala.collection.JavaConverters; +import scala.collection.immutable.$colon$colon; +import scala.collection.immutable.Iterable; +import scala.collection.immutable.Nil$; + +public class CommonSparkPlanUtils { + public static String parsePlanProduct(Object value) { + // TODO: improve parsing of certain types + // 1. Some() should be unwrapped + // 2. requiredSchema on Scan * (currently showing StructType) + + // TODO: support a few more common types? + // condition=org.apache.spark.sql.catalyst.expressions.objects.Invoke + // joinType=org.apache.spark.sql.catalyst.plans.Inner$ + // buildSide=org.apache.spark.sql.catalyst.optimizer.BuildRight$ + // shuffleOrigin=org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS$ + // outputPartitioning=org.apache.spark.sql.catalyst.plans.physical.SinglePartition$ + if (value instanceof String + || value instanceof Boolean + || value instanceof Collection + || value instanceof None$ + || value instanceof Integer) { + return value.toString(); + } else if (value instanceof $colon$colon || value instanceof Nil$) { + return JavaConverters.asJavaIterable(((Iterable) value)).toString(); + } else if (value instanceof TreeNode) { + // Filter out any potential child nodes + // TODO: Exempt conditions from this branch + // e.g. 
condition=class org.apache.spark.sql.catalyst.expressions.objects.Invoke + return null; + } else { + return value.toString(); + } + } +} From 3f9c26b853b90ad6c2545c873d188d9765dd7b69 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Tue, 14 Oct 2025 17:29:16 -0400 Subject: [PATCH 03/21] Update tests for meta field in Spark SQL plans --- .../spark/AbstractSpark24SqlTest.groovy | 46 +++++++++++++--- .../spark/AbstractSpark32SqlTest.groovy | 52 +++++++++++++++++++ 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 7427c89e542..4c9db3e6485 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -47,7 +47,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { } static assertStringSQLPlanEquals(String expectedString, String actualString) { - System.err.println("Checking if expected $expectedString SQL plan match actual $actualString") + // System.err.println("Checking if expected $expectedString SQL plan match actual $actualString") def jsonSlurper = new JsonSlurper() @@ -60,7 +60,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { // Similar to assertStringSQLPlanEquals, but the actual plan can be a subset of the expected plan // This is used for spark 2.4 where the exact SQL plan is not deterministic protected static assertStringSQLPlanSubset(String expectedString, String actualString) { - System.err.println("Checking if expected $expectedString SQL plan is a super set of $actualString") + // System.err.println("Checking if expected $expectedString SQL plan is a super set of $actualString") def jsonSlurper = new JsonSlurper() @@ -83,7 +83,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { // Checking all keys expect children and metrics that are checked after expected.keySet().each { key -> if (!['children', 'metrics'].contains(key)) { - + if (key == "meta") { + var simpleString = actual["nodeDetailString"] + var values = actual[key] + System.err.println("FOUND: node=$actual.node, values=$values, simpleString=$simpleString") + } // Some metric values will varies between runs // In the case, setting the expected value to "any" skips the assertion if (expected[key] != "any") { @@ -159,6 +163,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": -1909876497, "nodeDetailString": "hashpartitioning(string_col#0, 2)", + "meta": "any", "metrics": [ { "data size total (min, med, max)": "any", @@ -169,6 +174,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -180,6 +186,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 1128016273, "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", + "meta": "any", "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -198,11 +205,13 @@ abstract class AbstractSpark24SqlTest extends 
InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, + "meta": "any", "children": [ { "node": "LocalTableScan", "nodeId": 1632930767, "nodeDetailString": "[string_col#0, double_col#1]", + "meta": "any", "metrics": [ { "number of output rows": 3, @@ -224,6 +233,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -235,6 +245,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 126020943, "nodeDetailString": "(keys=[string_col#0], functions=[avg(double_col#1)])", + "meta": "any", "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -256,7 +267,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "children": [ { "node": "InputAdapter", - "nodeId": 180293 + "nodeId": 180293, + "meta": "any" } ] } @@ -325,6 +337,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#25, 2)", + "meta": "any", "metrics": [ { "data size total (min, med, max)": "any", @@ -336,6 +349,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#25]", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -351,6 +365,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#21, 2)", + "meta": "any", "metrics": [ { "data size total (min, med, max)": "any", @@ -362,6 +377,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#21]", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -377,6 +393,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": -1350402171, "nodeDetailString": "SinglePartition", + "meta": "any", "metrics": [ { "data size total (min, med, max)": "any", @@ -387,6 +404,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -398,6 +416,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": -879128980, "nodeDetailString": "(keys=[], functions=[partial_count(1)])", + "meta": "any", "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -412,11 +431,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "Project", "nodeId": 1355342585, + "meta": "any", "children": [ { "node": "SortMergeJoin", "nodeId": -1975876610, "nodeDetailString": "[string_col#21], [string_col#25], Inner", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -427,10 +448,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, + "meta": "any", "children": [ { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -442,6 +465,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": 66807398, 
"nodeDetailString": "[string_col#21 ASC NULLS FIRST], false, 0", + "meta": "any", "metrics": [ { "peak memory total (min, med, max)": "any", @@ -455,7 +479,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "children": [ { "node": "InputAdapter", - "nodeId": 180293 + "nodeId": 180293, + "meta": "any" } ] } @@ -466,10 +491,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, + "meta": "any", "children": [ { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -481,6 +508,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": -952138782, "nodeDetailString": "[string_col#25 ASC NULLS FIRST], false, 0", + "meta": "any", "metrics": [ { "peak memory total (min, med, max)": "any", @@ -490,7 +518,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "children": [ { "node": "InputAdapter", - "nodeId": 180293 + "nodeId": 180293, + "meta": "any" } ] } @@ -513,6 +542,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, + "meta": "any", "metrics": [ { "duration total (min, med, max)": "any", @@ -524,6 +554,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 724815342, "nodeDetailString": "(keys=[], functions=[count(1)])", + "meta": "any", "metrics": [ { "number of output rows": 1, @@ -533,7 +564,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "children": [ { "node": "InputAdapter", - "nodeId": 180293 + "nodeId": 180293, + "meta": "any" } ] } diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index 875cdcafdbc..d31cd13114c 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -35,6 +35,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", + "meta": "any", "metrics": [ { "data size": "any", @@ -57,6 +58,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (1)", "nodeId": "nodeId_1", + "meta": "any", "metrics": [ { "duration": "any", @@ -68,6 +70,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_3", "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", + "meta": "any", "metrics": [ { "number of output rows": 3, @@ -87,6 +90,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "nodeId_2", "nodeDetailString": "[string_col#0, double_col#1]", + "meta": "any", "metrics": [ { "number of output rows": 3, @@ -105,6 +109,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_8", + "meta": 
"any", "metrics": [ { "duration": "any", @@ -116,6 +121,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", + "meta": "any", "metrics": [ { "avg hash probe bucket list iters": "any", @@ -138,22 +144,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", + "meta": "any", "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", + "meta": "any", "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_6", "nodeDetailString": "0", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", + "meta": "any", "metrics": [ { "data size": "any", @@ -205,6 +215,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", + "meta": "any", "metrics": [ { "data size": "any", @@ -227,6 +238,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_8", + "meta": "any", "metrics": [ { "duration": "any", @@ -238,6 +250,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", + "meta": "any", "metrics": [ { "avg hash probe bucket list iters": "any", @@ -260,22 +273,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", + "meta": "any", "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", + "meta": "any", "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_6", "nodeDetailString": "0", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", + "meta": "any", "metrics": [ { "data size": "any", @@ -328,6 +345,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (3)", "nodeId": "nodeId_12", + "meta": "any", "metrics": [ { "duration": "any", @@ -339,11 +357,13 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Project", "nodeId": "nodeId_11", "nodeDetailString": "[string_col#0, cast(avg(double_col)#5 as string) AS avg(double_col)#12]", + "meta": "any", "children": [ { "node": "Sort", "nodeId": "nodeId_13", "nodeDetailString": "[avg(double_col)#5 DESC NULLS LAST], true, 0", + "meta": "any", "metrics": [ { "peak memory": "any", @@ -362,22 +382,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", + "meta": "any", "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", + "meta": "any", "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_14", "nodeDetailString": "1", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC 
NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", + "meta": "any", "metrics": [ { "data size": "any", @@ -525,6 +549,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", + "meta": "any", "metrics": [ { "data size": "any", @@ -548,6 +573,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#28]", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -563,6 +589,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", + "meta": "any", "metrics": [ { "data size": "any", @@ -586,6 +613,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#32]", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -601,6 +629,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", + "meta": "any", "metrics": [ { "data size": "any", @@ -623,6 +652,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (3)", "nodeId": "nodeId_9", + "meta": "any", "metrics": [ { "duration": "any", @@ -634,6 +664,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_16", "nodeDetailString": "(keys\\u003d[], functions\\u003d[partial_count(1)])", + "meta": "any", "metrics": [ { "number of output rows": 1, @@ -648,11 +679,13 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "Project", "nodeId": "nodeId_13", + "meta": "any", "children": [ { "node": "SortMergeJoin", "nodeId": "nodeId_15", "nodeDetailString": "[string_col#28], [string_col#32], Inner", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -663,10 +696,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", + "meta": "any", "children": [ { "node": "WholeStageCodegen (1)", "nodeId": "nodeId_8", + "meta": "any", "metrics": [ { "duration": "any", @@ -678,6 +713,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": "nodeId_11", "nodeDetailString": "[string_col#28 ASC NULLS FIRST], false, 0", + "meta": "any", "metrics": [ { "peak memory": "any", @@ -696,22 +732,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", + "meta": "any", "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", + "meta": "any", "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_14", "nodeDetailString": "0", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_2", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", + "meta": "any", "metrics": [ { "data size": "any", @@ -762,10 +802,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", 
"nodeId": "nodeId_6", + "meta": "any", "children": [ { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_12", + "meta": "any", "metrics": [ { "duration": "any", @@ -777,6 +819,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": "nodeId_17", "nodeDetailString": "[string_col#32 ASC NULLS FIRST], false, 0", + "meta": "any", "metrics": [ { "peak memory": "any", @@ -795,22 +838,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", + "meta": "any", "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", + "meta": "any", "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_10", "nodeDetailString": "1", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", + "meta": "any", "metrics": [ { "data size": "any", @@ -873,6 +920,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (4)", "nodeId": "nodeId_20", + "meta": "any", "metrics": [ { "duration": "any", @@ -884,6 +932,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_18", "nodeDetailString": "(keys\\u003d[], functions\\u003d[count(1)])", + "meta": "any", "metrics": [ { "number of output rows": "any", @@ -898,16 +947,19 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", + "meta": "any", "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_19", "nodeDetailString": "2", + "meta": "any", "children": [ { "node": "Exchange", "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", + "meta": "any", "metrics": [ { "data size": "any", From 94f313939673661b9f2911fb8d8e44f9a64ee103 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Wed, 15 Oct 2025 17:47:06 -0400 Subject: [PATCH 04/21] Remove unused logic to parse children, enrich product parsing to support more types and use JSON arrays --- .../spark/Spark212PlanUtils.java | 28 ----------- .../spark/Spark213PlanUtils.java | 50 ------------------- .../spark/CommonSparkPlanUtils.java | 42 +++++----------- .../instrumentation/spark/SparkSQLUtils.java | 17 ++++++- 4 files changed, 28 insertions(+), 109 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java index c02671968a5..d060d8a6683 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java @@ -1,42 +1,14 @@ package datadog.trace.instrumentation.spark; -import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import org.apache.spark.sql.execution.SparkPlan; -import org.apache.spark.sql.execution.exchange.ReusedExchangeExec; import scala.collection.JavaConverters; // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // 
https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public class Spark212PlanUtils { - public static ArrayList extractChildren(SparkPlan plan) { - /* - Get children of this node. Logic in Spark: - - val children = plan match { - case ReusedExchangeExec(_, child) => child :: Nil - case _ => plan.children ++ plan.subqueries - } - */ - ArrayList children = new ArrayList<>(); - if (plan instanceof ReusedExchangeExec) { - children.add(((ReusedExchangeExec) plan).child()); - } - - for (Iterator it = JavaConverters.asJavaIterator(plan.subqueries().iterator()); - it.hasNext(); ) { - children.add(it.next()); - } - for (Iterator it = JavaConverters.asJavaIterator(plan.children().iterator()); - it.hasNext(); ) { - children.add(it.next()); - } - - return children; - } - public static Map extractPlanProduct(SparkPlan plan) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java index 1f9d9d50402..8544d4b6790 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java @@ -1,64 +1,14 @@ package datadog.trace.instrumentation.spark; -import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import org.apache.spark.sql.execution.ReusedSubqueryExec; import org.apache.spark.sql.execution.SparkPlan; -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec; -import org.apache.spark.sql.execution.adaptive.QueryStageExec; -import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec; -import org.apache.spark.sql.execution.exchange.ReusedExchangeExec; import scala.collection.JavaConverters; // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public class Spark213PlanUtils { - public static ArrayList extractChildren(SparkPlan plan) { - /* - Get children of this node. Logic in Spark: - - val children = plan match { - case ReusedExchangeExec(_, child) => child :: Nil - case ReusedSubqueryExec(child) => child :: Nil - case a: AdaptiveSparkPlanExec => a.executedPlan :: Nil - case stage: QueryStageExec => stage.plan :: Nil - case inMemTab: InMemoryTableScanExec => inMemTab.relation.cachedPlan :: Nil - case EmptyRelationExec(logical) => (logical :: Nil) - case _ => plan.children ++ plan.subqueries - } - */ - // TODO: How does this interact with different versions of Spark? 
(specifically an older version - // that does not have those types) - ArrayList children = new ArrayList<>(); - if (plan instanceof ReusedExchangeExec) { - children.add(((ReusedExchangeExec) plan).child()); - } else if (plan instanceof ReusedSubqueryExec) { - children.add(((ReusedSubqueryExec) plan).child()); - } else if (plan instanceof AdaptiveSparkPlanExec) { - children.add(((AdaptiveSparkPlanExec) plan).executedPlan()); - } else if (plan instanceof QueryStageExec) { - children.add(((QueryStageExec) plan).plan()); - } else if (plan instanceof InMemoryTableScanExec) { - children.add(((InMemoryTableScanExec) plan).relation().cachedPlan()); - // New as of Spark 4.0.0 - // } else if (plan instanceof EmptyRelationExec) { - // children.add(((EmptyRelationExec) plan).logical); - } - - for (Iterator it = JavaConverters.asJavaIterator(plan.subqueries().iterator()); - it.hasNext(); ) { - children.add(it.next()); - } - for (Iterator it = JavaConverters.asJavaIterator(plan.children().iterator()); - it.hasNext(); ) { - children.add(it.next()); - } - - return children; - } - public static Map extractPlanProduct(SparkPlan plan) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java index 4c5cd8ec146..ad8ce7f1969 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java @@ -1,37 +1,21 @@ package datadog.trace.instrumentation.spark; -import java.util.Collection; -import org.apache.spark.sql.catalyst.trees.TreeNode; -import scala.None$; -import scala.collection.JavaConverters; -import scala.collection.immutable.$colon$colon; -import scala.collection.immutable.Iterable; -import scala.collection.immutable.Nil$; +import java.util.ArrayList; +import org.apache.spark.sql.catalyst.plans.QueryPlan; +import scala.Option; +import scala.collection.Iterable; public class CommonSparkPlanUtils { public static String parsePlanProduct(Object value) { - // TODO: improve parsing of certain types - // 1. Some() should be unwrapped - // 2. requiredSchema on Scan * (currently showing StructType) - - // TODO: support a few more common types? - // condition=org.apache.spark.sql.catalyst.expressions.objects.Invoke - // joinType=org.apache.spark.sql.catalyst.plans.Inner$ - // buildSide=org.apache.spark.sql.catalyst.optimizer.BuildRight$ - // shuffleOrigin=org.apache.spark.sql.execution.exchange.ENSURE_REQUIREMENTS$ - // outputPartitioning=org.apache.spark.sql.catalyst.plans.physical.SinglePartition$ - if (value instanceof String - || value instanceof Boolean - || value instanceof Collection - || value instanceof None$ - || value instanceof Integer) { - return value.toString(); - } else if (value instanceof $colon$colon || value instanceof Nil$) { - return JavaConverters.asJavaIterable(((Iterable) value)).toString(); - } else if (value instanceof TreeNode) { - // Filter out any potential child nodes - // TODO: Exempt conditions from this branch - // e.g. 
condition=class org.apache.spark.sql.catalyst.expressions.objects.Invoke + if (value == null) { + return "null"; + } else if (value instanceof Iterable) { + ArrayList list = new ArrayList<>(); + ((Iterable) value).foreach(item -> list.add(parsePlanProduct(item))); + return "[\"" + String.join("\", \"", list) + "\"]"; + } else if (value instanceof Option) { + return parsePlanProduct(((Option) value).getOrElse(() -> "none")); + } else if (value instanceof QueryPlan) { // Filter out values referencing child nodes return null; } else { return value.toString(); diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java index e58f24d3dd2..ba1d846dc0d 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java @@ -173,13 +173,26 @@ private void toJson(JsonGenerator generator, Map acc generator.writeStringField("nodeDetailString", nodeDetails); } - // Metadata is only present for FileSourceScan nodes + // Metadata is only added natively by Spark for FileSourceScan nodes + // We leverage this to extract & inject additional argument-level data if (!plan.metadata().isEmpty()) { generator.writeFieldName("meta"); generator.writeStartObject(); for (Tuple2 metadata : JavaConverters.asJavaCollection(plan.metadata())) { - generator.writeStringField(metadata._1, metadata._2); + // If it looks like a string array, break apart and write as native JSON array + if (metadata._2.startsWith("[\"") && metadata._2.endsWith("\"]")) { + String[] list = metadata._2.substring(2, metadata._2.length() - 2).split("\", \""); + + generator.writeFieldName(metadata._1); + generator.writeStartArray(); + for (String entry : list) { + generator.writeString(entry); + } + generator.writeEndArray(); + } else { + generator.writeStringField(metadata._1, metadata._2); + } } generator.writeEndObject(); From beb4d5f5d19df0ad98bb1d6776810fd26ec66ac3 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 16 Oct 2025 15:04:20 -0400 Subject: [PATCH 05/21] Update tests to assert on meta values --- .../spark/AbstractSpark24SqlTest.groovy | 293 +++++++++++++---- .../spark/AbstractSpark32SqlTest.groovy | 308 ++++++++++++++---- 2 files changed, 483 insertions(+), 118 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 4c9db3e6485..7efc65ca246 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -29,70 +29,108 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { spark.createDataFrame(rows, structType) } - static assertStringSQLPlanIn(ArrayList expectedStrings, String actualString) { + static assertStringSQLPlanIn(ArrayList expectedStrings, String actualString, String name) { def jsonSlurper = new JsonSlurper() def actual = jsonSlurper.parseText(actualString) for (String expectedString : expectedStrings) { try { def expected = 
jsonSlurper.parseText(expectedString) - assertSQLPlanEquals(expected, actual) + assertSQLPlanEquals(expected, actual, name) return } catch (AssertionError e) { - System.println("Failed to assert $expectedString, attempting next") + System.err.println("Failed to assert $expectedString, attempting next") } } throw new AssertionError("No matching SQL Plan found for $actualString in $expectedStrings") } - static assertStringSQLPlanEquals(String expectedString, String actualString) { - // System.err.println("Checking if expected $expectedString SQL plan match actual $actualString") - + static assertStringSQLPlanEquals(String expectedString, String actualString, String name) { def jsonSlurper = new JsonSlurper() def expected = jsonSlurper.parseText(expectedString) def actual = jsonSlurper.parseText(actualString) - assertSQLPlanEquals(expected, actual) + assertSQLPlanEquals(expected, actual, name) } // Similar to assertStringSQLPlanEquals, but the actual plan can be a subset of the expected plan // This is used for spark 2.4 where the exact SQL plan is not deterministic - protected static assertStringSQLPlanSubset(String expectedString, String actualString) { - // System.err.println("Checking if expected $expectedString SQL plan is a super set of $actualString") - + protected static assertStringSQLPlanSubset(String expectedString, String actualString, String name) { def jsonSlurper = new JsonSlurper() def expected = jsonSlurper.parseText(expectedString) def actual = jsonSlurper.parseText(actualString) try { - assertSQLPlanEquals(expected.children[0], actual) + assertSQLPlanEquals(expected.children[0], actual, name) return // If is a subset, the test is successful } - catch (AssertionError e) {} + catch (AssertionError e) { + System.err.println("Failed to assert $expectedString, attempting parent") + } - assertSQLPlanEquals(expected, actual) + assertSQLPlanEquals(expected, actual, name) } - private static assertSQLPlanEquals(Object expected, Object actual) { + private static assertSQLPlanEquals(Object expected, Object actual, String name) { assert expected.node == actual.node assert expected.keySet() == actual.keySet() - // Checking all keys expect children and metrics that are checked after + def prefix = "$name on $expected.node node:\n\t" + + // Checking all keys except children, meta, and metrics that are checked after expected.keySet().each { key -> - if (!['children', 'metrics'].contains(key)) { - if (key == "meta") { - var simpleString = actual["nodeDetailString"] - var values = actual[key] - System.err.println("FOUND: node=$actual.node, values=$values, simpleString=$simpleString") - } - // Some metric values will varies between runs - // In the case, setting the expected value to "any" skips the assertion + if (!['children', 'metrics', 'meta'].contains(key)) { if (expected[key] != "any") { - assert expected[key] == actual[key]: "$expected does not match $actual" + // Some metric values will varies between runs + // In the case, setting the expected value to "any" skips the assertion + assert expected[key] == actual[key]: prefix + "value of \"$key\" does not match $expected.key, got $actual.key" + } + } + } + + // Checking the meta values are the same on both sides + if (expected.meta == null) { + assert actual.meta == null + } else { + try { + def expectedMeta = expected.meta + def actualMeta = actual.meta + + assert actualMeta.size() == expectedMeta.size() : prefix + "meta size of $expectedMeta does not match $actualMeta" + + def actualUnknown = [] // List of values for all valid unknown 
keys + actual.meta.each { actualMetaKey, actualMetaValue -> + if (!expectedMeta.containsKey(actualMetaKey) && actualMetaKey.startsWith("_dd.unknown_key.")) { + actualUnknown.add(actualMetaValue) + } else if (!expectedMeta.containsKey(actualMetaKey)) { + throw new AssertionError(prefix + "unexpected key \"$actualMetaKey\" found, not valid unknown key with prefix '_dd.unknown_key.' or in $expectedMeta") + } } + + expected.meta.each { expectedMetaKey, expectedMetaValue -> + if (actualMeta.containsKey(expectedMetaKey)) { + def actualMetaValue = actualMeta[expectedMetaKey] + if (expectedMetaValue instanceof List) { + assert expectedMetaValue ==~ actualMetaValue : prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + } else { + // Don't assert meta values where expectation is "any" + assert expectedMetaValue == "any" || expectedMetaValue == actualMetaValue: prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + } + } else if (actualUnknown.size() > 0) { + // If expected key not found, attempt to match value against those from valid unknown keys + assert actualUnknown.indexOf(expectedMetaValue) >= 0 : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta\n\tattempted to match against value \"$expectedMetaValue\" with unknown keys, but not found" + actualUnknown.drop(actualUnknown.indexOf(expectedMetaValue)) + } else { + // Defensive, should never happen + assert actualMeta.containsKey(expectedMetaKey) : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta" + } + } + } catch (AssertionError e) { + generateMetaExpectations(actual, name) + throw e } } @@ -107,15 +145,16 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { def expectedMetric = metricPair[0] def actualMetric = metricPair[1] - assert expectedMetric.size() == actualMetric.size(): "$expected does not match $actual" + assert expectedMetric.size() == actualMetric.size(): prefix + "metric size of $expectedMetric does not match $actualMetric" // Each metric is a dict { "metric_name": "metric_value", "type": "metric_type" } expectedMetric.each { key, expectedValue -> - assert actualMetric.containsKey(key): "$expected does not match $actual" + assert actualMetric.containsKey(key): prefix + "metric key \"$key\" not found in $actualMetric" // Some metric values are duration that will varies between runs // In the case, setting the expected value to "any" skips the assertion - assert expectedValue == "any" || actualMetric[key] == expectedValue: "$expected does not match $actual" + def actualValue = actualMetric[key] + assert expectedValue == "any" || actualValue == expectedValue: prefix + "value of metric key \"$key\" does not match \"$expectedValue\", got $actualValue" } } } @@ -128,7 +167,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { actual.children.sort { it.node } [expected.children, actual.children].transpose().each { childPair -> - assertSQLPlanEquals(childPair[0], childPair[1]) + assertSQLPlanEquals(childPair[0], childPair[1], name) } } } @@ -140,6 +179,36 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { return source } + private static generateMetaExpectations(Object actual, String name) { + var simpleString = actual["nodeDetailString"] + var values = [] + var child = "N/A" + actual["meta"].each() { key, value -> + if (key == "_dd.unparsed") { + values.add("\"_dd.unparsed\": \"any\"") + child = value + 
} else if (value instanceof List) { + var list = [] + value.each() { it -> + list.add("\"$it\"") + } + def prettyList = "[\n " + list.join(", \n ") + "\n ]" + if (list.size() == 1) { + prettyList = "[" + list.join(", ") + "]" + } + values.add("\"$key\": $prettyList") + } else { + values.add("\"$key\": \"$value\"") + } + } + values.sort() { it } + def prettyValues = "\n\"meta\": {\n " + values.join(", \n ") + "\n}," + if (values.size() == 1) { + prettyValues = "\n\"meta\": {" + values.join(", ") + "}," + } + System.err.println("$actual.node\n\tname=$name\n\tchild=$child\n\tvalues=$prettyValues\n\tsimpleString=$simpleString") + } + def "compute a GROUP BY sql query plan"() { def sparkSession = SparkSession.builder() .config("spark.master", "local[2]") @@ -163,7 +232,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": -1909876497, "nodeDetailString": "hashpartitioning(string_col#0, 2)", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "hashpartitioning(string_col#0, 2)", + "_dd.unknown_key.2": "none", + "_dd.unparsed": "any" + }, "metrics": [ { "data size total (min, med, max)": "any", @@ -174,7 +247,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -186,7 +259,22 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 1128016273, "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "none", + "_dd.unknown_key.1": ["string_col#0"], + "_dd.unknown_key.2": ["partial_avg(double_col#1)"], + "_dd.unknown_key.3": [ + "sum#16", + "count#17L" + ], + "_dd.unknown_key.4": "0", + "_dd.unknown_key.5": [ + "string_col#0", + "sum#18", + "count#19L" + ], + "_dd.unparsed": "any" + }, "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -205,13 +293,23 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "LocalTableScan", "nodeId": 1632930767, "nodeDetailString": "[string_col#0, double_col#1]", - "meta": "any", + "meta": { + "_dd.unknown_key.0": [ + "string_col#0", + "double_col#1" + ], + "_dd.unknown_key.1": [ + "[first,1.2]", + "[first,1.3]", + "[second,1.6]" + ] + }, "metrics": [ { "number of output rows": 3, @@ -233,7 +331,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -245,7 +343,18 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 126020943, "nodeDetailString": "(keys=[string_col#0], functions=[avg(double_col#1)])", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#0"], + "_dd.unknown_key.1": ["string_col#0"], + "_dd.unknown_key.2": ["avg(double_col#1)"], + "_dd.unknown_key.3": ["avg(double_col#1)#4"], + "_dd.unknown_key.4": "1", + "_dd.unknown_key.5": [ + "string_col#0", + "avg(double_col#1)#4 AS avg(double_col)#5" + ], + "_dd.unparsed": "any" + }, "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -268,7 +377,7 @@ abstract class AbstractSpark24SqlTest extends 
InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any" + "meta": {"_dd.unparsed": "any"} } ] } @@ -297,7 +406,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanEquals(secondStagePlan, span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanEquals(secondStagePlan, span.tags["_dd.spark.sql_plan"].toString(), "secondStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert span.tags["_dd.spark.sql_parent_stage_ids"] == "[0]" } @@ -305,7 +414,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanEquals(firstStagePlan, span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanEquals(firstStagePlan, span.tags["_dd.spark.sql_plan"].toString(), "firstStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert span.tags["_dd.spark.sql_parent_stage_ids"] == "[]" } @@ -337,7 +446,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#25, 2)", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "hashpartitioning(string_col#25, 2)", + "_dd.unknown_key.2": "none", + "_dd.unparsed": "any" + }, "metrics": [ { "data size total (min, med, max)": "any", @@ -349,7 +462,14 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#25]", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#25"], + "_dd.unknown_key.1": [ + "[first]", + "[first]", + "[second]" + ] + }, "metrics": [ { "number of output rows": "any", @@ -365,7 +485,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#21, 2)", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "hashpartitioning(string_col#21, 2)", + "_dd.unknown_key.2": "none", + "_dd.unparsed": "any" + }, "metrics": [ { "data size total (min, med, max)": "any", @@ -377,7 +501,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#21]", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#21"], + "_dd.unknown_key.1": [ + "[first]", + "[second]" + ] + }, "metrics": [ { "number of output rows": "any", @@ -393,7 +523,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": -1350402171, "nodeDetailString": "SinglePartition", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "SinglePartition", + "_dd.unknown_key.2": "none", + "_dd.unparsed": "any" + }, "metrics": [ { "data size total (min, med, max)": "any", @@ -404,7 +538,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -416,7 +550,15 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": -879128980, "nodeDetailString": "(keys=[], functions=[partial_count(1)])", - "meta": "any", + "meta": { + "_dd.unknown_key.0": "none", + "_dd.unknown_key.1": [""], + "_dd.unknown_key.2": 
["partial_count(1)"], + "_dd.unknown_key.3": ["count#38L"], + "_dd.unknown_key.4": "0", + "_dd.unknown_key.5": ["count#39L"], + "_dd.unparsed": "any" + }, "metrics": [ { "aggregate time total (min, med, max)": "any", @@ -431,13 +573,22 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "Project", "nodeId": 1355342585, - "meta": "any", + "meta": { + "_dd.unknown_key.0": [""], + "_dd.unparsed": "any" + }, "children": [ { "node": "SortMergeJoin", "nodeId": -1975876610, "nodeDetailString": "[string_col#21], [string_col#25], Inner", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#21"], + "_dd.unknown_key.1": ["string_col#25"], + "_dd.unknown_key.2": "Inner", + "_dd.unknown_key.3": "none", + "_dd.unparsed": "any" + }, "metrics": [ { "number of output rows": "any", @@ -448,12 +599,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -465,7 +616,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": 66807398, "nodeDetailString": "[string_col#21 ASC NULLS FIRST], false, 0", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#21 ASC NULLS FIRST"], + "_dd.unknown_key.1": "false", + "_dd.unknown_key.3": "0", + "_dd.unparsed": "any" + }, "metrics": [ { "peak memory total (min, med, max)": "any", @@ -480,7 +636,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any" + "meta": {"_dd.unparsed": "any"} } ] } @@ -491,12 +647,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -508,7 +664,12 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": -952138782, "nodeDetailString": "[string_col#25 ASC NULLS FIRST], false, 0", - "meta": "any", + "meta": { + "_dd.unknown_key.0": ["string_col#25 ASC NULLS FIRST"], + "_dd.unknown_key.1": "false", + "_dd.unknown_key.3": "0", + "_dd.unparsed": "any" + }, "metrics": [ { "peak memory total (min, med, max)": "any", @@ -519,7 +680,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any" + "meta": {"_dd.unparsed": "any"} } ] } @@ -542,7 +703,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen", "nodeId": 724251804, - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration total (min, med, max)": "any", @@ -554,7 +715,15 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": 724815342, "nodeDetailString": "(keys=[], functions=[count(1)])", - "meta": "any", + "meta": { + "_dd.unknown_key.0": [""], + "_dd.unknown_key.1": [""], + "_dd.unknown_key.2": ["count(1)"], + "_dd.unknown_key.3": ["count(1)#35L"], + "_dd.unknown_key.4": "0", + "_dd.unknown_key.5": ["count(1)#35L AS count#36L"], + "_dd.unparsed": "any" 
+ }, "metrics": [ { "number of output rows": 1, @@ -565,7 +734,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": 180293, - "meta": "any" + "meta": {"_dd.unparsed": "any"} } ] } @@ -594,7 +763,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanSubset(fourthStagePlan, span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanSubset(fourthStagePlan, span.tags["_dd.spark.sql_plan"].toString(), "fourthStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert span.tags["_dd.spark.sql_parent_stage_ids"] == "[2]" } @@ -602,7 +771,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanEquals(thirdStagePlan, span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanEquals(thirdStagePlan, span.tags["_dd.spark.sql_plan"].toString(), "thirdStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert ["[0, 1]", "[1, 0]"].contains(span.tags["_dd.spark.sql_parent_stage_ids"]) } @@ -610,7 +779,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanIn([firstStagePlan, secondStagePlan], span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanIn([firstStagePlan, secondStagePlan], span.tags["_dd.spark.sql_plan"].toString(), "secondStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert span.tags["_dd.spark.sql_parent_stage_ids"] == "[]" } @@ -618,7 +787,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { operationName "spark.stage" spanType "spark" childOf(span(2)) - assertStringSQLPlanIn([firstStagePlan, secondStagePlan], span.tags["_dd.spark.sql_plan"].toString()) + assertStringSQLPlanIn([firstStagePlan, secondStagePlan], span.tags["_dd.spark.sql_plan"].toString(), "firstStagePlan") assert span.tags["_dd.spark.physical_plan"] == null assert span.tags["_dd.spark.sql_parent_stage_ids"] == "[]" } diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index d31cd13114c..5ed32657723 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -35,7 +35,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#0, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -58,7 +62,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (1)", "nodeId": "nodeId_1", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -70,7 +74,24 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": 
"HashAggregate", "nodeId": "nodeId_3", "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "aggregateAttributes": [ + "sum#15", + "count#16L" + ], + "aggregateExpressions": ["partial_avg(double_col#1)"], + "groupingExpressions": ["string_col#0"], + "initialInputBufferOffset": "0", + "isStreaming": "false", + "numShufflePartitions": "none", + "requiredChildDistributionExpressions": "none", + "resultExpressions": [ + "string_col#0", + "sum#17", + "count#18L" + ] + }, "metrics": [ { "number of output rows": 3, @@ -90,7 +111,17 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "nodeId_2", "nodeDetailString": "[string_col#0, double_col#1]", - "meta": "any", + "meta": { + "output": [ + "string_col#0", + "double_col#1" + ], + "rows": [ + "[first,1.2]", + "[first,1.3]", + "[second,1.6]" + ] + }, "metrics": [ { "number of output rows": 3, @@ -109,7 +140,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_8", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -121,7 +152,20 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "aggregateAttributes": ["avg(double_col#1)#4"], + "aggregateExpressions": ["avg(double_col#1)"], + "groupingExpressions": ["string_col#0"], + "initialInputBufferOffset": "1", + "isStreaming": "false", + "numShufflePartitions": "none", + "requiredChildDistributionExpressions": ["string_col#0"], + "resultExpressions": [ + "string_col#0", + "avg(double_col#1)#4 AS avg(double_col)#5" + ] + }, "metrics": [ { "avg hash probe bucket list iters": "any", @@ -144,26 +188,36 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(186))"] + }, "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_6", "nodeDetailString": "0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "0" + }, "children": [ { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#0, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -215,7 +269,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -238,7 +296,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_8", 
- "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -250,7 +308,20 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "aggregateAttributes": ["avg(double_col#1)#4"], + "aggregateExpressions": ["avg(double_col#1)"], + "groupingExpressions": ["string_col#0"], + "initialInputBufferOffset": "1", + "isStreaming": "false", + "numShufflePartitions": "none", + "requiredChildDistributionExpressions": ["string_col#0"], + "resultExpressions": [ + "string_col#0", + "avg(double_col#1)#4 AS avg(double_col)#5" + ] + }, "metrics": [ { "avg hash probe bucket list iters": "any", @@ -273,26 +344,36 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(186))"] + }, "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_6", "nodeDetailString": "0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "0" + }, "children": [ { "node": "Exchange", "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#0, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -345,7 +426,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (3)", "nodeId": "nodeId_12", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -357,13 +438,24 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Project", "nodeId": "nodeId_11", "nodeDetailString": "[string_col#0, cast(avg(double_col)#5 as string) AS avg(double_col)#12]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "projectList": [ + "string_col#0", + "cast(avg(double_col)#5 as string) AS avg(double_col)#12" + ] + }, "children": [ { "node": "Sort", "nodeId": "nodeId_13", "nodeDetailString": "[avg(double_col)#5 DESC NULLS LAST], true, 0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "global": "true", + "sortOrder": ["avg(double_col)#5 DESC NULLS LAST"], + "testSpillFrequency": "0" + }, "metrics": [ { "peak memory": "any", @@ -382,26 +474,36 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_7", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(152))"] + }, "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_14", "nodeDetailString": "1", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "1" + }, "children": [ { "node": "Exchange", "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", - "meta": "any", + "meta": { + "_dd.unparsed": 
"any", + "outputPartitioning": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -521,7 +623,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { assertStringSQLPlanEqualsWithConsistentNodeIds( [firstStagePlan, secondStagePlan, thirdStagePlan, fourthStagePlan], - [actualPlans[3], actualPlans[2], actualPlans[1], actualPlans[0]] + [actualPlans[3], actualPlans[2], actualPlans[1], actualPlans[0]], + ["firstStagePlan", "secondStagePlan", "thirdStagePlan", "fourthStagePlan"] ) } @@ -549,7 +652,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#28, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -573,7 +680,13 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#28]", - "meta": "any", + "meta": { + "output": ["string_col#28"], + "rows": [ + "[first]", + "[second]" + ] + }, "metrics": [ { "number of output rows": "any", @@ -589,7 +702,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#32, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -613,7 +730,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "LocalTableScan", "nodeId": "any", "nodeDetailString": "[string_col#32]", - "meta": "any", + "meta": { + "output": ["string_col#32"], + "rows": [ + "[first]", + "[first]", + "[second]" + ] + }, "metrics": [ { "number of output rows": "any", @@ -629,7 +753,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Exchange", "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "SinglePartition", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -652,7 +780,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (3)", "nodeId": "nodeId_9", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -664,7 +792,17 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_16", "nodeDetailString": "(keys\\u003d[], functions\\u003d[partial_count(1)])", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "aggregateAttributes": ["count#45L"], + "aggregateExpressions": ["partial_count(1)"], + "groupingExpressions": [""], + "initialInputBufferOffset": "0", + "isStreaming": "false", + "numShufflePartitions": "none", + "requiredChildDistributionExpressions": "none", + "resultExpressions": ["count#46L"] + }, "metrics": [ { "number of output rows": 1, @@ -679,13 +817,23 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "Project", "nodeId": "nodeId_13", - 
"meta": "any", + "meta": { + "_dd.unparsed": "any", + "projectList": [""] + }, "children": [ { "node": "SortMergeJoin", "nodeId": "nodeId_15", "nodeDetailString": "[string_col#28], [string_col#32], Inner", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "condition": "none", + "isSkewJoin": "false", + "joinType": "Inner", + "leftKeys": ["string_col#28"], + "rightKeys": ["string_col#32"] + }, "metrics": [ { "number of output rows": "any", @@ -696,12 +844,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "WholeStageCodegen (1)", "nodeId": "nodeId_8", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -713,7 +861,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": "nodeId_11", "nodeDetailString": "[string_col#28 ASC NULLS FIRST], false, 0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "global": "false", + "sortOrder": ["string_col#28 ASC NULLS FIRST"], + "testSpillFrequency": "0" + }, "metrics": [ { "peak memory": "any", @@ -732,26 +885,36 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(144))"] + }, "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_14", "nodeDetailString": "0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "0" + }, "children": [ { "node": "Exchange", "nodeId": "nodeId_2", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#28, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -802,12 +965,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "WholeStageCodegen (2)", "nodeId": "nodeId_12", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -819,7 +982,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Sort", "nodeId": "nodeId_17", "nodeDetailString": "[string_col#32 ASC NULLS FIRST], false, 0", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "global": "false", + "sortOrder": ["string_col#32 ASC NULLS FIRST"], + "testSpillFrequency": "0" + }, "metrics": [ { "peak memory": "any", @@ -838,26 +1006,36 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "AQEShuffleRead", "nodeId": "nodeId_5", "nodeDetailString": "coalesced", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(160))"] + }, "metrics": [], "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_10", "nodeDetailString": "1", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "1" + }, "children": [ { "node": "Exchange", "nodeId": 
"nodeId_4", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "hashpartitioning(string_col#32, 2)", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -920,7 +1098,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "WholeStageCodegen (4)", "nodeId": "nodeId_20", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "metrics": [ { "duration": "any", @@ -932,7 +1110,17 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "HashAggregate", "nodeId": "nodeId_18", "nodeDetailString": "(keys\\u003d[], functions\\u003d[count(1)])", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "aggregateAttributes": ["count(1)#42L"], + "aggregateExpressions": ["count(1)"], + "groupingExpressions": [""], + "initialInputBufferOffset": "0", + "isStreaming": "false", + "numShufflePartitions": "none", + "requiredChildDistributionExpressions": [""], + "resultExpressions": ["count(1)#42L AS count#43L"] + }, "metrics": [ { "number of output rows": "any", @@ -947,19 +1135,26 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { { "node": "InputAdapter", "nodeId": "nodeId_6", - "meta": "any", + "meta": {"_dd.unparsed": "any"}, "children": [ { "node": "ShuffleQueryStage", "nodeId": "nodeId_19", "nodeDetailString": "2", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "id": "2" + }, "children": [ { "node": "Exchange", "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", - "meta": "any", + "meta": { + "_dd.unparsed": "any", + "outputPartitioning": "SinglePartition", + "shuffleOrigin": "ENSURE_REQUIREMENTS" + }, "metrics": [ { "data size": "any", @@ -1074,11 +1269,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { } assertStringSQLPlanEqualsWithConsistentNodeIds( [firstStagePlan, secondStagePlan, thirdStagePlan, fourthStagePlan], - [actualPlans[3], actualPlans[2], actualPlans[1], actualPlans[0]] + [actualPlans[3], actualPlans[2], actualPlans[1], actualPlans[0]], + ["firstStagePlan", "secondStagePlan", "thirdStagePlan", "fourthStagePlan"] ) } - static assertStringSQLPlanEqualsWithConsistentNodeIds(List expectedPlans, List actualPlans) { + static assertStringSQLPlanEqualsWithConsistentNodeIds(List expectedPlans, List actualPlans, List names) { def jsonSlurper = new JsonSlurper() def actualToNormalized = [:] @@ -1097,7 +1293,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { } def normalizedActual = normalizeNodeIds(actualParsed, actualToNormalized) - AbstractSpark24SqlTest.assertSQLPlanEquals(expectedParsed, normalizedActual) + AbstractSpark24SqlTest.assertSQLPlanEquals(expectedParsed, normalizedActual, names[i]) } } } From 750c68f16cf18f8451ad57f58556335f2ca7b081 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 16 Oct 2025 16:39:30 -0400 Subject: [PATCH 06/21] Use Abstract class for common functions --- .../instrumentation/spark/Spark212Instrumentation.java | 7 ++++--- .../trace/instrumentation/spark/Spark212PlanUtils.java | 8 ++++---- .../instrumentation/spark/Spark213Instrumentation.java | 8 ++++---- .../trace/instrumentation/spark/Spark213PlanUtils.java | 8 ++++---- ...monSparkPlanUtils.java => AbstractSparkPlanUtils.java} | 8 ++++++-- .../instrumentation/spark/AbstractSpark24SqlTest.groovy | 6 +++--- 6 files changed, 25 insertions(+), 20 
deletions(-) rename dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/{CommonSparkPlanUtils.java => AbstractSparkPlanUtils.java} (75%) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 2d7757bbdac..9ba785630e2 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -21,6 +21,7 @@ public class Spark212Instrumentation extends AbstractSparkInstrumentation { public String[] helperClassNames() { return new String[] { packageName + ".AbstractDatadogSparkListener", + packageName + ".AbstractSparkPlanUtils", packageName + ".DatabricksParentContext", packageName + ".OpenlineageParentContext", packageName + ".DatadogSpark212Listener", @@ -31,7 +32,7 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", - packageName + ".Spark212PlanUtils", + packageName + ".Spark212PlanUtils" }; } @@ -109,14 +110,14 @@ public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { if (planInfo.metadata().size() == 0) { + Spark212PlanUtils planUtils = new Spark212PlanUtils(); HashMap args = new HashMap<>(); planInfo = new SparkPlanInfo( planInfo.nodeName(), planInfo.simpleString(), planInfo.children(), - args.$plus$plus( - JavaConverters.mapAsScalaMap(Spark212PlanUtils.extractPlanProduct(plan))), + args.$plus$plus(JavaConverters.mapAsScalaMap(planUtils.extractPlanProduct(plan))), planInfo.metrics()); } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java index d060d8a6683..849b3e7eaf8 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java @@ -3,13 +3,13 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import org.apache.spark.sql.execution.SparkPlan; +import org.apache.spark.sql.catalyst.trees.TreeNode; import scala.collection.JavaConverters; // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 -public class Spark212PlanUtils { - public static Map extractPlanProduct(SparkPlan plan) { +public class Spark212PlanUtils extends AbstractSparkPlanUtils { + public Map extractPlanProduct(TreeNode plan) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); @@ -19,7 +19,7 @@ public static Map extractPlanProduct(SparkPlan plan) { Object obj = it.next(); String key = String.format("_dd.unknown_key.%d", i); - String val = CommonSparkPlanUtils.parsePlanProduct(obj); + String val = parsePlanProduct(obj); if (val != null) { args.put(key, val); } else { diff --git 
a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 37675cfa375..4490adc7385 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -21,6 +21,7 @@ public class Spark213Instrumentation extends AbstractSparkInstrumentation { public String[] helperClassNames() { return new String[] { packageName + ".AbstractDatadogSparkListener", + packageName + ".AbstractSparkPlanUtils", packageName + ".DatabricksParentContext", packageName + ".OpenlineageParentContext", packageName + ".DatadogSpark213Listener", @@ -31,8 +32,7 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", - packageName + ".Spark213PlanUtils", - packageName + ".CommonSparkPlanUtils", + packageName + ".Spark213PlanUtils" }; } @@ -111,13 +111,13 @@ public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { if (planInfo.metadata().size() == 0) { + Spark213PlanUtils planUtils = new Spark213PlanUtils(); planInfo = new SparkPlanInfo( planInfo.nodeName(), planInfo.simpleString(), planInfo.children(), - HashMap.from( - JavaConverters.asScala(Spark213PlanUtils.extractPlanProduct(plan)).toList()), + HashMap.from(JavaConverters.asScala(planUtils.extractPlanProduct(plan)).toList()), planInfo.metrics()); } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java index 8544d4b6790..9f6e64e2db7 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java @@ -3,13 +3,13 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; -import org.apache.spark.sql.execution.SparkPlan; +import org.apache.spark.sql.catalyst.trees.TreeNode; import scala.collection.JavaConverters; // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 -public class Spark213PlanUtils { - public static Map extractPlanProduct(SparkPlan plan) { +public class Spark213PlanUtils extends AbstractSparkPlanUtils { + public Map extractPlanProduct(TreeNode plan) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); @@ -19,7 +19,7 @@ public static Map extractPlanProduct(SparkPlan plan) { Object obj = it.next(); String key = plan.productElementName(i); - String val = CommonSparkPlanUtils.parsePlanProduct(obj); + String val = parsePlanProduct(obj); if (val != null) { args.put(key, val); } else { diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java 
b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java similarity index 75% rename from dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java rename to dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java index ad8ce7f1969..90868fc8fb7 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/CommonSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java @@ -1,12 +1,16 @@ package datadog.trace.instrumentation.spark; import java.util.ArrayList; +import java.util.Map; import org.apache.spark.sql.catalyst.plans.QueryPlan; +import org.apache.spark.sql.catalyst.trees.TreeNode; import scala.Option; import scala.collection.Iterable; -public class CommonSparkPlanUtils { - public static String parsePlanProduct(Object value) { +public abstract class AbstractSparkPlanUtils { + public abstract Map extractPlanProduct(TreeNode node); + + public String parsePlanProduct(Object value) { if (value == null) { return "null"; } else if (value instanceof Iterable) { diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 7efc65ca246..7124e08918f 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -183,13 +183,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { var simpleString = actual["nodeDetailString"] var values = [] var child = "N/A" - actual["meta"].each() { key, value -> + actual["meta"].each { key, value -> if (key == "_dd.unparsed") { values.add("\"_dd.unparsed\": \"any\"") child = value } else if (value instanceof List) { var list = [] - value.each() { it -> + value.each { it -> list.add("\"$it\"") } def prettyList = "[\n " + list.join(", \n ") + "\n ]" @@ -201,7 +201,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { values.add("\"$key\": \"$value\"") } } - values.sort() { it } + values.sort { it } def prettyValues = "\n\"meta\": {\n " + values.join(", \n ") + "\n}," if (values.size() == 1) { prettyValues = "\n\"meta\": {" + values.join(", ") + "}," From 0279fff1b5b1cf98b1e867b3eff82fed695fe330 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 16 Oct 2025 17:47:24 -0400 Subject: [PATCH 07/21] Use Jackson JSON parser instead of rolling own parsing --- .../spark/Spark212PlanUtils.java | 6 +-- .../spark/Spark213PlanUtils.java | 6 +-- .../spark/AbstractSparkPlanUtils.java | 24 +++++++-- .../instrumentation/spark/SparkSQLUtils.java | 24 ++++----- .../spark/AbstractSpark24SqlTest.groovy | 54 ++++++++----------- .../spark/AbstractSpark32SqlTest.groovy | 54 +++++++++---------- 6 files changed, 85 insertions(+), 83 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java index 849b3e7eaf8..4b6eb80a106 100644 
--- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java @@ -19,9 +19,9 @@ public Map extractPlanProduct(TreeNode plan) { Object obj = it.next(); String key = String.format("_dd.unknown_key.%d", i); - String val = parsePlanProduct(obj); + Object val = parsePlanProduct(obj); if (val != null) { - args.put(key, val); + args.put(key, writeObjectToString(val)); } else { unparsed.put(key, obj.getClass().getName()); } @@ -31,7 +31,7 @@ public Map extractPlanProduct(TreeNode plan) { if (unparsed.size() > 0) { // For now, place what we can't parse here with the types so we're aware of them - args.put("_dd.unparsed", unparsed.toString()); + args.put("_dd.unparsed", writeObjectToString(unparsed.toString())); } return args; } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java index 9f6e64e2db7..8816ab86b50 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java @@ -19,9 +19,9 @@ public Map extractPlanProduct(TreeNode plan) { Object obj = it.next(); String key = plan.productElementName(i); - String val = parsePlanProduct(obj); + Object val = parsePlanProduct(obj); if (val != null) { - args.put(key, val); + args.put(key, writeObjectToString(val)); } else { unparsed.put(key, obj.getClass().getName()); } @@ -31,7 +31,7 @@ public Map extractPlanProduct(TreeNode plan) { if (unparsed.size() > 0) { // For now, place what we can't parse here with the types so we're aware of them - args.put("_dd.unparsed", unparsed.toString()); + args.put("_dd.unparsed", writeObjectToString(unparsed.toString())); } return args; } diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java index 90868fc8fb7..7193f92f551 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java @@ -1,5 +1,8 @@ package datadog.trace.instrumentation.spark; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; import java.util.ArrayList; import java.util.Map; import org.apache.spark.sql.catalyst.plans.QueryPlan; @@ -8,19 +11,34 @@ import scala.collection.Iterable; public abstract class AbstractSparkPlanUtils { + private final ObjectMapper mapper = + new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + public abstract Map extractPlanProduct(TreeNode node); - public String parsePlanProduct(Object value) { + // Should only call on final values being written to `meta` + public String writeObjectToString(Object value) { + try { + return mapper.writeValueAsString(value); + } catch (IOException e) { + return null; + } + } + + // Should really only return valid JSON types 
(Array, Map, String, Boolean, Number, null) + public Object parsePlanProduct(Object value) { if (value == null) { return "null"; } else if (value instanceof Iterable) { - ArrayList list = new ArrayList<>(); + ArrayList list = new ArrayList<>(); ((Iterable) value).foreach(item -> list.add(parsePlanProduct(item))); - return "[\"" + String.join("\", \"", list) + "\"]"; + return list; } else if (value instanceof Option) { return parsePlanProduct(((Option) value).getOrElse(() -> "none")); } else if (value instanceof QueryPlan) { // Filter out values referencing child nodes return null; + } else if (value instanceof Boolean || Number.class.isInstance(value)) { + return value; } else { return value.toString(); } diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java index ba1d846dc0d..33718a4b0dc 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/SparkSQLUtils.java @@ -151,7 +151,7 @@ public String toJson(Map accumulators) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator generator = mapper.getFactory().createGenerator(baos)) { - this.toJson(generator, accumulators); + this.toJson(generator, accumulators, mapper); } catch (IOException e) { return null; } @@ -159,7 +159,8 @@ public String toJson(Map accumulators) { return new String(baos.toByteArray(), StandardCharsets.UTF_8); } - private void toJson(JsonGenerator generator, Map accumulators) + private void toJson( + JsonGenerator generator, Map accumulators, ObjectMapper mapper) throws IOException { generator.writeStartObject(); generator.writeStringField("node", plan.nodeName()); @@ -180,18 +181,11 @@ private void toJson(JsonGenerator generator, Map acc generator.writeStartObject(); for (Tuple2 metadata : JavaConverters.asJavaCollection(plan.metadata())) { - // If it looks like a string array, break apart and write as native JSON array - if (metadata._2.startsWith("[\"") && metadata._2.endsWith("\"]")) { - String[] list = metadata._2.substring(2, metadata._2.length() - 2).split("\", \""); - - generator.writeFieldName(metadata._1); - generator.writeStartArray(); - for (String entry : list) { - generator.writeString(entry); - } - generator.writeEndArray(); - } else { - generator.writeStringField(metadata._1, metadata._2); + generator.writeFieldName(metadata._1); + try { + generator.writeTree(mapper.readTree(metadata._2)); + } catch (IOException e) { + generator.writeString(metadata._2); } } @@ -219,7 +213,7 @@ private void toJson(JsonGenerator generator, Map acc generator.writeFieldName("children"); generator.writeStartArray(); for (SparkPlanInfoForStage child : children) { - child.toJson(generator, accumulators); + child.toJson(generator, accumulators, mapper); } generator.writeEndArray(); } diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 7124e08918f..2e7a0a3d966 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ 
b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -1,5 +1,6 @@ package datadog.trace.instrumentation.spark +import com.fasterxml.jackson.databind.ObjectMapper import datadog.trace.agent.test.InstrumentationSpecification import groovy.json.JsonSlurper import org.apache.spark.sql.Dataset @@ -180,32 +181,21 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { } private static generateMetaExpectations(Object actual, String name) { - var simpleString = actual["nodeDetailString"] - var values = [] - var child = "N/A" + ObjectMapper mapper = new ObjectMapper() + + def simpleString = actual["nodeDetailString"] + def child = "N/A" + def values = [:] + actual["meta"].each { key, value -> if (key == "_dd.unparsed") { - values.add("\"_dd.unparsed\": \"any\"") + values.put("_dd.unparsed", "any") child = value - } else if (value instanceof List) { - var list = [] - value.each { it -> - list.add("\"$it\"") - } - def prettyList = "[\n " + list.join(", \n ") + "\n ]" - if (list.size() == 1) { - prettyList = "[" + list.join(", ") + "]" - } - values.add("\"$key\": $prettyList") } else { - values.add("\"$key\": \"$value\"") + values.put(key, value) } } - values.sort { it } - def prettyValues = "\n\"meta\": {\n " + values.join(", \n ") + "\n}," - if (values.size() == 1) { - prettyValues = "\n\"meta\": {" + values.join(", ") + "}," - } + def prettyValues = "\n\"meta\": " + mapper.writerWithDefaultPrettyPrinter().writeValueAsString(values.sort { it -> it.key }) + "," System.err.println("$actual.node\n\tname=$name\n\tchild=$child\n\tvalues=$prettyValues\n\tsimpleString=$simpleString") } @@ -267,7 +257,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "sum#16", "count#17L" ], - "_dd.unknown_key.4": "0", + "_dd.unknown_key.4": 0, "_dd.unknown_key.5": [ "string_col#0", "sum#18", @@ -348,7 +338,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "_dd.unknown_key.1": ["string_col#0"], "_dd.unknown_key.2": ["avg(double_col#1)"], "_dd.unknown_key.3": ["avg(double_col#1)#4"], - "_dd.unknown_key.4": "1", + "_dd.unknown_key.4": 1, "_dd.unknown_key.5": [ "string_col#0", "avg(double_col#1)#4 AS avg(double_col)#5" @@ -552,10 +542,10 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeDetailString": "(keys=[], functions=[partial_count(1)])", "meta": { "_dd.unknown_key.0": "none", - "_dd.unknown_key.1": [""], + "_dd.unknown_key.1": [], "_dd.unknown_key.2": ["partial_count(1)"], "_dd.unknown_key.3": ["count#38L"], - "_dd.unknown_key.4": "0", + "_dd.unknown_key.4": 0, "_dd.unknown_key.5": ["count#39L"], "_dd.unparsed": "any" }, @@ -574,7 +564,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Project", "nodeId": 1355342585, "meta": { - "_dd.unknown_key.0": [""], + "_dd.unknown_key.0": [], "_dd.unparsed": "any" }, "children": [ @@ -618,8 +608,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeDetailString": "[string_col#21 ASC NULLS FIRST], false, 0", "meta": { "_dd.unknown_key.0": ["string_col#21 ASC NULLS FIRST"], - "_dd.unknown_key.1": "false", - "_dd.unknown_key.3": "0", + "_dd.unknown_key.1": false, + "_dd.unknown_key.3": 0, "_dd.unparsed": "any" }, "metrics": [ @@ -666,8 +656,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeDetailString": "[string_col#25 ASC NULLS FIRST], false, 0", "meta": { "_dd.unknown_key.0": 
["string_col#25 ASC NULLS FIRST"], - "_dd.unknown_key.1": "false", - "_dd.unknown_key.3": "0", + "_dd.unknown_key.1": false, + "_dd.unknown_key.3": 0, "_dd.unparsed": "any" }, "metrics": [ @@ -716,11 +706,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 724815342, "nodeDetailString": "(keys=[], functions=[count(1)])", "meta": { - "_dd.unknown_key.0": [""], - "_dd.unknown_key.1": [""], + "_dd.unknown_key.0": [], + "_dd.unknown_key.1": [], "_dd.unknown_key.2": ["count(1)"], "_dd.unknown_key.3": ["count(1)#35L"], - "_dd.unknown_key.4": "0", + "_dd.unknown_key.4": 0, "_dd.unknown_key.5": ["count(1)#35L AS count#36L"], "_dd.unparsed": "any" }, diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index 5ed32657723..a9c6bd0a466 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -82,8 +82,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { ], "aggregateExpressions": ["partial_avg(double_col#1)"], "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": "0", - "isStreaming": "false", + "initialInputBufferOffset": 0, + "isStreaming": false, "numShufflePartitions": "none", "requiredChildDistributionExpressions": "none", "resultExpressions": [ @@ -157,8 +157,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "aggregateAttributes": ["avg(double_col#1)#4"], "aggregateExpressions": ["avg(double_col#1)"], "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": "1", - "isStreaming": "false", + "initialInputBufferOffset": 1, + "isStreaming": false, "numShufflePartitions": "none", "requiredChildDistributionExpressions": ["string_col#0"], "resultExpressions": [ @@ -206,7 +206,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "0", "meta": { "_dd.unparsed": "any", - "id": "0" + "id": 0 }, "children": [ { @@ -313,8 +313,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "aggregateAttributes": ["avg(double_col#1)#4"], "aggregateExpressions": ["avg(double_col#1)"], "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": "1", - "isStreaming": "false", + "initialInputBufferOffset": 1, + "isStreaming": false, "numShufflePartitions": "none", "requiredChildDistributionExpressions": ["string_col#0"], "resultExpressions": [ @@ -362,7 +362,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "0", "meta": { "_dd.unparsed": "any", - "id": "0" + "id": 0 }, "children": [ { @@ -452,9 +452,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "[avg(double_col)#5 DESC NULLS LAST], true, 0", "meta": { "_dd.unparsed": "any", - "global": "true", + "global": true, "sortOrder": ["avg(double_col)#5 DESC NULLS LAST"], - "testSpillFrequency": "0" + "testSpillFrequency": 0 }, "metrics": [ { @@ -492,7 +492,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "1", "meta": { "_dd.unparsed": "any", - "id": "1" + "id": 1 }, "children": [ { @@ 
-796,9 +796,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "_dd.unparsed": "any", "aggregateAttributes": ["count#45L"], "aggregateExpressions": ["partial_count(1)"], - "groupingExpressions": [""], - "initialInputBufferOffset": "0", - "isStreaming": "false", + "groupingExpressions": [], + "initialInputBufferOffset": 0, + "isStreaming": false, "numShufflePartitions": "none", "requiredChildDistributionExpressions": "none", "resultExpressions": ["count#46L"] @@ -819,7 +819,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_13", "meta": { "_dd.unparsed": "any", - "projectList": [""] + "projectList": [] }, "children": [ { @@ -829,7 +829,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "meta": { "_dd.unparsed": "any", "condition": "none", - "isSkewJoin": "false", + "isSkewJoin": false, "joinType": "Inner", "leftKeys": ["string_col#28"], "rightKeys": ["string_col#32"] @@ -863,9 +863,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "[string_col#28 ASC NULLS FIRST], false, 0", "meta": { "_dd.unparsed": "any", - "global": "false", + "global": false, "sortOrder": ["string_col#28 ASC NULLS FIRST"], - "testSpillFrequency": "0" + "testSpillFrequency": 0 }, "metrics": [ { @@ -903,7 +903,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "0", "meta": { "_dd.unparsed": "any", - "id": "0" + "id": 0 }, "children": [ { @@ -984,9 +984,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "[string_col#32 ASC NULLS FIRST], false, 0", "meta": { "_dd.unparsed": "any", - "global": "false", + "global": false, "sortOrder": ["string_col#32 ASC NULLS FIRST"], - "testSpillFrequency": "0" + "testSpillFrequency": 0 }, "metrics": [ { @@ -1024,7 +1024,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "1", "meta": { "_dd.unparsed": "any", - "id": "1" + "id": 1 }, "children": [ { @@ -1114,11 +1114,11 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "_dd.unparsed": "any", "aggregateAttributes": ["count(1)#42L"], "aggregateExpressions": ["count(1)"], - "groupingExpressions": [""], - "initialInputBufferOffset": "0", - "isStreaming": "false", + "groupingExpressions": [], + "initialInputBufferOffset": 0, + "isStreaming": false, "numShufflePartitions": "none", - "requiredChildDistributionExpressions": [""], + "requiredChildDistributionExpressions": [], "resultExpressions": ["count(1)#42L AS count#43L"] }, "metrics": [ @@ -1143,7 +1143,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeDetailString": "2", "meta": { "_dd.unparsed": "any", - "id": "2" + "id": 2 }, "children": [ { From 50fa41a9af0c65bff02faee9e3ff1e5fe60b9fcd Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Fri, 17 Oct 2025 10:03:13 -0400 Subject: [PATCH 08/21] Refactor AbstractSparkPlanUtils to only require key generation on impl --- .../spark/Spark212Instrumentation.java | 3 +- .../spark/Spark212PlanUtils.java | 33 +--------- .../spark/Spark213Instrumentation.java | 3 +- .../spark/Spark213PlanUtils.java | 33 +--------- .../spark/AbstractSparkPlanUtils.java | 65 +++++++++++++++---- 5 files changed, 62 insertions(+), 75 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java 
b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 9ba785630e2..c69484c5ea9 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -117,7 +117,8 @@ public static void exit( planInfo.nodeName(), planInfo.simpleString(), planInfo.children(), - args.$plus$plus(JavaConverters.mapAsScalaMap(planUtils.extractPlanProduct(plan))), + args.$plus$plus( + JavaConverters.mapAsScalaMap(planUtils.extractFormattedProduct(plan))), planInfo.metrics()); } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java index 4b6eb80a106..fc589f5c7cd 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java @@ -1,38 +1,9 @@ package datadog.trace.instrumentation.spark; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; import org.apache.spark.sql.catalyst.trees.TreeNode; -import scala.collection.JavaConverters; -// An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: -// https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public class Spark212PlanUtils extends AbstractSparkPlanUtils { - public Map extractPlanProduct(TreeNode plan) { - HashMap args = new HashMap<>(); - HashMap unparsed = new HashMap<>(); - - int i = 0; - for (Iterator it = JavaConverters.asJavaIterator(plan.productIterator()); - it.hasNext(); ) { - Object obj = it.next(); - String key = String.format("_dd.unknown_key.%d", i); - - Object val = parsePlanProduct(obj); - if (val != null) { - args.put(key, writeObjectToString(val)); - } else { - unparsed.put(key, obj.getClass().getName()); - } - - i++; - } - - if (unparsed.size() > 0) { - // For now, place what we can't parse here with the types so we're aware of them - args.put("_dd.unparsed", writeObjectToString(unparsed.toString())); - } - return args; + public String getKey(int idx, TreeNode node) { + return String.format("_dd.unknown_key.%d", idx); } } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 4490adc7385..120b8c15300 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -117,7 +117,8 @@ public static void exit( planInfo.nodeName(), planInfo.simpleString(), planInfo.children(), - HashMap.from(JavaConverters.asScala(planUtils.extractPlanProduct(plan)).toList()), + HashMap.from( + JavaConverters.asScala(planUtils.extractFormattedProduct(plan)).toList()), planInfo.metrics()); } } diff --git 
a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java index 8816ab86b50..c4e3285a637 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java @@ -1,38 +1,9 @@ package datadog.trace.instrumentation.spark; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; import org.apache.spark.sql.catalyst.trees.TreeNode; -import scala.collection.JavaConverters; -// An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: -// https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public class Spark213PlanUtils extends AbstractSparkPlanUtils { - public Map extractPlanProduct(TreeNode plan) { - HashMap args = new HashMap<>(); - HashMap unparsed = new HashMap<>(); - - int i = 0; - for (Iterator it = JavaConverters.asJavaIterator(plan.productIterator()); - it.hasNext(); ) { - Object obj = it.next(); - String key = plan.productElementName(i); - - Object val = parsePlanProduct(obj); - if (val != null) { - args.put(key, writeObjectToString(val)); - } else { - unparsed.put(key, obj.getClass().getName()); - } - - i++; - } - - if (unparsed.size() > 0) { - // For now, place what we can't parse here with the types so we're aware of them - args.put("_dd.unparsed", writeObjectToString(unparsed.toString())); - } - return args; + public String getKey(int idx, TreeNode node) { + return node.productElementName(idx); } } diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java index 7193f92f551..2e713ef0687 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java @@ -4,20 +4,62 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; import java.util.Map; import org.apache.spark.sql.catalyst.plans.QueryPlan; import org.apache.spark.sql.catalyst.trees.TreeNode; import scala.Option; import scala.collection.Iterable; +import scala.collection.JavaConverters; +// An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: +// https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public abstract class AbstractSparkPlanUtils { + private final int MAX_DEPTH = 4; private final ObjectMapper mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public abstract Map extractPlanProduct(TreeNode node); + public abstract String getKey(int idx, TreeNode node); + + public Map extractFormattedProduct(TreeNode plan) { + HashMap result = new HashMap<>(); + extractPlanProduct(plan, 0) + .forEach( + (key, value) -> { + result.put(key, writeObjectToString(value)); + }); + return result; + } + + private Map 
extractPlanProduct(TreeNode node, int depth) { + HashMap args = new HashMap<>(); + HashMap unparsed = new HashMap<>(); + + int i = 0; + for (Iterator it = JavaConverters.asJavaIterator(node.productIterator()); + it.hasNext(); ) { + Object obj = it.next(); + + Object val = parsePlanProduct(obj, depth); + if (val != null) { + args.put(getKey(i, node), val); + } else { + unparsed.put(getKey(i, node), obj.getClass().getName()); + } + + i++; + } + + if (unparsed.size() > 0) { + // For now, place what we can't parse here with the types so we're aware of them + args.put("_dd.unparsed", unparsed); + } + return args; + } // Should only call on final values being written to `meta` - public String writeObjectToString(Object value) { + private String writeObjectToString(Object value) { try { return mapper.writeValueAsString(value); } catch (IOException e) { @@ -26,19 +68,20 @@ public String writeObjectToString(Object value) { } // Should really only return valid JSON types (Array, Map, String, Boolean, Number, null) - public Object parsePlanProduct(Object value) { + private Object parsePlanProduct(Object value, int depth) { if (value == null) { return "null"; - } else if (value instanceof Iterable) { - ArrayList list = new ArrayList<>(); - ((Iterable) value).foreach(item -> list.add(parsePlanProduct(item))); - return list; - } else if (value instanceof Option) { - return parsePlanProduct(((Option) value).getOrElse(() -> "none")); - } else if (value instanceof QueryPlan) { // Filter out values referencing child nodes - return null; } else if (value instanceof Boolean || Number.class.isInstance(value)) { return value; + } else if (value instanceof Option) { + return parsePlanProduct(((Option) value).getOrElse(() -> "none"), depth); + } else if (value instanceof QueryPlan) { + // Filter out values referencing child nodes + return null; + } else if (value instanceof Iterable && depth <= MAX_DEPTH) { + ArrayList list = new ArrayList<>(); + ((Iterable) value).foreach(item -> list.add(parsePlanProduct(item, depth + 1))); + return list; } else { return value.toString(); } From 51835fa3ddaa6d9ab490027f65049f00eafa683a Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Mon, 20 Oct 2025 17:38:10 -0400 Subject: [PATCH 09/21] Default to returning null if class not recognized, limit recursion depth and array length --- .../spark/AbstractSparkPlanUtils.java | 100 +++++- .../spark/AbstractSpark24SqlTest.groovy | 178 +++++----- .../spark/AbstractSpark32SqlTest.groovy | 307 +++++++++--------- 3 files changed, 342 insertions(+), 243 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java index 2e713ef0687..2bb7f8dfa19 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java @@ -3,11 +3,17 @@ import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import org.apache.spark.Partitioner; +import org.apache.spark.sql.catalyst.expressions.Attribute; +import org.apache.spark.sql.catalyst.plans.JoinType; import 
org.apache.spark.sql.catalyst.plans.QueryPlan; +import org.apache.spark.sql.catalyst.plans.physical.BroadcastMode; +import org.apache.spark.sql.catalyst.plans.physical.Partitioning; import org.apache.spark.sql.catalyst.trees.TreeNode; import scala.Option; import scala.collection.Iterable; @@ -17,9 +23,21 @@ // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 public abstract class AbstractSparkPlanUtils { private final int MAX_DEPTH = 4; + private final int MAX_LENGTH = 50; private final ObjectMapper mapper = new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + private final Class[] SAFE_CLASSES = { + Attribute.class, // simpleString appends data type, avoid by using toString + JoinType.class, // enum + Partitioner.class, // not a product or TreeNode + BroadcastMode.class, // not a product or TreeNode + maybeGetClass("org.apache.spark.sql.execution.exchange.ShuffleOrigin"), // enum (v3+) + maybeGetClass("org.apache.spark.sql.catalyst.optimizer.BuildSide"), // enum (v3+) + maybeGetClass( + "org.apache.spark.sql.execution.ShufflePartitionSpec"), // not a product or TreeNode (v3+) + }; + public abstract String getKey(int idx, TreeNode node); public Map extractFormattedProduct(TreeNode plan) { @@ -32,7 +50,7 @@ public Map extractFormattedProduct(TreeNode plan) { return result; } - private Map extractPlanProduct(TreeNode node, int depth) { + protected Map extractPlanProduct(TreeNode node, int depth) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); @@ -59,7 +77,7 @@ private Map extractPlanProduct(TreeNode node, int depth) { } // Should only call on final values being written to `meta` - private String writeObjectToString(Object value) { + protected String writeObjectToString(Object value) { try { return mapper.writeValueAsString(value); } catch (IOException e) { @@ -67,23 +85,87 @@ private String writeObjectToString(Object value) { } } - // Should really only return valid JSON types (Array, Map, String, Boolean, Number, null) - private Object parsePlanProduct(Object value, int depth) { + protected Object parsePlanProduct(Object value, int depth) { + // This function MUST not arbitrarily serialize the object as we can't be sure what it is. 
+ // A null return indicates object is unserializable, otherwise it should really only return + // valid + // JSON types (Array, Map, String, Boolean, Number, null) + if (value == null) { return "null"; - } else if (value instanceof Boolean || Number.class.isInstance(value)) { + } else if (value instanceof String + || value instanceof Boolean + || Number.class.isInstance(value)) { return value; } else if (value instanceof Option) { return parsePlanProduct(((Option) value).getOrElse(() -> "none"), depth); } else if (value instanceof QueryPlan) { - // Filter out values referencing child nodes + // don't duplicate child nodes return null; - } else if (value instanceof Iterable && depth <= MAX_DEPTH) { + } else if (value instanceof Iterable && depth < MAX_DEPTH) { ArrayList list = new ArrayList<>(); - ((Iterable) value).foreach(item -> list.add(parsePlanProduct(item, depth + 1))); + for (Object item : JavaConverters.asJavaIterable((Iterable) value)) { + Object res = parsePlanProduct(item, depth + 1); + if (list.size() < MAX_LENGTH && res != null) { + list.add(res); + } + } return list; - } else { + } else if (value instanceof Partitioning) { + if (value instanceof TreeNode && depth + 1 < MAX_DEPTH) { + HashMap inner = new HashMap<>(); + inner.put( + value.getClass().getSimpleName(), extractPlanProduct(((TreeNode) value), depth + 2)); + return inner; + } else { + return value.toString(); + } + } else if (instanceOf(value, SAFE_CLASSES)) { return value.toString(); + } else if (value instanceof TreeNode) { + // fallback case, leave at bottom + return getSimpleString((TreeNode) value); + } + + return null; + } + + private String getSimpleString(TreeNode value) { + try { + // in Spark v3+, the signature of `simpleString` includes an int parameter for `maxFields` + return TreeNode.class + .getDeclaredMethod("simpleString", new Class[] {int.class}) + .invoke(value, MAX_LENGTH) + .toString(); + } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException exception) { + try { + // Attempt the Spark v2 `simpleString` signature + return TreeNode.class.getDeclaredMethod("simpleString").invoke(value).toString(); + } catch (NoSuchMethodException + | IllegalAccessException + | InvocationTargetException innerException) { + } + + return null; + } + } + + // Use reflection rather than native `instanceof` for classes added in later Spark versions + private boolean instanceOf(Object value, Class[] classes) { + for (Class cls : classes) { + if (cls != null && cls.isInstance(value)) { + return true; + } + } + + return false; + } + + private Class maybeGetClass(String cls) { + try { + return Class.forName(cls); + } catch (ClassNotFoundException e) { + return null; } } } diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 2e7a0a3d966..180509cbc7c 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -105,7 +105,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { def actualUnknown = [] // List of values for all valid unknown keys actual.meta.each { actualMetaKey, actualMetaValue -> if (!expectedMeta.containsKey(actualMetaKey) && 
actualMetaKey.startsWith("_dd.unknown_key.")) { - actualUnknown.add(actualMetaValue) + actualUnknown.add(parseNestedMetaObject(actualMetaValue)) } else if (!expectedMeta.containsKey(actualMetaKey)) { throw new AssertionError(prefix + "unexpected key \"$actualMetaKey\" found, not valid unknown key with prefix '_dd.unknown_key.' or in $expectedMeta") } @@ -122,8 +122,9 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { } } else if (actualUnknown.size() > 0) { // If expected key not found, attempt to match value against those from valid unknown keys - assert actualUnknown.indexOf(expectedMetaValue) >= 0 : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta\n\tattempted to match against value \"$expectedMetaValue\" with unknown keys, but not found" - actualUnknown.drop(actualUnknown.indexOf(expectedMetaValue)) + def expectedMetaValueToCompare = parseNestedMetaObject(expectedMetaValue) + assert actualUnknown.indexOf(expectedMetaValueToCompare) >= 0 : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta\n\tattempted to match against value \"$expectedMetaValueToCompare\" with unknown keys, but not found" + actualUnknown.drop(actualUnknown.indexOf(expectedMetaValueToCompare)) } else { // Defensive, should never happen assert actualMeta.containsKey(expectedMetaKey) : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta" @@ -173,6 +174,16 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { } } + // Parse any nested objects to a standard form that ignores the keys + // Right now we do this by just asserting the key set and none of the values + static Object parseNestedMetaObject(Object value) { + if (value instanceof Map) { + return value.keySet() + } else { + return value + } + } + static String escapeJsonString(String source) { for (char c : """{}"[]()""".toCharArray()) { source = source.replace(c.toString(), "\\" + c.toString()) @@ -223,9 +234,14 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": -1909876497, "nodeDetailString": "hashpartitioning(string_col#0, 2)", "meta": { - "_dd.unknown_key.0": "hashpartitioning(string_col#0, 2)", - "_dd.unknown_key.2": "none", - "_dd.unparsed": "any" + "_dd.unknown_key.0" : { + "HashPartitioning" : { + "_dd.unknown_key.0" : [ "string_col#0" ], + "_dd.unknown_key.1" : 2 + } + }, + "_dd.unknown_key.2" : "none", + "_dd.unparsed" : "any" }, "metrics": [ { @@ -250,20 +266,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 1128016273, "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", "meta": { - "_dd.unknown_key.0": "none", - "_dd.unknown_key.1": ["string_col#0"], - "_dd.unknown_key.2": ["partial_avg(double_col#1)"], - "_dd.unknown_key.3": [ - "sum#16", - "count#17L" - ], - "_dd.unknown_key.4": 0, - "_dd.unknown_key.5": [ - "string_col#0", - "sum#18", - "count#19L" - ], - "_dd.unparsed": "any" + "_dd.unknown_key.0" : "none", + "_dd.unknown_key.1" : [ "string_col#0" ], + "_dd.unknown_key.2" : [ "partial_avg(double_col#1)" ], + "_dd.unknown_key.3" : [ "sum#16", "count#17L" ], + "_dd.unknown_key.4" : 0, + "_dd.unknown_key.5" : [ "string_col#0", "sum#18", "count#19L" ], + "_dd.unparsed" : "any" }, "metrics": [ { @@ -290,15 +299,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 1632930767, "nodeDetailString": "[string_col#0, double_col#1]", "meta": { - "_dd.unknown_key.0": [ - "string_col#0", - "double_col#1" - ], - 
"_dd.unknown_key.1": [ - "[first,1.2]", - "[first,1.3]", - "[second,1.6]" - ] + "_dd.unknown_key.0" : [ "string_col#0", "double_col#1" ], + "_dd.unknown_key.1" : [ ] }, "metrics": [ { @@ -334,16 +336,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 126020943, "nodeDetailString": "(keys=[string_col#0], functions=[avg(double_col#1)])", "meta": { - "_dd.unknown_key.0": ["string_col#0"], - "_dd.unknown_key.1": ["string_col#0"], - "_dd.unknown_key.2": ["avg(double_col#1)"], - "_dd.unknown_key.3": ["avg(double_col#1)#4"], - "_dd.unknown_key.4": 1, - "_dd.unknown_key.5": [ - "string_col#0", - "avg(double_col#1)#4 AS avg(double_col)#5" - ], - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ "string_col#0" ], + "_dd.unknown_key.1" : [ "string_col#0" ], + "_dd.unknown_key.2" : [ "avg(double_col#1)" ], + "_dd.unknown_key.3" : [ "avg(double_col#1)#4" ], + "_dd.unknown_key.4" : 1, + "_dd.unknown_key.5" : [ "string_col#0", "avg(double_col#1)#4 AS avg(double_col)#5" ], + "_dd.unparsed" : "any" }, "metrics": [ { @@ -437,9 +436,14 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#25, 2)", "meta": { - "_dd.unknown_key.0": "hashpartitioning(string_col#25, 2)", - "_dd.unknown_key.2": "none", - "_dd.unparsed": "any" + "_dd.unknown_key.0" : { + "HashPartitioning" : { + "_dd.unknown_key.0" : [ "string_col#25" ], + "_dd.unknown_key.1" : 2 + } + }, + "_dd.unknown_key.2" : "none", + "_dd.unparsed" : "any" }, "metrics": [ { @@ -453,12 +457,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "[string_col#25]", "meta": { - "_dd.unknown_key.0": ["string_col#25"], - "_dd.unknown_key.1": [ - "[first]", - "[first]", - "[second]" - ] + "_dd.unknown_key.0" : [ "string_col#25" ], + "_dd.unknown_key.1" : [ ] }, "metrics": [ { @@ -476,9 +476,14 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#21, 2)", "meta": { - "_dd.unknown_key.0": "hashpartitioning(string_col#21, 2)", - "_dd.unknown_key.2": "none", - "_dd.unparsed": "any" + "_dd.unknown_key.0" : { + "HashPartitioning" : { + "_dd.unknown_key.0" : [ "string_col#21" ], + "_dd.unknown_key.1" : 2 + } + }, + "_dd.unknown_key.2" : "none", + "_dd.unparsed" : "any" }, "metrics": [ { @@ -492,11 +497,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "[string_col#21]", "meta": { - "_dd.unknown_key.0": ["string_col#21"], - "_dd.unknown_key.1": [ - "[first]", - "[second]" - ] + "_dd.unknown_key.0" : [ "string_col#21" ], + "_dd.unknown_key.1" : [ ] }, "metrics": [ { @@ -514,9 +516,9 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": -1350402171, "nodeDetailString": "SinglePartition", "meta": { - "_dd.unknown_key.0": "SinglePartition", - "_dd.unknown_key.2": "none", - "_dd.unparsed": "any" + "_dd.unknown_key.0" : "SinglePartition", + "_dd.unknown_key.2" : "none", + "_dd.unparsed" : "any" }, "metrics": [ { @@ -541,13 +543,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": -879128980, "nodeDetailString": "(keys=[], functions=[partial_count(1)])", "meta": { - "_dd.unknown_key.0": "none", - "_dd.unknown_key.1": [], - "_dd.unknown_key.2": ["partial_count(1)"], - "_dd.unknown_key.3": ["count#38L"], - "_dd.unknown_key.4": 0, - "_dd.unknown_key.5": 
["count#39L"], - "_dd.unparsed": "any" + "_dd.unknown_key.0" : "none", + "_dd.unknown_key.1" : [ ], + "_dd.unknown_key.2" : [ "partial_count(1)" ], + "_dd.unknown_key.3" : [ "count#38L" ], + "_dd.unknown_key.4" : 0, + "_dd.unknown_key.5" : [ "count#39L" ], + "_dd.unparsed" : "any" }, "metrics": [ { @@ -564,8 +566,8 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "node": "Project", "nodeId": 1355342585, "meta": { - "_dd.unknown_key.0": [], - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ ], + "_dd.unparsed" : "any" }, "children": [ { @@ -573,11 +575,11 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": -1975876610, "nodeDetailString": "[string_col#21], [string_col#25], Inner", "meta": { - "_dd.unknown_key.0": ["string_col#21"], - "_dd.unknown_key.1": ["string_col#25"], - "_dd.unknown_key.2": "Inner", - "_dd.unknown_key.3": "none", - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ "string_col#21" ], + "_dd.unknown_key.1" : [ "string_col#25" ], + "_dd.unknown_key.2" : "Inner", + "_dd.unknown_key.3" : "none", + "_dd.unparsed" : "any" }, "metrics": [ { @@ -607,10 +609,10 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 66807398, "nodeDetailString": "[string_col#21 ASC NULLS FIRST], false, 0", "meta": { - "_dd.unknown_key.0": ["string_col#21 ASC NULLS FIRST"], - "_dd.unknown_key.1": false, - "_dd.unknown_key.3": 0, - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ "string_col#21 ASC NULLS FIRST" ], + "_dd.unknown_key.1" : false, + "_dd.unknown_key.3" : 0, + "_dd.unparsed" : "any" }, "metrics": [ { @@ -655,10 +657,10 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": -952138782, "nodeDetailString": "[string_col#25 ASC NULLS FIRST], false, 0", "meta": { - "_dd.unknown_key.0": ["string_col#25 ASC NULLS FIRST"], - "_dd.unknown_key.1": false, - "_dd.unknown_key.3": 0, - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ "string_col#25 ASC NULLS FIRST" ], + "_dd.unknown_key.1" : false, + "_dd.unknown_key.3" : 0, + "_dd.unparsed" : "any" }, "metrics": [ { @@ -706,13 +708,13 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { "nodeId": 724815342, "nodeDetailString": "(keys=[], functions=[count(1)])", "meta": { - "_dd.unknown_key.0": [], - "_dd.unknown_key.1": [], - "_dd.unknown_key.2": ["count(1)"], - "_dd.unknown_key.3": ["count(1)#35L"], - "_dd.unknown_key.4": 0, - "_dd.unknown_key.5": ["count(1)#35L AS count#36L"], - "_dd.unparsed": "any" + "_dd.unknown_key.0" : [ ], + "_dd.unknown_key.1" : [ ], + "_dd.unknown_key.2" : [ "count(1)" ], + "_dd.unknown_key.3" : [ "count(1)#35L" ], + "_dd.unknown_key.4" : 0, + "_dd.unknown_key.5" : [ "count(1)#35L AS count#36L" ], + "_dd.unparsed" : "any" }, "metrics": [ { diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index a9c6bd0a466..7b64eff2fe6 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -36,9 +36,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 
2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#0, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#0" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -75,22 +80,15 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_3", "nodeDetailString": "(keys=[string_col#0], functions=[partial_avg(double_col#1)])", "meta": { - "_dd.unparsed": "any", - "aggregateAttributes": [ - "sum#15", - "count#16L" - ], - "aggregateExpressions": ["partial_avg(double_col#1)"], - "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": 0, - "isStreaming": false, - "numShufflePartitions": "none", - "requiredChildDistributionExpressions": "none", - "resultExpressions": [ - "string_col#0", - "sum#17", - "count#18L" - ] + "_dd.unparsed" : "any", + "aggregateAttributes" : [ "sum#15", "count#16L" ], + "aggregateExpressions" : [ "partial_avg(double_col#1)" ], + "groupingExpressions" : [ "string_col#0" ], + "initialInputBufferOffset" : 0, + "isStreaming" : false, + "numShufflePartitions" : "none", + "requiredChildDistributionExpressions" : "none", + "resultExpressions" : [ "string_col#0", "sum#17", "count#18L" ] }, "metrics": [ { @@ -112,15 +110,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_2", "nodeDetailString": "[string_col#0, double_col#1]", "meta": { - "output": [ - "string_col#0", - "double_col#1" - ], - "rows": [ - "[first,1.2]", - "[first,1.3]", - "[second,1.6]" - ] + "output" : [ "string_col#0", "double_col#1" ], + "rows" : [ ] }, "metrics": [ { @@ -153,18 +144,15 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", "meta": { - "_dd.unparsed": "any", - "aggregateAttributes": ["avg(double_col#1)#4"], - "aggregateExpressions": ["avg(double_col#1)"], - "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": 1, - "isStreaming": false, - "numShufflePartitions": "none", - "requiredChildDistributionExpressions": ["string_col#0"], - "resultExpressions": [ - "string_col#0", - "avg(double_col#1)#4 AS avg(double_col)#5" - ] + "_dd.unparsed" : "any", + "aggregateAttributes" : [ "avg(double_col#1)#4" ], + "aggregateExpressions" : [ "avg(double_col#1)" ], + "groupingExpressions" : [ "string_col#0" ], + "initialInputBufferOffset" : 1, + "isStreaming" : false, + "numShufflePartitions" : "none", + "requiredChildDistributionExpressions" : [ "string_col#0" ], + "resultExpressions" : [ "string_col#0", "avg(double_col#1)#4 AS avg(double_col)#5" ] }, "metrics": [ { @@ -195,8 +183,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_5", "nodeDetailString": "coalesced", "meta": { - "_dd.unparsed": "any", - "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(186))"] + "_dd.unparsed" : "any", + "partitionSpecs" : [ "CoalescedPartitionSpec(0,2,Some(186))" ] }, "metrics": [], "children": [ @@ -214,9 +202,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#0, 2)", - 
"shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#0" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -270,9 +263,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "RangePartitioning" : { + "ordering" : [ "avg(double_col)#5 DESC NULLS LAST" ], + "numPartitions" : 2 + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -309,18 +307,15 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_9", "nodeDetailString": "(keys\\u003d[string_col#0], functions\\u003d[avg(double_col#1)])", "meta": { - "_dd.unparsed": "any", - "aggregateAttributes": ["avg(double_col#1)#4"], - "aggregateExpressions": ["avg(double_col#1)"], - "groupingExpressions": ["string_col#0"], - "initialInputBufferOffset": 1, - "isStreaming": false, - "numShufflePartitions": "none", - "requiredChildDistributionExpressions": ["string_col#0"], - "resultExpressions": [ - "string_col#0", - "avg(double_col#1)#4 AS avg(double_col)#5" - ] + "_dd.unparsed" : "any", + "aggregateAttributes" : [ "avg(double_col#1)#4" ], + "aggregateExpressions" : [ "avg(double_col#1)" ], + "groupingExpressions" : [ "string_col#0" ], + "initialInputBufferOffset" : 1, + "isStreaming" : false, + "numShufflePartitions" : "none", + "requiredChildDistributionExpressions" : [ "string_col#0" ], + "resultExpressions" : [ "string_col#0", "avg(double_col#1)#4 AS avg(double_col)#5" ] }, "metrics": [ { @@ -351,8 +346,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_5", "nodeDetailString": "coalesced", "meta": { - "_dd.unparsed": "any", - "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(186))"] + "_dd.unparsed" : "any", + "partitionSpecs" : [ "CoalescedPartitionSpec(0,2,Some(186))" ] }, "metrics": [], "children": [ @@ -370,9 +365,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#0, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d38]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#0, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#0" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -439,11 +439,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_11", "nodeDetailString": "[string_col#0, cast(avg(double_col)#5 as string) AS avg(double_col)#12]", "meta": { - "_dd.unparsed": "any", - "projectList": [ - "string_col#0", - "cast(avg(double_col)#5 as string) AS avg(double_col)#12" - ] + "_dd.unparsed" : "any", + "projectList" : [ "string_col#0", "cast(avg(double_col)#5 as string) AS avg(double_col)#12" ] }, "children": [ { @@ -451,10 +448,10 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_13", "nodeDetailString": "[avg(double_col)#5 DESC NULLS LAST], true, 
0", "meta": { - "_dd.unparsed": "any", - "global": true, - "sortOrder": ["avg(double_col)#5 DESC NULLS LAST"], - "testSpillFrequency": 0 + "_dd.unparsed" : "any", + "global" : true, + "sortOrder" : [ "avg(double_col)#5 DESC NULLS LAST" ], + "testSpillFrequency" : 0 }, "metrics": [ { @@ -481,8 +478,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_5", "nodeDetailString": "coalesced", "meta": { - "_dd.unparsed": "any", - "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(152))"] + "_dd.unparsed" : "any", + "partitionSpecs" : [ "CoalescedPartitionSpec(0,2,Some(152))" ] }, "metrics": [], "children": [ @@ -500,9 +497,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_10", "nodeDetailString": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d67]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "rangepartitioning(avg(double_col)#5 DESC NULLS LAST, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "RangePartitioning" : { + "ordering" : [ "avg(double_col)#5 DESC NULLS LAST" ], + "numPartitions" : 2 + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -653,9 +655,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#28, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#28" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -681,11 +688,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "[string_col#28]", "meta": { - "output": ["string_col#28"], - "rows": [ - "[first]", - "[second]" - ] + "output" : [ "string_col#28" ], + "rows" : [ ] }, "metrics": [ { @@ -703,9 +707,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#32, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#32" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -731,12 +740,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "any", "nodeDetailString": "[string_col#32]", "meta": { - "output": ["string_col#32"], - "rows": [ - "[first]", - "[first]", - "[second]" - ] + "output" : [ "string_col#32" ], + "rows" : [ ] }, "metrics": [ { @@ -754,9 +759,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "SinglePartition", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : "SinglePartition", + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -793,15 +798,15 @@ abstract class AbstractSpark32SqlTest extends 
InstrumentationSpecification { "nodeId": "nodeId_16", "nodeDetailString": "(keys\\u003d[], functions\\u003d[partial_count(1)])", "meta": { - "_dd.unparsed": "any", - "aggregateAttributes": ["count#45L"], - "aggregateExpressions": ["partial_count(1)"], - "groupingExpressions": [], - "initialInputBufferOffset": 0, - "isStreaming": false, - "numShufflePartitions": "none", - "requiredChildDistributionExpressions": "none", - "resultExpressions": ["count#46L"] + "_dd.unparsed" : "any", + "aggregateAttributes" : [ "count#45L" ], + "aggregateExpressions" : [ "partial_count(1)" ], + "groupingExpressions" : [ ], + "initialInputBufferOffset" : 0, + "isStreaming" : false, + "numShufflePartitions" : "none", + "requiredChildDistributionExpressions" : "none", + "resultExpressions" : [ "count#46L" ] }, "metrics": [ { @@ -818,8 +823,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "node": "Project", "nodeId": "nodeId_13", "meta": { - "_dd.unparsed": "any", - "projectList": [] + "_dd.unparsed" : "any", + "projectList" : [ ] }, "children": [ { @@ -827,12 +832,12 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_15", "nodeDetailString": "[string_col#28], [string_col#32], Inner", "meta": { - "_dd.unparsed": "any", - "condition": "none", - "isSkewJoin": false, - "joinType": "Inner", - "leftKeys": ["string_col#28"], - "rightKeys": ["string_col#32"] + "_dd.unparsed" : "any", + "condition" : "none", + "isSkewJoin" : false, + "joinType" : "Inner", + "leftKeys" : [ "string_col#28" ], + "rightKeys" : [ "string_col#32" ] }, "metrics": [ { @@ -862,10 +867,10 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_11", "nodeDetailString": "[string_col#28 ASC NULLS FIRST], false, 0", "meta": { - "_dd.unparsed": "any", - "global": false, - "sortOrder": ["string_col#28 ASC NULLS FIRST"], - "testSpillFrequency": 0 + "_dd.unparsed" : "any", + "global" : false, + "sortOrder" : [ "string_col#28 ASC NULLS FIRST" ], + "testSpillFrequency" : 0 }, "metrics": [ { @@ -892,8 +897,8 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_5", "nodeDetailString": "coalesced", "meta": { - "_dd.unparsed": "any", - "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(144))"] + "_dd.unparsed" : "any", + "partitionSpecs" : [ "CoalescedPartitionSpec(0,2,Some(144))" ] }, "metrics": [], "children": [ @@ -911,9 +916,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_2", "nodeDetailString": "hashpartitioning(string_col#28, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d119]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#28, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#28" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -983,10 +993,10 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_17", "nodeDetailString": "[string_col#32 ASC NULLS FIRST], false, 0", "meta": { - "_dd.unparsed": "any", - "global": false, - "sortOrder": ["string_col#32 ASC NULLS FIRST"], - "testSpillFrequency": 0 + "_dd.unparsed" : "any", + "global" : false, + "sortOrder" : [ "string_col#32 ASC NULLS FIRST" ], + "testSpillFrequency" : 0 }, "metrics": [ { @@ -1013,8 +1023,8 @@ abstract class AbstractSpark32SqlTest extends 
InstrumentationSpecification { "nodeId": "nodeId_5", "nodeDetailString": "coalesced", "meta": { - "_dd.unparsed": "any", - "partitionSpecs": ["CoalescedPartitionSpec(0,2,Some(160))"] + "_dd.unparsed" : "any", + "partitionSpecs" : [ "CoalescedPartitionSpec(0,2,Some(160))" ] }, "metrics": [], "children": [ @@ -1032,9 +1042,14 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_4", "nodeDetailString": "hashpartitioning(string_col#32, 2), ENSURE_REQUIREMENTS, [plan_id\\u003d120]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "hashpartitioning(string_col#32, 2)", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : { + "HashPartitioning" : { + "numPartitions" : 2, + "expressions" : [ "string_col#32" ] + } + }, + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { @@ -1111,15 +1126,15 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_18", "nodeDetailString": "(keys\\u003d[], functions\\u003d[count(1)])", "meta": { - "_dd.unparsed": "any", - "aggregateAttributes": ["count(1)#42L"], - "aggregateExpressions": ["count(1)"], - "groupingExpressions": [], - "initialInputBufferOffset": 0, - "isStreaming": false, - "numShufflePartitions": "none", - "requiredChildDistributionExpressions": [], - "resultExpressions": ["count(1)#42L AS count#43L"] + "_dd.unparsed" : "any", + "aggregateAttributes" : [ "count(1)#42L" ], + "aggregateExpressions" : [ "count(1)" ], + "groupingExpressions" : [ ], + "initialInputBufferOffset" : 0, + "isStreaming" : false, + "numShufflePartitions" : "none", + "requiredChildDistributionExpressions" : [ ], + "resultExpressions" : [ "count(1)#42L AS count#43L" ] }, "metrics": [ { @@ -1151,9 +1166,9 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { "nodeId": "nodeId_7", "nodeDetailString": "SinglePartition, ENSURE_REQUIREMENTS, [plan_id\\u003d230]", "meta": { - "_dd.unparsed": "any", - "outputPartitioning": "SinglePartition", - "shuffleOrigin": "ENSURE_REQUIREMENTS" + "_dd.unparsed" : "any", + "outputPartitioning" : "SinglePartition", + "shuffleOrigin" : "ENSURE_REQUIREMENTS" }, "metrics": [ { From c68b3565370441595b1ee1c740650ac9e1719f31 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Mon, 20 Oct 2025 18:59:40 -0400 Subject: [PATCH 10/21] Improve testing scheme for Spark32 on Scala 212 with unknown keys --- .../spark/AbstractSpark24SqlTest.groovy | 64 +++++++++++-------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 180509cbc7c..37cd660f58f 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -102,33 +102,40 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { assert actualMeta.size() == expectedMeta.size() : prefix + "meta size of $expectedMeta does not match $actualMeta" - def actualUnknown = [] // List of values for all valid unknown keys + def numKnownKeys = 0 actual.meta.each { actualMetaKey, actualMetaValue -> - if (!expectedMeta.containsKey(actualMetaKey) && 
actualMetaKey.startsWith("_dd.unknown_key.")) { - actualUnknown.add(parseNestedMetaObject(actualMetaValue)) - } else if (!expectedMeta.containsKey(actualMetaKey)) { + if (expectedMeta.containsKey(actualMetaKey)) { + numKnownKeys += 1 + } else if (!actualMetaKey.startsWith("_dd.unknown_key.")) { throw new AssertionError(prefix + "unexpected key \"$actualMetaKey\" found, not valid unknown key with prefix '_dd.unknown_key.' or in $expectedMeta") } } - expected.meta.each { expectedMetaKey, expectedMetaValue -> - if (actualMeta.containsKey(expectedMetaKey)) { - def actualMetaValue = actualMeta[expectedMetaKey] - if (expectedMetaValue instanceof List) { - assert expectedMetaValue ==~ actualMetaValue : prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + // Assume unknown keys are all or nothing (a single known key means we must compare keys) + if (numKnownKeys == actual.meta.size()) { + expected.meta.each { expectedMetaKey, expectedMetaValue -> + if (actualMeta.containsKey(expectedMetaKey)) { + def actualMetaValue = actualMeta[expectedMetaKey] + if (expectedMetaValue instanceof List) { + assert expectedMetaValue ==~ actualMetaValue : prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + } else { + // Don't assert meta values where expectation is "any" + assert expectedMetaValue == "any" || expectedMetaValue == actualMetaValue: prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + } } else { - // Don't assert meta values where expectation is "any" - assert expectedMetaValue == "any" || expectedMetaValue == actualMetaValue: prefix + "value of meta key \"$expectedMetaKey\" does not match \"$expectedMetaValue\", got \"$actualMetaValue\"" + // Defensive, should never happen + assert actualMeta.containsKey(expectedMetaKey) : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta" } - } else if (actualUnknown.size() > 0) { - // If expected key not found, attempt to match value against those from valid unknown keys - def expectedMetaValueToCompare = parseNestedMetaObject(expectedMetaValue) - assert actualUnknown.indexOf(expectedMetaValueToCompare) >= 0 : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta\n\tattempted to match against value \"$expectedMetaValueToCompare\" with unknown keys, but not found" - actualUnknown.drop(actualUnknown.indexOf(expectedMetaValueToCompare)) - } else { - // Defensive, should never happen - assert actualMeta.containsKey(expectedMetaKey) : prefix + "meta key \"$expectedMetaKey\" not found in $actualMeta" } + } else { + def expectedValuesList = getMetaValues(expectedMeta) + def actualValuesList = getMetaValues(actualMeta) + + // We filter out items marked as "any" in `expectedValuesList`, so check that subset of expected values exists + // in the actual list returned. Note this could mean a value that was designated for "any" is used to validate + // another key's value incorrectly: + // e.g. 
actual={a:b, c:d}, expected={a:any, c:b} + assert actualValuesList.containsAll(expectedValuesList) : prefix + "expected values not contained in actual, expected $expectedValuesList, got $actualValuesList" } } catch (AssertionError e) { generateMetaExpectations(actual, name) @@ -174,14 +181,19 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { } } - // Parse any nested objects to a standard form that ignores the keys - // Right now we do this by just asserting the key set and none of the values - static Object parseNestedMetaObject(Object value) { - if (value instanceof Map) { - return value.keySet() - } else { - return value + static List getMetaValues(Map obj) { + List res = new ArrayList() + obj.values().forEach { val -> + if (val == "any") { + // skip + } else if (val instanceof Map) { + res.add(getMetaValues(val)) + } else { + res.add(val) + } } + // sort to keep nested values stable + return res.sort() } static String escapeJsonString(String source) { From 8bf8488e237a3ec778a44d969335ea9a2e55cfa7 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Tue, 21 Oct 2025 11:22:39 -0400 Subject: [PATCH 11/21] Improve method & class naming, reuse ObjectMapper from listener --- .../spark/Spark212Instrumentation.java | 2 +- ...Utils.java => Spark212PlanSerializer.java} | 2 +- .../spark/Spark213Instrumentation.java | 2 +- ...Utils.java => Spark213PlanSerializer.java} | 2 +- .../spark/AbstractDatadogSparkListener.java | 2 +- ....java => AbstractSparkPlanSerializer.java} | 23 ++++++++----------- 6 files changed, 15 insertions(+), 18 deletions(-) rename dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/{Spark212PlanUtils.java => Spark212PlanSerializer.java} (73%) rename dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/{Spark213PlanUtils.java => Spark213PlanSerializer.java} (72%) rename dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/{AbstractSparkPlanUtils.java => AbstractSparkPlanSerializer.java} (87%) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index c69484c5ea9..c5413c1b1c5 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -110,7 +110,7 @@ public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { if (planInfo.metadata().size() == 0) { - Spark212PlanUtils planUtils = new Spark212PlanUtils(); + Spark212PlanSerializer planUtils = new Spark212PlanSerializer(); HashMap args = new HashMap<>(); planInfo = new SparkPlanInfo( diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java similarity index 73% rename from dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java rename to 
dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java index fc589f5c7cd..7390bd3e33d 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java @@ -2,7 +2,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode; -public class Spark212PlanUtils extends AbstractSparkPlanUtils { +public class Spark212PlanSerializer extends AbstractSparkPlanSerializer { public String getKey(int idx, TreeNode node) { return String.format("_dd.unknown_key.%d", idx); } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 120b8c15300..bf71805a24b 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -111,7 +111,7 @@ public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { if (planInfo.metadata().size() == 0) { - Spark213PlanUtils planUtils = new Spark213PlanUtils(); + Spark213PlanSerializer planUtils = new Spark213PlanSerializer(); planInfo = new SparkPlanInfo( planInfo.nodeName(), diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java similarity index 72% rename from dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java rename to dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java index c4e3285a637..fa21564a2a8 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanUtils.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java @@ -2,7 +2,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode; -public class Spark213PlanUtils extends AbstractSparkPlanUtils { +public class Spark213PlanSerializer extends AbstractSparkPlanSerializer { public String getKey(int idx, TreeNode node) { return node.productElementName(idx); } diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java index 2404803cb92..bdbe4554593 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java @@ -66,7 +66,7 @@ */ public abstract class AbstractDatadogSparkListener extends SparkListener { private static final Logger log = LoggerFactory.getLogger(AbstractDatadogSparkListener.class); - private static final 
ObjectMapper objectMapper = new ObjectMapper(); + public static final ObjectMapper objectMapper = new ObjectMapper(); public static volatile AbstractDatadogSparkListener listener = null; public static volatile boolean finishTraceOnApplicationEnd = true; diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java similarity index 87% rename from dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java rename to dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java index 2bb7f8dfa19..804178cc360 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanUtils.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java @@ -1,6 +1,5 @@ package datadog.trace.instrumentation.spark; -import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.lang.reflect.InvocationTargetException; @@ -21,11 +20,10 @@ // An extension of how Spark translates `SparkPlan`s to `SparkPlanInfo`, see here: // https://github.com/apache/spark/blob/v3.5.0/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala#L54 -public abstract class AbstractSparkPlanUtils { +public abstract class AbstractSparkPlanSerializer { private final int MAX_DEPTH = 4; private final int MAX_LENGTH = 50; - private final ObjectMapper mapper = - new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + private final ObjectMapper mapper = AbstractDatadogSparkListener.objectMapper; private final Class[] SAFE_CLASSES = { Attribute.class, // simpleString appends data type, avoid by using toString @@ -42,7 +40,7 @@ public abstract class AbstractSparkPlanUtils { public Map extractFormattedProduct(TreeNode plan) { HashMap result = new HashMap<>(); - extractPlanProduct(plan, 0) + safeParseTreeNode(plan, 0) .forEach( (key, value) -> { result.put(key, writeObjectToString(value)); @@ -50,7 +48,7 @@ public Map extractFormattedProduct(TreeNode plan) { return result; } - protected Map extractPlanProduct(TreeNode node, int depth) { + protected Map safeParseTreeNode(TreeNode node, int depth) { HashMap args = new HashMap<>(); HashMap unparsed = new HashMap<>(); @@ -59,7 +57,7 @@ protected Map extractPlanProduct(TreeNode node, int depth) { it.hasNext(); ) { Object obj = it.next(); - Object val = parsePlanProduct(obj, depth); + Object val = safeParseObjectToJson(obj, depth); if (val != null) { args.put(getKey(i, node), val); } else { @@ -85,11 +83,10 @@ protected String writeObjectToString(Object value) { } } - protected Object parsePlanProduct(Object value, int depth) { + protected Object safeParseObjectToJson(Object value, int depth) { // This function MUST not arbitrarily serialize the object as we can't be sure what it is. 
// A null return indicates object is unserializable, otherwise it should really only return - // valid - // JSON types (Array, Map, String, Boolean, Number, null) + // valid JSON types (Array, Map, String, Boolean, Number, null) if (value == null) { return "null"; @@ -98,14 +95,14 @@ protected Object parsePlanProduct(Object value, int depth) { || Number.class.isInstance(value)) { return value; } else if (value instanceof Option) { - return parsePlanProduct(((Option) value).getOrElse(() -> "none"), depth); + return safeParseObjectToJson(((Option) value).getOrElse(() -> "none"), depth); } else if (value instanceof QueryPlan) { // don't duplicate child nodes return null; } else if (value instanceof Iterable && depth < MAX_DEPTH) { ArrayList list = new ArrayList<>(); for (Object item : JavaConverters.asJavaIterable((Iterable) value)) { - Object res = parsePlanProduct(item, depth + 1); + Object res = safeParseObjectToJson(item, depth + 1); if (list.size() < MAX_LENGTH && res != null) { list.add(res); } @@ -115,7 +112,7 @@ protected Object parsePlanProduct(Object value, int depth) { if (value instanceof TreeNode && depth + 1 < MAX_DEPTH) { HashMap inner = new HashMap<>(); inner.put( - value.getClass().getSimpleName(), extractPlanProduct(((TreeNode) value), depth + 2)); + value.getClass().getSimpleName(), safeParseTreeNode(((TreeNode) value), depth + 2)); return inner; } else { return value.toString(); From 55b917b51ab471e2bff040f183acbaa8bb66ccd4 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Tue, 21 Oct 2025 11:39:13 -0400 Subject: [PATCH 12/21] Gate Spark Plan parsing with flag --- .../spark/Spark212Instrumentation.java | 2 +- .../spark/Spark213Instrumentation.java | 2 +- .../spark/AbstractSpark24SqlTest.groovy | 1 + .../spark/AbstractSpark32SqlTest.groovy | 1 + .../main/java/datadog/trace/api/ConfigDefaults.java | 1 + .../java/datadog/trace/api/config/GeneralConfig.java | 2 ++ .../src/main/java/datadog/trace/api/Config.java | 12 ++++++++++++ 7 files changed, 19 insertions(+), 2 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index c5413c1b1c5..009e81715f4 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -109,7 +109,7 @@ public static class SparkPlanInfoAdvice { public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { - if (planInfo.metadata().size() == 0) { + if (planInfo.metadata().size() == 0 && Config.get().isDataJobsParseSparkPlanEnabled()) { Spark212PlanSerializer planUtils = new Spark212PlanSerializer(); HashMap args = new HashMap<>(); planInfo = diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index bf71805a24b..3cb215fe484 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ 
b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -110,7 +110,7 @@ public static class SparkPlanInfoAdvice { public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { - if (planInfo.metadata().size() == 0) { + if (planInfo.metadata().size() == 0 && Config.get().isDataJobsParseSparkPlanEnabled()) { Spark213PlanSerializer planUtils = new Spark213PlanSerializer(); planInfo = new SparkPlanInfo( diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 37cd660f58f..158932ab3a9 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -16,6 +16,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { super.configurePreAgent() injectSysConfig("dd.integration.spark.enabled", "true") injectSysConfig("dd.integration.openlineage-spark.enabled", "true") + injectSysConfig("dd.data.jobs.parse_spark_plan.enabled", "true") } static Dataset generateSampleDataframe(SparkSession spark) { diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index 7b64eff2fe6..1590c228352 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -11,6 +11,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { super.configurePreAgent() injectSysConfig("dd.integration.spark.enabled", "true") injectSysConfig("dd.integration.spark-openlineage.enabled", "true") + injectSysConfig("dd.data.jobs.parse_spark_plan.enabled", "true") } def "compute a GROUP BY sql query plan"() { diff --git a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java index 8580c6dfe36..cea0d0063a4 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java @@ -231,6 +231,7 @@ public final class ConfigDefaults { static final boolean DEFAULT_DATA_JOBS_ENABLED = false; static final boolean DEFAULT_DATA_JOBS_OPENLINEAGE_ENABLED = false; static final boolean DEFAULT_DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED = true; + static final boolean DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED = false; static final boolean DEFAULT_DATA_STREAMS_ENABLED = false; static final int DEFAULT_DATA_STREAMS_BUCKET_DURATION = 10; // seconds diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index c3f3d63ca8b..de6f65e597e 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -80,6 
+80,8 @@ public final class GeneralConfig { public static final String DATA_JOBS_OPENLINEAGE_ENABLED = "data.jobs.openlineage.enabled"; public static final String DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED = "data.jobs.openlineage.timeout.enabled"; + public static final String DATA_JOBS_PARSE_SPARK_PLAN_ENABLED = + "data.jobs.parse_spark_plan.enabled"; public static final String DATA_STREAMS_ENABLED = "data.streams.enabled"; public static final String DATA_STREAMS_BUCKET_DURATION_SECONDS = diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index ef0a02114da..529c8077579 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -50,6 +50,7 @@ import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_OPENLINEAGE_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED; +import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_STREAMS_BUCKET_DURATION; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_STREAMS_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DB_CLIENT_HOST_SPLIT_BY_HOST; @@ -344,6 +345,7 @@ import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_OPENLINEAGE_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED; +import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_PARSE_SPARK_PLAN_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_STREAMS_BUCKET_DURATION_SECONDS; import static datadog.trace.api.config.GeneralConfig.DATA_STREAMS_ENABLED; import static datadog.trace.api.config.GeneralConfig.DOGSTATSD_ARGS; @@ -1189,6 +1191,7 @@ public static String getHostName() { private final boolean dataJobsEnabled; private final boolean dataJobsOpenLineageEnabled; private final boolean dataJobsOpenLineageTimeoutEnabled; + private final boolean dataJobsParseSparkPlanEnabled; private final boolean dataStreamsEnabled; private final float dataStreamsBucketDurationSeconds; @@ -2627,6 +2630,9 @@ PROFILING_DATADOG_PROFILER_ENABLED, isDatadogProfilerSafeInCurrentEnvironment()) dataJobsOpenLineageTimeoutEnabled = configProvider.getBoolean( DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED, DEFAULT_DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED); + dataJobsParseSparkPlanEnabled = + configProvider.getBoolean( + DATA_JOBS_PARSE_SPARK_PLAN_ENABLED, DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED); dataStreamsEnabled = configProvider.getBoolean(DATA_STREAMS_ENABLED, DEFAULT_DATA_STREAMS_ENABLED); @@ -4556,6 +4562,10 @@ public boolean isDataJobsOpenLineageTimeoutEnabled() { return dataJobsOpenLineageTimeoutEnabled; } + public boolean isDataJobsParseSparkPlanEnabled() { + return dataJobsParseSparkPlanEnabled; + } + public boolean isApmTracingEnabled() { return apmTracingEnabled; } @@ -5898,6 +5908,8 @@ public String toString() { + dataJobsOpenLineageEnabled + ", dataJobsOpenLineageTimeoutEnabled=" + dataJobsOpenLineageTimeoutEnabled + + ", dataJobsParseSparkPlanEnabled=" + + dataJobsParseSparkPlanEnabled + ", apmTracingEnabled=" + apmTracingEnabled + ", jdkSocketEnabled=" From 047880ac14e6d01468e78f79dbbd8df2fcce9d70 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Tue, 21 Oct 2025 17:18:28 -0400 
Subject: [PATCH 13/21] Match classes by string comparison, add negative cache

---
 .../spark/AbstractSparkPlanSerializer.java | 90 ++++++++++++++-----
 1 file changed, 66 insertions(+), 24 deletions(-)

diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
index 804178cc360..df554d53a70 100644
--- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
+++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
@@ -4,14 +4,13 @@
 import java.io.IOException;
 import java.lang.reflect.InvocationTargetException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
-import org.apache.spark.Partitioner;
-import org.apache.spark.sql.catalyst.expressions.Attribute;
-import org.apache.spark.sql.catalyst.plans.JoinType;
+import java.util.Set;
 import org.apache.spark.sql.catalyst.plans.QueryPlan;
-import org.apache.spark.sql.catalyst.plans.physical.BroadcastMode;
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning;
 import org.apache.spark.sql.catalyst.trees.TreeNode;
 import scala.Option;
@@ -25,16 +24,37 @@ public abstract class AbstractSparkPlanSerializer {
   private final int MAX_LENGTH = 50;
   private final ObjectMapper mapper = AbstractDatadogSparkListener.objectMapper;
-  private final Class[] SAFE_CLASSES = {
-    Attribute.class, // simpleString appends data type, avoid by using toString
-    JoinType.class, // enum
-    Partitioner.class, // not a product or TreeNode
-    BroadcastMode.class, // not a product or TreeNode
-    maybeGetClass("org.apache.spark.sql.execution.exchange.ShuffleOrigin"), // enum (v3+)
-    maybeGetClass("org.apache.spark.sql.catalyst.optimizer.BuildSide"), // enum (v3+)
-    maybeGetClass(
-        "org.apache.spark.sql.execution.ShufflePartitionSpec"), // not a product or TreeNode (v3+)
-  };
+  private final String SPARK_PKG_NAME = "org.apache.spark";
+  private final Set SAFE_CLASS_NAMES =
+      new HashSet<>(
+          Arrays.asList(
+              SPARK_PKG_NAME + ".Partitioner", // not a product or TreeNode
+              SPARK_PKG_NAME
+                  + ".sql.catalyst.expressions.Attribute", // avoid data type added by simpleString
+              SPARK_PKG_NAME + ".sql.catalyst.optimizer.BuildSide", // enum (v3+)
+              SPARK_PKG_NAME + ".sql.catalyst.plans.JoinType", // enum
+              SPARK_PKG_NAME
+                  + ".sql.catalyst.plans.physical.BroadcastMode", // not a product or TreeNode
+              SPARK_PKG_NAME
+                  + ".sql.execution.ShufflePartitionSpec", // not a product or TreeNode (v3+)
+              SPARK_PKG_NAME + ".sql.execution.exchange.ShuffleOrigin" // enum (v3+)
+              ));
+
+  // Add class here if we want to break inheritance and interface traversal early when we see
+  // this class. Any class added must be a class whose parents we do not want to match
+  // (inclusive of the class itself).
+  private final Set NEGATIVE_CACHE_CLASSES =
+      new HashSet<>(
+          Arrays.asList(
+              "java.io.Serializable",
+              "java.lang.Object",
+              "scala.Equals",
+              "scala.Product",
+              SPARK_PKG_NAME + ".sql.catalyst.InternalRow",
+              SPARK_PKG_NAME + ".sql.catalyst.expressions.Expression",
+              SPARK_PKG_NAME + ".sql.catalyst.expressions.UnaryExpression",
+              SPARK_PKG_NAME + ".sql.catalyst.expressions.Unevaluable",
+              SPARK_PKG_NAME + ".sql.catalyst.trees.TreeNode"));

   public abstract String getKey(int idx, TreeNode node);

@@ -117,7 +137,7 @@ protected Object safeParseObjectToJson(Object value, int depth) {
       } else {
         return value.toString();
       }
-    } else if (instanceOf(value, SAFE_CLASSES)) {
+    } else if (traversedInstanceOf(value, SAFE_CLASS_NAMES, NEGATIVE_CACHE_CLASSES)) {
       return value.toString();
     } else if (value instanceof TreeNode) {
       // fallback case, leave at bottom
@@ -147,22 +167,44 @@ private String getSimpleString(TreeNode value) {
     }
   }
-  // Use reflection rather than native `instanceof` for classes added in later Spark versions
-  private boolean instanceOf(Object value, Class[] classes) {
-    for (Class cls : classes) {
-      if (cls != null && cls.isInstance(value)) {
+  private boolean traversedInstanceOf(
+      Object value, Set expectedClasses, Set negativeCache) {
+    if (instanceOf(value.getClass(), expectedClasses, negativeCache)) {
+      return true;
+    }
+
+    // Traverse up inheritance tree to check for matches
+    int lim = 0;
+    Class currClass = value.getClass();
+    while (currClass.getSuperclass() != null && lim < MAX_DEPTH) {
+      currClass = currClass.getSuperclass();
+      if (negativeCache.contains(currClass.getName())) {
+        // don't traverse known paths
+        break;
+      }
+      if (instanceOf(currClass, expectedClasses, negativeCache)) {
         return true;
       }
+      lim += 1;
     }
     return false;
   }
-  private Class maybeGetClass(String cls) {
-    try {
-      return Class.forName(cls);
-    } catch (ClassNotFoundException e) {
-      return null;
+  private boolean instanceOf(Class cls, Set expectedClasses, Set negativeCache) {
+    // Match on strings to avoid class loading errors
+    if (expectedClasses.contains(cls.getName())) {
+      return true;
     }
+
+    // Check interfaces as well
+    for (Class interfaceClass : cls.getInterfaces()) {
+      if (!negativeCache.contains(interfaceClass.getName())
+          && instanceOf(interfaceClass, expectedClasses, negativeCache)) {
+        return true;
+      }
+    }
+
+    return false;
   }
 }
From 3619b77e1b5459fb26ab692696df677862eb7020 Mon Sep 17 00:00:00 2001
From: Charles Yu
Date: Wed, 22 Oct 2025 23:20:37 -0400
Subject: [PATCH 14/21] Add unit tests for AbstractSparkPlanSerializer

---
 .../spark/Spark212PlanSerializer.java         |  1 +
 .../spark/Spark213PlanSerializer.java         |  1 +
 .../spark/AbstractSparkPlanSerializer.java    | 11 +-
 .../spark/SparkPlanSerializerTestCases.groovy | 97 +++++++++++++++++++
 .../spark/SparkPlanSerializerTest.java        | 11 +++
 5 files changed, 115 insertions(+), 6 deletions(-)
 create mode 100644 dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/SparkPlanSerializerTestCases.groovy
 create mode 100644 dd-java-agent/instrumentation/spark/src/testFixtures/java/datadog/trace/instrumentation/spark/SparkPlanSerializerTest.java

diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java
index 7390bd3e33d..90f8f21a162 100644
---
a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212PlanSerializer.java @@ -3,6 +3,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode; public class Spark212PlanSerializer extends AbstractSparkPlanSerializer { + @Override public String getKey(int idx, TreeNode node) { return String.format("_dd.unknown_key.%d", idx); } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java index fa21564a2a8..ca20859a768 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213PlanSerializer.java @@ -3,6 +3,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode; public class Spark213PlanSerializer extends AbstractSparkPlanSerializer { + @Override public String getKey(int idx, TreeNode node) { return node.productElementName(idx); } diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java index df554d53a70..9e397ddf98c 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java @@ -13,6 +13,7 @@ import org.apache.spark.sql.catalyst.plans.QueryPlan; import org.apache.spark.sql.catalyst.plans.physical.Partitioning; import org.apache.spark.sql.catalyst.trees.TreeNode; +import org.apache.spark.sql.execution.SparkPlan; import scala.Option; import scala.collection.Iterable; import scala.collection.JavaConverters; @@ -58,7 +59,7 @@ public abstract class AbstractSparkPlanSerializer { public abstract String getKey(int idx, TreeNode node); - public Map extractFormattedProduct(TreeNode plan) { + public Map extractFormattedProduct(SparkPlan plan) { HashMap result = new HashMap<>(); safeParseTreeNode(plan, 0) .forEach( @@ -110,9 +111,7 @@ protected Object safeParseObjectToJson(Object value, int depth) { if (value == null) { return "null"; - } else if (value instanceof String - || value instanceof Boolean - || Number.class.isInstance(value)) { + } else if (value instanceof String || value instanceof Boolean || value instanceof Number) { return value; } else if (value instanceof Option) { return safeParseObjectToJson(((Option) value).getOrElse(() -> "none"), depth); @@ -129,10 +128,10 @@ protected Object safeParseObjectToJson(Object value, int depth) { } return list; } else if (value instanceof Partitioning) { - if (value instanceof TreeNode && depth + 1 < MAX_DEPTH) { + if (value instanceof TreeNode && depth < MAX_DEPTH) { HashMap inner = new HashMap<>(); inner.put( - value.getClass().getSimpleName(), safeParseTreeNode(((TreeNode) value), depth + 2)); + value.getClass().getSimpleName(), safeParseTreeNode(((TreeNode) value), depth + 1)); return inner; } else { return value.toString(); diff --git 
a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/SparkPlanSerializerTestCases.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/SparkPlanSerializerTestCases.groovy
new file mode 100644
index 00000000000..e781735023e
--- /dev/null
+++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/SparkPlanSerializerTestCases.groovy
@@ -0,0 +1,97 @@
+package datadog.trace.instrumentation.spark
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.catalyst.expressions.ExprId
+import org.apache.spark.sql.catalyst.expressions.PrettyAttribute
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
+import org.apache.spark.sql.catalyst.expressions.aggregate.Count
+import org.apache.spark.sql.catalyst.expressions.aggregate.Partial$
+import org.apache.spark.sql.execution.InputAdapter
+import org.apache.spark.sql.types.NullType
+import scala.collection.JavaConverters
+import spock.lang.Specification
+
+class SparkPlanSerializerTestCases extends Specification {
+  def "should not recurse lists more than 4 levels deep"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+
+    def list = JavaConverters.collectionAsScalaIterable(Arrays.asList("hello", "bye"))
+    for (int i=0; i < 10; i++) {
+      list = JavaConverters.collectionAsScalaIterable(Arrays.asList(list, list))
+    }
+
+    when:
+    def res = serializer.safeParseObjectToJson(list, 0)
+
+    then:
+    assert res instanceof ArrayList
+    assert res.toString() == "[[[[], []], [[], []]], [[[], []], [[], []]]]"
+  }
+
+  def "should not parse lists more than 50 long"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+
+    def list = new ArrayList()
+    for (int i=0; i < 100; i++) {
+      list.add(i)
+    }
+    list = JavaConverters.collectionAsScalaIterable(list)
+
+    when:
+    def res = serializer.safeParseObjectToJson(list, 0)
+
+    then:
+    assert res instanceof ArrayList
+    assert res.size() == 50
+    assert res.toString() == "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]"
+  }
+
+  def "unknown objects should return null"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+
+    when:
+    def res = serializer.safeParseObjectToJson(new SparkConf(), 0)
+
+    then:
+    assert res == null
+  }
+
+  def "QueryPlan inheritors should return null"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+
+    when:
+    def res = serializer.safeParseObjectToJson(new InputAdapter(null), 0)
+
+    then:
+    assert res == null
+  }
+
+  def "inheritors of safe classes should return string"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+
+    when:
+    def res = serializer.safeParseObjectToJson(new PrettyAttribute("test", new NullType()), 0)
+
+    then:
+    assert res instanceof String
+    assert res == "test"
+  }
+
+  def "TreeNode inheritors that are not QueryPlans should return simpleString"() {
+    given:
+    def serializer = new SparkPlanSerializerTest()
+    def expression = new AggregateExpression(new Count(null), new Partial$(), false, new ExprId(0, UUID.randomUUID()))
+
+    when:
+    def res = serializer.safeParseObjectToJson(expression, 0)
+
+    then:
+    assert res instanceof String
+    assert res == "partial_count(null)"
+  }
+}
diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/java/datadog/trace/instrumentation/spark/SparkPlanSerializerTest.java
b/dd-java-agent/instrumentation/spark/src/testFixtures/java/datadog/trace/instrumentation/spark/SparkPlanSerializerTest.java new file mode 100644 index 00000000000..ee9e26a774c --- /dev/null +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/java/datadog/trace/instrumentation/spark/SparkPlanSerializerTest.java @@ -0,0 +1,11 @@ +package datadog.trace.instrumentation.spark; + +import org.apache.spark.sql.catalyst.trees.TreeNode; + +public class SparkPlanSerializerTest extends AbstractSparkPlanSerializer { + + @Override + public String getKey(int idx, TreeNode node) { + return String.valueOf(idx); + } +} From 53918a3c16503991c2937305fb3599cd6e279ef4 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 23 Oct 2025 10:48:29 -0400 Subject: [PATCH 15/21] Make ObjectMapper protected on AbstractDatadogSparkListener instead of public --- .../instrumentation/spark/AbstractDatadogSparkListener.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java index bdbe4554593..3f2d79cdb63 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractDatadogSparkListener.java @@ -66,7 +66,7 @@ */ public abstract class AbstractDatadogSparkListener extends SparkListener { private static final Logger log = LoggerFactory.getLogger(AbstractDatadogSparkListener.class); - public static final ObjectMapper objectMapper = new ObjectMapper(); + protected static final ObjectMapper objectMapper = new ObjectMapper(); public static volatile AbstractDatadogSparkListener listener = null; public static volatile boolean finishTraceOnApplicationEnd = true; From 6e68c69d7641ac26266e7d0765139f0b40c283c4 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 23 Oct 2025 14:18:12 -0400 Subject: [PATCH 16/21] Specify correct helper class names --- .../trace/instrumentation/spark/Spark212Instrumentation.java | 4 ++-- .../trace/instrumentation/spark/Spark213Instrumentation.java | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 009e81715f4..209d1ef9b29 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -21,7 +21,7 @@ public class Spark212Instrumentation extends AbstractSparkInstrumentation { public String[] helperClassNames() { return new String[] { packageName + ".AbstractDatadogSparkListener", - packageName + ".AbstractSparkPlanUtils", + packageName + ".AbstractSparkPlanSerializer", packageName + ".DatabricksParentContext", packageName + ".OpenlineageParentContext", packageName + ".DatadogSpark212Listener", @@ -32,7 +32,7 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", - packageName + 
".Spark212PlanUtils" + packageName + ".Spark212PlanSerializer" }; } diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 3cb215fe484..07b01c78c38 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -21,7 +21,7 @@ public class Spark213Instrumentation extends AbstractSparkInstrumentation { public String[] helperClassNames() { return new String[] { packageName + ".AbstractDatadogSparkListener", - packageName + ".AbstractSparkPlanUtils", + packageName + ".AbstractSparkPlanSerializer", packageName + ".DatabricksParentContext", packageName + ".OpenlineageParentContext", packageName + ".DatadogSpark213Listener", @@ -32,7 +32,7 @@ public String[] helperClassNames() { packageName + ".SparkSQLUtils", packageName + ".SparkSQLUtils$SparkPlanInfoForStage", packageName + ".SparkSQLUtils$AccumulatorWithStage", - packageName + ".Spark213PlanUtils" + packageName + ".Spark213PlanSerializer" }; } From 5483ba8098c011d47386d25394a473748af071ff Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Thu, 23 Oct 2025 14:29:24 -0400 Subject: [PATCH 17/21] Add dd.data.jobs.experimental_features.enabled FF --- .../spark/Spark212Instrumentation.java | 4 +++- .../spark/Spark213Instrumentation.java | 4 +++- .../spark/AbstractSpark24SqlTest.groovy | 2 +- .../spark/AbstractSpark32SqlTest.groovy | 2 +- .../main/java/datadog/trace/api/ConfigDefaults.java | 1 + .../datadog/trace/api/config/GeneralConfig.java | 2 ++ .../src/main/java/datadog/trace/api/Config.java | 13 +++++++++++++ 7 files changed, 24 insertions(+), 4 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 209d1ef9b29..0807d7be44b 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -109,7 +109,9 @@ public static class SparkPlanInfoAdvice { public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { - if (planInfo.metadata().size() == 0 && Config.get().isDataJobsParseSparkPlanEnabled()) { + if (planInfo.metadata().size() == 0 + && (Config.get().isDataJobsParseSparkPlanEnabled() + || Config.get().isDataJobsExperimentalFeaturesEnabled())) { Spark212PlanSerializer planUtils = new Spark212PlanSerializer(); HashMap args = new HashMap<>(); planInfo = diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index 07b01c78c38..d9f731bdced 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ 
b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -110,7 +110,9 @@ public static class SparkPlanInfoAdvice { public static void exit( @Advice.Return(readOnly = false) SparkPlanInfo planInfo, @Advice.Argument(0) SparkPlan plan) { - if (planInfo.metadata().size() == 0 && Config.get().isDataJobsParseSparkPlanEnabled()) { + if (planInfo.metadata().size() == 0 + && (Config.get().isDataJobsParseSparkPlanEnabled() + || Config.get().isDataJobsExperimentalFeaturesEnabled())) { Spark213PlanSerializer planUtils = new Spark213PlanSerializer(); planInfo = new SparkPlanInfo( diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy index 158932ab3a9..a34941d1be0 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark24SqlTest.groovy @@ -16,7 +16,7 @@ abstract class AbstractSpark24SqlTest extends InstrumentationSpecification { super.configurePreAgent() injectSysConfig("dd.integration.spark.enabled", "true") injectSysConfig("dd.integration.openlineage-spark.enabled", "true") - injectSysConfig("dd.data.jobs.parse_spark_plan.enabled", "true") + injectSysConfig("dd.data.jobs.experimental_features.enabled", "true") } static Dataset generateSampleDataframe(SparkSession spark) { diff --git a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy index 1590c228352..ee97f8ed87b 100644 --- a/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy +++ b/dd-java-agent/instrumentation/spark/src/testFixtures/groovy/datadog/trace/instrumentation/spark/AbstractSpark32SqlTest.groovy @@ -11,7 +11,7 @@ abstract class AbstractSpark32SqlTest extends InstrumentationSpecification { super.configurePreAgent() injectSysConfig("dd.integration.spark.enabled", "true") injectSysConfig("dd.integration.spark-openlineage.enabled", "true") - injectSysConfig("dd.data.jobs.parse_spark_plan.enabled", "true") + injectSysConfig("dd.data.jobs.experimental_features.enabled", "true") } def "compute a GROUP BY sql query plan"() { diff --git a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java index cea0d0063a4..160745eac9f 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/ConfigDefaults.java @@ -232,6 +232,7 @@ public final class ConfigDefaults { static final boolean DEFAULT_DATA_JOBS_OPENLINEAGE_ENABLED = false; static final boolean DEFAULT_DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED = true; static final boolean DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED = false; + static final boolean DEFAULT_DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED = false; static final boolean DEFAULT_DATA_STREAMS_ENABLED = false; static final int DEFAULT_DATA_STREAMS_BUCKET_DURATION = 10; // seconds diff --git a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java 
b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java index de6f65e597e..3d656029562 100644 --- a/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java +++ b/dd-trace-api/src/main/java/datadog/trace/api/config/GeneralConfig.java @@ -82,6 +82,8 @@ public final class GeneralConfig { "data.jobs.openlineage.timeout.enabled"; public static final String DATA_JOBS_PARSE_SPARK_PLAN_ENABLED = "data.jobs.parse_spark_plan.enabled"; + public static final String DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED = + "data.jobs.experimental_features.enabled"; public static final String DATA_STREAMS_ENABLED = "data.streams.enabled"; public static final String DATA_STREAMS_BUCKET_DURATION_SECONDS = diff --git a/internal-api/src/main/java/datadog/trace/api/Config.java b/internal-api/src/main/java/datadog/trace/api/Config.java index 529c8077579..cd4a0e2d7f7 100644 --- a/internal-api/src/main/java/datadog/trace/api/Config.java +++ b/internal-api/src/main/java/datadog/trace/api/Config.java @@ -48,6 +48,7 @@ import static datadog.trace.api.ConfigDefaults.DEFAULT_CWS_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_CWS_TLS_REFRESH; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_ENABLED; +import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_OPENLINEAGE_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED; import static datadog.trace.api.ConfigDefaults.DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED; @@ -343,6 +344,7 @@ import static datadog.trace.api.config.GeneralConfig.APP_KEY; import static datadog.trace.api.config.GeneralConfig.AZURE_APP_SERVICES; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_ENABLED; +import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_OPENLINEAGE_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_OPENLINEAGE_TIMEOUT_ENABLED; import static datadog.trace.api.config.GeneralConfig.DATA_JOBS_PARSE_SPARK_PLAN_ENABLED; @@ -1192,6 +1194,7 @@ public static String getHostName() { private final boolean dataJobsOpenLineageEnabled; private final boolean dataJobsOpenLineageTimeoutEnabled; private final boolean dataJobsParseSparkPlanEnabled; + private final boolean dataJobsExperimentalFeaturesEnabled; private final boolean dataStreamsEnabled; private final float dataStreamsBucketDurationSeconds; @@ -2633,6 +2636,10 @@ PROFILING_DATADOG_PROFILER_ENABLED, isDatadogProfilerSafeInCurrentEnvironment()) dataJobsParseSparkPlanEnabled = configProvider.getBoolean( DATA_JOBS_PARSE_SPARK_PLAN_ENABLED, DEFAULT_DATA_JOBS_PARSE_SPARK_PLAN_ENABLED); + dataJobsExperimentalFeaturesEnabled = + configProvider.getBoolean( + DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED, + DEFAULT_DATA_JOBS_EXPERIMENTAL_FEATURES_ENABLED); dataStreamsEnabled = configProvider.getBoolean(DATA_STREAMS_ENABLED, DEFAULT_DATA_STREAMS_ENABLED); @@ -4566,6 +4573,10 @@ public boolean isDataJobsParseSparkPlanEnabled() { return dataJobsParseSparkPlanEnabled; } + public boolean isDataJobsExperimentalFeaturesEnabled() { + return dataJobsExperimentalFeaturesEnabled; + } + public boolean isApmTracingEnabled() { return apmTracingEnabled; } @@ -5910,6 +5921,8 @@ public String toString() { + dataJobsOpenLineageTimeoutEnabled + ", dataJobsParseSparkPlanEnabled=" + dataJobsParseSparkPlanEnabled + + ", 
dataJobsExperimentalFeaturesEnabled=" + + dataJobsExperimentalFeaturesEnabled + ", apmTracingEnabled=" + apmTracingEnabled + ", jdkSocketEnabled=" From e4973fce3c455adad608dc72971a5688c89fde11 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Fri, 24 Oct 2025 11:39:43 -0400 Subject: [PATCH 18/21] Remove knownMatchingTypes override from version-specific impls --- .../spark/Spark212Instrumentation.java | 12 ------------ .../spark/Spark213Instrumentation.java | 12 ------------ .../spark/AbstractSparkInstrumentation.java | 3 ++- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java index 0807d7be44b..bd49b214402 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.12/src/main/java/datadog/trace/instrumentation/spark/Spark212Instrumentation.java @@ -36,18 +36,6 @@ public String[] helperClassNames() { }; } - @Override - public String[] knownMatchingTypes() { - String[] res = new String[super.knownMatchingTypes().length + 1]; - int idx = 0; - for (String match : super.knownMatchingTypes()) { - res[idx] = match; - idx++; - } - res[idx] = "org.apache.spark.sql.execution.SparkPlanInfo$"; - return res; - } - @Override public void methodAdvice(MethodTransformer transformer) { super.methodAdvice(transformer); diff --git a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java index d9f731bdced..b114d8746ea 100644 --- a/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java +++ b/dd-java-agent/instrumentation/spark/spark_2.13/src/main/java/datadog/trace/instrumentation/spark/Spark213Instrumentation.java @@ -36,18 +36,6 @@ public String[] helperClassNames() { }; } - @Override - public String[] knownMatchingTypes() { - String[] res = new String[super.knownMatchingTypes().length + 1]; - int idx = 0; - for (String match : super.knownMatchingTypes()) { - res[idx] = match; - idx++; - } - res[idx] = "org.apache.spark.sql.execution.SparkPlanInfo$"; - return res; - } - @Override public void methodAdvice(MethodTransformer transformer) { super.methodAdvice(transformer); diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkInstrumentation.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkInstrumentation.java index 102f3a65ccd..0277e9446f0 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkInstrumentation.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkInstrumentation.java @@ -36,7 +36,8 @@ public String[] knownMatchingTypes() { "org.apache.spark.deploy.yarn.ApplicationMaster", "org.apache.spark.util.Utils", "org.apache.spark.util.SparkClassUtils", - "org.apache.spark.scheduler.LiveListenerBus" + "org.apache.spark.scheduler.LiveListenerBus", + "org.apache.spark.sql.execution.SparkPlanInfo$" }; } From 
5527ad058235e56cee5c715bc197c811d302b798 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Fri, 24 Oct 2025 13:07:01 -0400 Subject: [PATCH 19/21] Catch NullPointerException for getDeclaredMethod calls --- .../spark/AbstractSparkPlanSerializer.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java index 9e397ddf98c..e79ef5935bf 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java @@ -127,7 +127,7 @@ protected Object safeParseObjectToJson(Object value, int depth) { } } return list; - } else if (value instanceof Partitioning) { + } else if (Partitioning.class.isInstance(value)) { if (value instanceof TreeNode && depth < MAX_DEPTH) { HashMap inner = new HashMap<>(); inner.put( @@ -153,11 +153,15 @@ private String getSimpleString(TreeNode value) { .getDeclaredMethod("simpleString", new Class[] {int.class}) .invoke(value, MAX_LENGTH) .toString(); - } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException exception) { + } catch (NullPointerException + | NoSuchMethodException + | IllegalAccessException + | InvocationTargetException exception) { try { // Attempt the Spark v2 `simpleString` signature return TreeNode.class.getDeclaredMethod("simpleString").invoke(value).toString(); - } catch (NoSuchMethodException + } catch (NullPointerException + | NoSuchMethodException | IllegalAccessException | InvocationTargetException innerException) { } From 1f31add635bad0e4cb5584edd36ff6857425e0b3 Mon Sep 17 00:00:00 2001 From: Charles Yu Date: Fri, 24 Oct 2025 13:34:45 -0400 Subject: [PATCH 20/21] Adjust more gates to match classes using string comparison --- .../spark/AbstractSparkPlanSerializer.java | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java index e79ef5935bf..333d575d8c4 100644 --- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java +++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java @@ -11,7 +11,6 @@ import java.util.Map; import java.util.Set; import org.apache.spark.sql.catalyst.plans.QueryPlan; -import org.apache.spark.sql.catalyst.plans.physical.Partitioning; import org.apache.spark.sql.catalyst.trees.TreeNode; import org.apache.spark.sql.execution.SparkPlan; import scala.Option; @@ -26,7 +25,10 @@ public abstract class AbstractSparkPlanSerializer { private final ObjectMapper mapper = AbstractDatadogSparkListener.objectMapper; private final String SPARK_PKG_NAME = "org.apache.spark"; - private final Set SAFE_CLASS_NAMES = + + private final Set SAFE_PARSE_TRAVERSE = + new HashSet<>(Arrays.asList(SPARK_PKG_NAME + ".sql.catalyst.plans.physical.Partitioning")); + private final Set SAFE_PARSE_STRING = new HashSet<>( Arrays.asList( SPARK_PKG_NAME + ".Partitioner", // not a product or TreeNode @@ -42,9 
+44,8 @@ public abstract class AbstractSparkPlanSerializer { )); // Add class here if we want to break inheritance and interface traversal early when we see - // this class. Any class added must be a class whose parents we do not want to match - // (inclusive of the class itself). - private final Set NEGATIVE_CACHE_CLASSES = + // this class. In other words, we explicitly do not match these classes or their parents + private final Set NEGATIVE_CACHE = new HashSet<>( Arrays.asList( "java.io.Serializable", @@ -52,7 +53,6 @@ public abstract class AbstractSparkPlanSerializer { "scala.Equals", "scala.Product", SPARK_PKG_NAME + ".sql.catalyst.InternalRow", - SPARK_PKG_NAME + ".sql.catalyst.expressions.Expression", SPARK_PKG_NAME + ".sql.catalyst.expressions.UnaryExpression", SPARK_PKG_NAME + ".sql.catalyst.expressions.Unevaluable", SPARK_PKG_NAME + ".sql.catalyst.trees.TreeNode")); @@ -127,7 +127,7 @@ protected Object safeParseObjectToJson(Object value, int depth) { } } return list; - } else if (Partitioning.class.isInstance(value)) { + } else if (instanceOf(value, SAFE_PARSE_TRAVERSE, NEGATIVE_CACHE)) { if (value instanceof TreeNode && depth < MAX_DEPTH) { HashMap inner = new HashMap<>(); inner.put( @@ -136,7 +136,7 @@ protected Object safeParseObjectToJson(Object value, int depth) { } else { return value.toString(); } - } else if (traversedInstanceOf(value, SAFE_CLASS_NAMES, NEGATIVE_CACHE_CLASSES)) { + } else if (instanceOf(value, SAFE_PARSE_STRING, NEGATIVE_CACHE)) { return value.toString(); } else if (value instanceof TreeNode) { // fallback case, leave at bottom @@ -170,9 +170,11 @@ private String getSimpleString(TreeNode value) { } } - private boolean traversedInstanceOf( - Object value, Set expectedClasses, Set negativeCache) { - if (instanceOf(value.getClass(), expectedClasses, negativeCache)) { + // Matches a class to a set of expected classes. Returns true if the class or any + // of the interfaces or parent classes it implements matches a class in `expectedClasses`. + // Will not attempt to match any classes identified in `negativeCache` on traversal + private boolean instanceOf(Object value, Set expectedClasses, Set negativeCache) { + if (classOrInterfaceInstanceOf(value.getClass(), expectedClasses, negativeCache)) { return true; } @@ -185,7 +187,7 @@ private boolean traversedInstanceOf( // don't traverse known paths break; } - if (instanceOf(currClass, expectedClasses, negativeCache)) { + if (classOrInterfaceInstanceOf(currClass, expectedClasses, negativeCache)) { return true; } lim += 1; @@ -194,7 +196,11 @@ private boolean traversedInstanceOf( return false; } - private boolean instanceOf(Class cls, Set expectedClasses, Set negativeCache) { + // Matches a class to a set of expected classes. Returns true if the class or any + // of the interfaces it implements matches a class in `expectedClasses`. Will not + // attempt to match any classes identified in `negativeCache`. 
+  private boolean classOrInterfaceInstanceOf(
+      Class cls, Set expectedClasses, Set negativeCache) {
     // Match on strings to avoid class loading errors
     if (expectedClasses.contains(cls.getName())) {
       return true;
@@ -203,7 +209,7 @@ private boolean instanceOf(Class cls, Set expectedClasses, Set n
     // Check interfaces as well
     for (Class interfaceClass : cls.getInterfaces()) {
       if (!negativeCache.contains(interfaceClass.getName())
-          && instanceOf(interfaceClass, expectedClasses, negativeCache)) {
+          && classOrInterfaceInstanceOf(interfaceClass, expectedClasses, negativeCache)) {
         return true;
       }
     }
From 18e51d5fddfd564cbb61b55486058f671444447e Mon Sep 17 00:00:00 2001
From: Charles Yu
Date: Fri, 24 Oct 2025 14:21:06 -0400
Subject: [PATCH 21/21] Revert "Catch NullPointerException for getDeclaredMethod calls"

This reverts commit 5527ad058235e56cee5c715bc197c811d302b798.
---
 .../spark/AbstractSparkPlanSerializer.java | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
index 333d575d8c4..5817b4542d8 100644
--- a/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
+++ b/dd-java-agent/instrumentation/spark/src/main/java/datadog/trace/instrumentation/spark/AbstractSparkPlanSerializer.java
@@ -153,15 +153,11 @@ private String getSimpleString(TreeNode value) {
           .getDeclaredMethod("simpleString", new Class[] {int.class})
           .invoke(value, MAX_LENGTH)
           .toString();
-    } catch (NullPointerException
-        | NoSuchMethodException
-        | IllegalAccessException
-        | InvocationTargetException exception) {
+    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException exception) {
       try {
         // Attempt the Spark v2 `simpleString` signature
         return TreeNode.class.getDeclaredMethod("simpleString").invoke(value).toString();
-      } catch (NullPointerException
-          | NoSuchMethodException
+      } catch (NoSuchMethodException
           | IllegalAccessException
           | InvocationTargetException innerException) {
       }