Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions dags.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2505,3 +2505,25 @@ bqetl_market_intel_bot:
tags:
- impact/tier_3
- repo/bigquery-etl

# Daily DAG (09:00, per schedule_interval) for the gecko trace ETL.
# Tagged like the other DAGs in this file: impact tier plus owning repo.
bqetl_gecko_trace:
  catchup: false
  default_args:
    depends_on_past: false
    email:
      - [email protected]
      - [email protected]
    email_on_failure: true
    email_on_retry: true
    end_date: null
    max_active_tis_per_dag: null
    owner: [email protected]
    retries: 2
    retry_delay: 30m
    start_date: "2025-09-26"
  description: |
    Processes gecko trace data across multiple Firefox applications.
  repo: bigquery-etl
  schedule_interval: 0 9 * * *
  tags:
    - impact/tier_3
    - repo/bigquery-etl
44 changes: 44 additions & 0 deletions sql/mozfun/gecko_trace/build_root_span/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# gecko_trace.build_root_span

Builds a root span tree structure from an array of span objects.

## Signature

```sql
gecko_trace.build_root_span(spans ARRAY<JSON>) RETURNS JSON
```

## Arguments

- `spans`: Array of JSON objects representing individual spans. Each span should
contain at minimum:
- `span_id`: Unique identifier for the span
- `parent_span_id`: ID of the parent span (null for root spans)

## Description

Takes an array of JSON span objects and constructs a hierarchical tree structure
by linking spans with their parent-child relationships.

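If no explicit root span (a span without a `parent_span_id`) is found, the
function looks for a single "missing" root: a parent ID that other spans
reference but that has no span of its own in the input. That placeholder is
returned as the root and carries only `span_id`, `childSpans` and
`type: "missing"`. If there are multiple or no missing roots, an error is
thrown.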

## Returns

Returns a JSON object representing the root span with all child spans nested in
`childSpans` arrays throughout the tree structure.

## Example

```sql
SELECT gecko_trace.build_root_span([
JSON '{"span_id": "root", "parent_span_id": null, "name": "main_process"}',
JSON '{"span_id": "child1", "parent_span_id": "root", "name": "network_request"}',
JSON '{"span_id": "child2", "parent_span_id": "root", "name": "dom_parse"}',
JSON '{"span_id": "grandchild", "parent_span_id": "child1", "name": "dns_lookup"}'
])
```

This would return a tree structure where the root span contains two child spans
in its `childSpans` array, and one of those children has its own child span.
14 changes: 14 additions & 0 deletions sql/mozfun/gecko_trace/build_root_span/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
friendly_name: Gecko Trace Build Root Span
description: |-
Builds a root span tree structure from an array of span objects.
Takes an array of JSON span objects and constructs a hierarchical tree structure
by linking spans with their parent-child relationships. Returns the root span
with all child spans nested in a `childSpans` array property.
If no root span is found, the function will attempt to find a single "missing"
root span. If there are multiple or no missing roots, an error is thrown.
This function is used for processing Gecko trace data to reconstruct the
hierarchical structure of spans within a trace.
67 changes: 67 additions & 0 deletions sql/mozfun/gecko_trace/build_root_span/udf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
CREATE OR REPLACE FUNCTION gecko_trace.build_root_span(spans ARRAY<JSON>)
RETURNS JSON
LANGUAGE js AS r"""
  // Links the flat list of spans of one trace into a tree and returns its root.
  //
  // Input:   spans - array of span objects; each is read for `span_id` and
  //          `parent_span_id`. Every span object is mutated in place: it gains a
  //          `childSpans` array holding its direct children.
  // Returns: the root span object (the span without a `parent_span_id`), or,
  //          when no such span exists, a placeholder
  //          `{span_id, childSpans, type: "missing"}` for the single parent id
  //          that is referenced but absent. NULL for NULL/empty input.
  // Throws:  Error when there is no explicit root and the number of missing
  //          parents is not exactly one.

  // Nothing to link: return NULL instead of falling through to the
  // "expected exactly one missing root span" error below.
  if (spans == null || spans.length === 0) {
    return null;
  }

  const spansById = new Map();
  let rootSpanId;

  for (const span of spans) {
    const spanId = span.span_id;

    // Children may be listed before their parent. In that case a "missing"
    // placeholder is already registered under this id; adopt the children it
    // collected and let the real span replace it in the map.
    span.childSpans = spansById.get(spanId)?.childSpans ?? [];
    spansById.set(spanId, span);

    if (!span.parent_span_id) {
      // No parent id: this is the explicit root. If several spans qualify,
      // the last one in the input wins.
      rootSpanId = spanId;
      continue;
    }

    let parent = spansById.get(span.parent_span_id);
    if (!parent) {
      // Parent not seen (yet): register a placeholder that collects children.
      parent = {
        span_id: span.parent_span_id,
        childSpans: [],
        type: "missing",
      };
      spansById.set(span.parent_span_id, parent);
    }
    parent.childSpans.push(span);
  }

  if (!rootSpanId) {
    // No explicit root. Placeholders that were never replaced by a real span
    // are the parents missing from the input; exactly one of them can act as
    // the root of the tree.
    const missingRoots = Array.from(spansById.values()).filter(
      (span) => span.type === "missing",
    );
    if (missingRoots.length !== 1) {
      throw new Error(
        `Unable to construct span tree: expected exactly one missing root span, but found ${missingRoots.length}`,
      );
    }

    rootSpanId = missingRoots[0].span_id;
  }

  return spansById.get(rootSpanId);
""";

-- Tests
SELECT
  -- Test with simple parent-child relationship
  assert.not_null(
    gecko_trace.build_root_span(
      [
        JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}',
        JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}'
      ]
    )
  ),
  -- Test that the child is nested in the root's childSpans array
  assert.equals(
    "child1",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [
          JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}',
          JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}'
        ]
      ),
      "$.childSpans[0].span_id"
    )
  ),
  -- Test that a child listed before its parent is re-attached to the parent
  assert.equals(
    "child1",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [
          JSON '{"span_id": "child1", "parent_span_id": "root", "name": "child_span"}',
          JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}'
        ]
      ),
      "$.childSpans[0].span_id"
    )
  ),
  -- Test without explicit root: the single missing parent becomes the root
  assert.equals(
    "missing",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "child1", "parent_span_id": "gone", "name": "child_span"}']
      ),
      "$.type"
    )
  ),
  -- Test with empty array
  assert.null(gecko_trace.build_root_span([])),
  -- Test single span (should be root)
  assert.equals(
    "root",
    JSON_VALUE(
      gecko_trace.build_root_span(
        [JSON '{"span_id": "root", "parent_span_id": null, "name": "root_span"}']
      ),
      "$.span_id"
    )
  );
57 changes: 57 additions & 0 deletions sql/mozfun/gecko_trace/calculate_signature/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# gecko_trace.calculate_signature

Calculates a signature hash for a trace based on its root span structure.

## Signature

```sql
gecko_trace.calculate_signature(rootSpan JSON) RETURNS STRING
```

## Arguments

- `rootSpan`: JSON object representing the root span of a trace tree, typically
generated by `gecko_trace.build_root_span()`. Should contain:
- `name`: Span name
- `scope`: Object with `name` property
- `resource`: Object with `attributes` property
- `events`: Optional array of event objects with `name` and `attributes`
- `childSpans`: Array of child span objects with the same structure

## Description

Uses a fast hash function (cyrb64) to generate a deterministic signature based
on the hierarchical structure and attributes of spans in a trace. The signature
is calculated by traversing the span tree depth-first and hashing:

- Resource attributes (excluding certain internal IDs like
`gecko_process_internal_id`)
- Scope names
- Span names
- Event names and attributes

## Returns

Returns a string hash that serves as a deterministic signature for the trace
structure. Traces with identical signatures have the same execution pattern and
can be grouped together for analysis.

## Example

```sql
WITH root_span AS (
SELECT gecko_trace.build_root_span(spans_array) as root
FROM traces_table
WHERE trace_id = 'some_trace_id'
)
SELECT gecko_trace.calculate_signature(root) as signature
FROM root_span
```

## Notes

- Only the attributes listed in `ATTRS_TO_SKIP` (currently just
`gecko_process_internal_id`) are excluded from hashing, to focus on logical
execution patterns
- Used in conjunction with `gecko_trace.build_root_span()` for complete trace
analysis workflows
- Returns empty string for NULL input
15 changes: 15 additions & 0 deletions sql/mozfun/gecko_trace/calculate_signature/metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
friendly_name: Gecko Trace Calculate Signature
description: |-
Calculates a signature hash for a trace based on its root span structure.

Uses a fast hash function (cyrb64) to generate a deterministic signature
based on the hierarchical structure and attributes of spans in a trace.
The signature is calculated by traversing the span tree and hashing:
- Resource attributes (excluding certain internal IDs like gecko_process_internal_id)
- Scope names
- Span names
- Event names and attributes

The function returns a string hash that can be used to identify traces with
the same execution pattern.
84 changes: 84 additions & 0 deletions sql/mozfun/gecko_trace/calculate_signature/udf.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
CREATE OR REPLACE FUNCTION gecko_trace.calculate_signature(rootSpan JSON)
RETURNS STRING
LANGUAGE js AS r"""
  // Computes a deterministic signature for a span tree.
  //
  // Input:   rootSpan - root of a span tree. Each span is read for
  //          `resource.attributes`, `scope.name`, `name`, optional `events`
  //          (each with `name` and `attributes`) and `childSpans`.
  // Returns: a 14-character base-36 digest, or "" when rootSpan is NULL.
  //
  // Fields that are absent (e.g. on a `type: "missing"` placeholder root, which
  // only has span_id/childSpans/type) are tolerated: absent attribute maps and
  // child lists count as empty, absent names are hashed as "undefined".

  // cyrb53 (c) 2018 bryc (github.com/bryc). License: Public domain. Attribution appreciated.
  // A fast and simple 64-bit (or 53-bit) string hash function with decent collision resistance.
  // Largely inspired by MurmurHash2/3, but with a focus on speed/simplicity.
  // See https://stackoverflow.com/questions/7616461/generate-a-hash-from-string-in-javascript/52171480#52171480
  // https://github.com/bryc/code/blob/master/jshash/experimental/cyrb53.js
  const cyrb64 = (str, seed = 0) => {
    let h1 = 0xdeadbeef ^ seed;
    let h2 = 0x41c6ce57 ^ seed;
    for (let i = 0; i < str.length; i++) {
      const ch = str.charCodeAt(i);
      h1 = Math.imul(h1 ^ ch, 2654435761);
      h2 = Math.imul(h2 ^ ch, 1597334677);
    }
    h1 = Math.imul(h1 ^ (h1 >>> 16), 2246822507);
    h1 ^= Math.imul(h2 ^ (h2 >>> 13), 3266489909);
    h2 = Math.imul(h2 ^ (h2 >>> 16), 2246822507);
    h2 ^= Math.imul(h1 ^ (h1 >>> 13), 3266489909);
    // For a single 53-bit numeric return value we could return
    // 4294967296 * (2097151 & h2) + (h1 >>> 0);
    // but we instead return the full 64-bit value:
    return [h2 >>> 0, h1 >>> 0];
  };

  const seed = 0;
  let digest = "";

  // Folds one more value into the running digest: the previous digest is
  // prepended to the value before hashing, so the result depends on every
  // value hashed so far and on their order. Non-string values are coerced by
  // string concatenation (nested objects become "[object Object]").
  const hash = (str) => {
    const [h2, h1] = cyrb64(digest + str, seed);
    // Two unsigned 32-bit halves, each rendered as 7 zero-padded base-36 digits.
    digest =
      h2.toString(36).padStart(7, "0") + h1.toString(36).padStart(7, "0");
  };

  // A Set, not an object probed with `in`: `in` also matches inherited keys,
  // which would silently skip attributes named e.g. "constructor" or "toString".
  const ATTRS_TO_SKIP = new Set(["gecko_process_internal_id"]);

  // Hashes every key/value pair of an attribute map, except skipped keys.
  const hashAttrs = (attrs) => {
    for (const [key, value] of Object.entries(attrs ?? {})) {
      if (ATTRS_TO_SKIP.has(key)) continue;
      hash(key);
      hash(value);
    }
  };

  // Hashes the name and attributes of each event, in array order.
  const hashEvents = (events) => {
    for (const event of events) {
      hash(event.name);
      hashAttrs(event.attributes);
    }
  };

  // NULL input has nothing to traverse; the digest is still the empty string.
  if (rootSpan == null) {
    return digest;
  }

  // Iterative depth-first walk. Because spans are popped from the end of the
  // stack, siblings are visited last-to-first.
  const stack = [rootSpan];
  while (stack.length > 0) {
    const span = stack.pop();
    hashAttrs(span.resource?.attributes);
    hash(span.scope?.name);
    hash(span.name);
    if (span.events) {
      hashEvents(span.events);
    }
    stack.push(...(span.childSpans ?? []));
  }

  return digest;
""";

-- Tests
SELECT
  -- Test with simple root span
  assert.not_null(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that same input produces same signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that a different span name produces a different signature
  assert.true(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    ) != gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "other", "scope": {"name": "test_scope"}, "resource": {"attributes": {}}, "childSpans": []}'
    )
  ),
  -- Test that gecko_process_internal_id does not influence the signature
  assert.equals(
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "1"}}, "childSpans": []}'
    ),
    gecko_trace.calculate_signature(
      JSON '{"span_id": "root", "name": "test", "scope": {"name": "test_scope"}, "resource": {"attributes": {"gecko_process_internal_id": "2"}}, "childSpans": []}'
    )
  ),
  -- Test that null input returns empty string
  assert.equals("", gecko_trace.calculate_signature(NULL));
Loading