Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions packages/elastic_agent/changelog.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,18 @@
# newer versions go on top
- version: "2.6.8"
changes:
- description: Add processor that derives the health_status field in the status change logs data stream
type: enhancement
link: https://github.com/elastic/integrations/pull/15852
- description: Add new alerting rules for agent health status changes
type: enhancement
link: https://github.com/elastic/integrations/pull/15852
- description: Use a more specific index pattern and remove RLIKE usage from system metrics alerting rules
type: enhancement
link: https://github.com/elastic/integrations/pull/15852
- description: Use system.process.cpu.total.norm.pct for CPU usage alerting rule
type: bugfix
link: https://github.com/elastic/integrations/pull/15852
- version: "2.6.7"
changes:
- description: Add mapping for error fields for beats logs.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"events": [
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "online",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "offline",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "error",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "degraded",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "updating",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "enrolling",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "unenrolling",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
{
"expected": [
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "online",
"health_status": "healthy",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "offline",
"health_status": "offline",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "error",
"health_status": "unhealthy",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "degraded",
"health_status": "unhealthy",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "updating",
"health_status": "updating",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "enrolling",
"health_status": "updating",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
},
{
"@timestamp": "2024-01-15T10:30:00.000Z",
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "unenrolling",
"health_status": "updating",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
---
description: Pipeline for Elastic Agent status change logs.
processors:
  # Adds `health_status`, a coarser grouping of the raw `status` value.
  # Statuses that have no entry in `health_by_status` (for example `offline`)
  # are copied through unchanged. Documents without a `status` are skipped.
  - script:
      description: Derive health_status from status field
      if: ctx.status != null
      lang: painless
      params:
        # Lookup table: raw status -> derived health status.
        health_by_status:
          online: healthy
          error: unhealthy
          degraded: unhealthy
          updating: updating
          enrolling: updating
          unenrolling: updating
      source: |
        String status = ctx.status;
        // Fall back to the raw status when the table has no mapping for it.
        ctx.health_status = params.health_by_status.getOrDefault(status, status);
      # A failure in this processor is ignored instead of failing the document.
      ignore_failure: true
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
- name: status
type: keyword
- name: health_status
type: keyword
- name: policy_id
type: keyword
- name: agentless
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
{
"@timestamp": 1576280412771,
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "HEALTHY",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
}
"@timestamp": 1576280412771,
"data_stream": {
"type": "logs",
"dataset": "elastic_agent.status_change",
"namespace": "default"
},
"agent": {
"id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7"
},
"status": "online",
"health_status": "healthy",
"policy_id": "test-policy",
"agentless": false,
"space_id": "default",
"hostname": "test-host"
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurences\n| WHERE count_above_threshold >= 5"
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.norm.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurences\n| WHERE count_above_threshold >= 5"
},
"aggType": "count",
"groupBy": "row",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
},
"aggType": "count",
"groupBy": "row",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name"
"esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name"
},
"aggType": "count",
"groupBy": "row",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-offline-status",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Offline status",
"tags": ["Elastic Agent"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == \"offline\""
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Loading