diff --git a/packages/elastic_agent/changelog.yml b/packages/elastic_agent/changelog.yml index 7d084910095..a5ac6a6f608 100644 --- a/packages/elastic_agent/changelog.yml +++ b/packages/elastic_agent/changelog.yml @@ -1,4 +1,18 @@ # newer versions go on top +- version: "2.6.8" + changes: + - description: Adds processor for health_status field to status change logs data stream + type: enhancement + link: https://github.com/elastic/integrations/pull/15852 + - description: Add new alerting rules for agent health status changes + type: enhancement + link: https://github.com/elastic/integrations/pull/15852 + - description: Use more specific index and remove RLIKE usage for system metrics alerting rules + type: enhancement + link: https://github.com/elastic/integrations/pull/15852 + - description: Use system.process.cpu.total.norm.pct for CPU usage alerting rule + type: bugfix + link: https://github.com/elastic/integrations/pull/15852 - version: "2.6.7" changes: - description: Add mapping for error fields for beats logs. 
diff --git a/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json b/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json new file mode 100644 index 00000000000..b7ea5bf51ca --- /dev/null +++ b/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json @@ -0,0 +1,116 @@ +{ + "events": [ + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "online", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "offline", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "error", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "degraded", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + 
"agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "updating", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "enrolling", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "unenrolling", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + } + ] +} diff --git a/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json-expected.json b/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json-expected.json new file mode 100644 index 00000000000..1443ecb9153 --- /dev/null +++ b/packages/elastic_agent/data_stream/status_change_logs/_dev/test/pipeline/test-health-status.json-expected.json @@ -0,0 +1,123 @@ +{ + "expected": [ + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "online", + "health_status": "healthy", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": 
"offline", + "health_status": "offline", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "error", + "health_status": "unhealthy", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "degraded", + "health_status": "unhealthy", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "updating", + "health_status": "updating", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "enrolling", + "health_status": "updating", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + }, + { + "@timestamp": "2024-01-15T10:30:00.000Z", + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "unenrolling", + "health_status": "updating", + "policy_id": 
"test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" + } + ] +} diff --git a/packages/elastic_agent/data_stream/status_change_logs/elasticsearch/ingest_pipeline/default.yml b/packages/elastic_agent/data_stream/status_change_logs/elasticsearch/ingest_pipeline/default.yml new file mode 100644 index 00000000000..72e0e4ffac7 --- /dev/null +++ b/packages/elastic_agent/data_stream/status_change_logs/elasticsearch/ingest_pipeline/default.yml @@ -0,0 +1,23 @@ +--- +description: Pipeline for Elastic Agent status change logs. +processors: + - script: + description: Derive health_status from status field + if: ctx.status != null + lang: painless + source: | + String status = ctx.status; + String healthStatus; + + if (status == 'online') { + healthStatus = 'healthy'; + } else if (status == 'error' || status == 'degraded') { + healthStatus = 'unhealthy'; + } else if (status == 'updating' || status == 'enrolling' || status == 'unenrolling') { + healthStatus = 'updating'; + } else { + healthStatus = status; + } + + ctx.health_status = healthStatus; + ignore_failure: true diff --git a/packages/elastic_agent/data_stream/status_change_logs/fields/fields.yml b/packages/elastic_agent/data_stream/status_change_logs/fields/fields.yml index 61d481e7882..c09196442a8 100644 --- a/packages/elastic_agent/data_stream/status_change_logs/fields/fields.yml +++ b/packages/elastic_agent/data_stream/status_change_logs/fields/fields.yml @@ -1,5 +1,7 @@ - name: status type: keyword +- name: health_status + type: keyword - name: policy_id type: keyword - name: agentless diff --git a/packages/elastic_agent/data_stream/status_change_logs/sample_event.json b/packages/elastic_agent/data_stream/status_change_logs/sample_event.json index 1740962b715..b611288904b 100644 --- a/packages/elastic_agent/data_stream/status_change_logs/sample_event.json +++ b/packages/elastic_agent/data_stream/status_change_logs/sample_event.json @@ -1,16 +1,17 @@ { - "@timestamp": 1576280412771, 
- "data_stream": { - "type": "logs", - "dataset": "elastic_agent.status_change", - "namespace": "default" - }, - "agent": { - "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" - }, - "status": "HEALTHY", - "policy_id": "test-policy", - "agentless": false, - "space_id": "default", - "hostname": "test-host" -} \ No newline at end of file + "@timestamp": 1576280412771, + "data_stream": { + "type": "logs", + "dataset": "elastic_agent.status_change", + "namespace": "default" + }, + "agent": { + "id": "f2b3c4d5-e6f7-8a9b-b0c1-d2e3f4g5h6i7" + }, + "status": "online", + "health_status": "healthy", + "policy_id": "test-policy", + "agentless": false, + "space_id": "default", + "hostname": "test-host" +} diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-cpu-usage-spike-rule.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-cpu-usage-spike-rule.json index 57151899cba..2072928d652 100644 --- a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-cpu-usage-spike-rule.json +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-cpu-usage-spike-rule.json @@ -16,7 +16,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurences\n| WHERE count_above_threshold >= 5" + "esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS cpu_process_pct = MAX(system.process.cpu.total.norm.pct) * 100\n BY 
elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n// Count the 1 minute timebuckets that are above 80% by process and agent\n| WHERE cpu_process_pct >= 80\n| STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n// Alert if there are 5 or more occurrences\n| WHERE count_above_threshold >= 5" }, "aggType": "count", "groupBy": "row", diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-memory-usage-rule.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-memory-usage-rule.json index 88dc71e7e97..2bc6aaa566d 100644 --- a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-memory-usage-rule.json +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-memory-usage-rule.json @@ -16,7 +16,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50" + "esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50" }, "aggType": "count", "groupBy": "row", diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-restarts.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-restarts.json index 129bcfeb6ef..5a407e18879 100644 --- 
a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-restarts.json +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-excessive-restarts.json @@ -16,7 +16,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM metrics-*, *:metrics-*\n| WHERE process.executable RLIKE \".*[Ee]lastic.*[Aa]gent.*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name" + "esql": "FROM metrics-system*, *:metrics-system*\n| WHERE TO_LOWER(process.executable) LIKE \"*elastic*agent*\" AND agent.name NOT LIKE \"*agentless*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY host.name, process.name, bucket(@timestamp,5 minute) \n| WHERE restart_count > 10\n| STATS MAX(restart_count) BY host.name, process.name" }, "aggType": "count", "groupBy": "row", diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-offline-status.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-offline-status.json new file mode 100644 index 00000000000..b886101ae55 --- /dev/null +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-offline-status.json @@ -0,0 +1,34 @@ +{ + "id": "elastic-agent-offline-status", + "type": "alerting_rule_template", + "attributes": { + "name": "[Elastic Agent] Offline status", + "tags": ["Elastic Agent"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "threshold": [0], + "thresholdComparator": ">", + "size": 100, + "esqlQuery": { + "esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == 
\"offline\"" + }, + "aggType": "count", + "groupBy": "row", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unenrolled-status.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unenrolled-status.json new file mode 100644 index 00000000000..1fabb74f27f --- /dev/null +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unenrolled-status.json @@ -0,0 +1,34 @@ +{ + "id": "elastic-agent-unenrolled-status", + "type": "alerting_rule_template", + "attributes": { + "name": "[Elastic Agent] Unenrolled status", + "tags": ["Elastic Agent"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "threshold": [0], + "thresholdComparator": ">", + "size": 100, + "esqlQuery": { + "esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == \"unenrolled\"" + }, + "aggType": "count", + "groupBy": "row", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unhealthy-status.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unhealthy-status.json index c817436f589..a3f00eb35aa 100644 --- a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unhealthy-status.json +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-unhealthy-status.json @@ 
-16,7 +16,7 @@ "thresholdComparator": ">", "size": 100, "esqlQuery": { - "esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and status in (\"error\", \"degraded\")" + "esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == \"unhealthy\"" }, "aggType": "count", "groupBy": "row", diff --git a/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-uninstalled-status.json b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-uninstalled-status.json new file mode 100644 index 00000000000..665537ba5f9 --- /dev/null +++ b/packages/elastic_agent/kibana/alerting_rule_template/elastic-agent-uninstalled-status.json @@ -0,0 +1,34 @@ +{ + "id": "elastic-agent-uninstalled-status", + "type": "alerting_rule_template", + "attributes": { + "name": "[Elastic Agent] Uninstalled status", + "tags": ["Elastic Agent"], + "ruleTypeId": ".es-query", + "schedule": { + "interval": "1m" + }, + "params": { + "searchType": "esqlQuery", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "threshold": [0], + "thresholdComparator": ">", + "size": 100, + "esqlQuery": { + "esql": "FROM logs-elastic_agent.status_change-default, *:logs-elastic_agent.status_change-default\n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and health_status == \"uninstalled\"" + }, + "aggType": "count", + "groupBy": "row", + "termSize": 5, + "sourceFields": [], + "timeField": "@timestamp", + "excludeHitsFromPreviousRun": true + }, + "alertDelay": { + "active": 1 + } + }, + "coreMigrationVersion": "8.8.0", + "typeMigrationVersion": "10.1.0" +} diff --git a/packages/elastic_agent/manifest.yml b/packages/elastic_agent/manifest.yml index aa26ef89c32..42c198f06ce 100644 --- 
a/packages/elastic_agent/manifest.yml +++ b/packages/elastic_agent/manifest.yml @@ -1,6 +1,6 @@ name: elastic_agent title: Elastic Agent -version: 2.6.7 +version: 2.6.8 description: Collect logs and metrics from Elastic Agents. type: integration format_version: 3.5.0