Skip to content
This repository was archived by the owner on Dec 21, 2021. It is now read-only.

Commit 21ecca5

Browse files
authored
Adds monitoring of systemd units (#53)
Adds monitoring of systemd units so that failures are noticed. During startup, every unit is checked once per second for 10 seconds; after that, it is considered "running". Once a unit is considered "running", its state is checked once every 10 seconds. This is a simpler version of what we want to provide in the final implementation, where we will listen for changes on D-Bus instead of polling periodically. Fixes #10.
1 parent 657a038 commit 21ecca5

File tree

4 files changed

+111
-15
lines changed

4 files changed

+111
-15
lines changed

Cargo.lock

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/provider/states/running.rs

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
1+
use anyhow::anyhow;
12
use k8s_openapi::api::core::v1::{
23
ContainerState, ContainerStateRunning, ContainerStatus as KubeContainerStatus, PodCondition,
34
};
45
use kubelet::pod::Pod;
56
use kubelet::state::prelude::*;
67
use kubelet::state::{State, Transition};
7-
use log::{debug, trace};
8+
use log::{debug, info, trace};
89

910
use crate::provider::states::failed::Failed;
1011
use crate::provider::states::installing::Installing;
1112
use crate::provider::states::make_status_with_containers_and_condition;
1213
use crate::provider::PodState;
1314
use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time;
1415
use k8s_openapi::chrono;
16+
use tokio::time::Duration;
1517

1618
#[derive(Debug, TransitionTo)]
1719
#[transition_to(Failed, Running, Installing)]
@@ -34,17 +36,57 @@ impl State<PodState> for Running {
3436
pod_state: &mut PodState,
3537
_pod: &Pod,
3638
) -> Transition<PodState> {
39+
// We loop here indefinitely and "wake up" periodically to check if the service is still
40+
// up and running
41+
// Interruption of this loop is triggered externally by the Krustlet code when
42+
// - the pod which this state machine refers to gets deleted
43+
// - Krustlet shuts down
3744
loop {
38-
tokio::select! {
39-
_ = tokio::time::delay_for(std::time::Duration::from_secs(10)) => {
40-
trace!("Checking if service {} is still running.", &pod_state.service_name);
45+
tokio::time::delay_for(Duration::from_secs(10)).await;
46+
trace!(
47+
"Checking if service {} is still running.",
48+
&pod_state.service_name
49+
);
50+
51+
// Iterate over all units and check their state
52+
// if the [`service_units`] Option is a None variant, return a failed state
53+
// as we need to run something otherwise we are not doing anything
54+
let systemd_units = match &pod_state.service_units {
55+
Some(units) => units,
56+
None => return Transition::Complete(Err(anyhow!(format!("No systemd units found for service [{}], this should not happen, please report a bug for this!", pod_state.service_name)))),
57+
};
58+
59+
for unit in systemd_units {
60+
match pod_state.systemd_manager.is_running(&unit.get_name()) {
61+
Ok(true) => trace!(
62+
"Unit [{}] of service [{}] still running ...",
63+
&unit.get_name(),
64+
pod_state.service_name
65+
),
66+
Ok(false) => {
67+
info!("Unit [{}] for service [{}] failed unexpectedly, transitioning to failed state.", pod_state.service_name, unit.get_name());
68+
return Transition::next(
69+
self,
70+
Failed {
71+
message: "".to_string(),
72+
},
73+
);
74+
}
75+
Err(dbus_error) => {
76+
info!(
77+
"Error querying ActiveState for Unit [{}] of service [{}]: [{}].",
78+
pod_state.service_name,
79+
unit.get_name(),
80+
dbus_error
81+
);
82+
return Transition::Complete(Err(dbus_error));
83+
}
4184
}
4285
}
43-
// TODO: We are not watching the service yet, need to subscribe to events and
44-
// react to those
4586
}
4687
}
4788

89+
// test
4890
async fn json_status(
4991
&self,
5092
pod_state: &mut PodState,

src/provider/states/starting.rs

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,10 @@ use crate::provider::states::failed::Failed;
66
use crate::provider::states::running::Running;
77
use crate::provider::states::setup_failed::SetupFailed;
88
use crate::provider::PodState;
9-
use log::{error, info, warn};
9+
use anyhow::anyhow;
10+
use log::{debug, error, info, warn};
11+
use std::time::Instant;
12+
use tokio::time::Duration;
1013

1114
#[derive(Default, Debug, TransitionTo)]
1215
#[transition_to(Running, Failed, SetupFailed)]
@@ -36,6 +39,37 @@ impl State<PodState> for Starting {
3639
);
3740
return Transition::Complete(Err(enable_error));
3841
}
42+
43+
let start_time = Instant::now();
44+
// TODO: does this need to be configurable, or are we happy with a hard-coded value
45+
// for now. I've briefly looked at the podspec and couldn't identify a good field
46+
// to use for this - also, currently this starts containers (= systemd units) in
47+
// order and waits 10 seconds for every unit, so a service with five containers
48+
// would take 50 seconds until it reported running - which is totally fine in case
49+
// the units actually depend on each other, but a case could be made for waiting
50+
// once at the end
51+
while start_time.elapsed().as_secs() < 10 {
52+
tokio::time::delay_for(Duration::from_secs(1)).await;
53+
debug!(
54+
"Checking if unit [{}] is still up and running.",
55+
&unit.get_name()
56+
);
57+
match pod_state.systemd_manager.is_running(&unit.get_name()) {
58+
Ok(true) => debug!(
59+
"Service [{}] still running after [{}] seconds",
60+
&unit.get_name(),
61+
start_time.elapsed().as_secs()
62+
),
63+
Ok(false) => {
64+
return Transition::Complete(Err(anyhow!(format!(
65+
"Unit [{}] stopped unexpectedly during startup after [{}] seconds.",
66+
&unit.get_name(),
67+
start_time.elapsed().as_secs()
68+
))))
69+
}
70+
Err(dbus_error) => return Transition::Complete(Err(dbus_error)),
71+
}
72+
}
3973
}
4074
} else {
4175
warn!(

src/provider/systemdmanager/manager.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
//!
66
use crate::provider::systemdmanager::systemdunit::SystemDUnit;
77
use anyhow::anyhow;
8-
use dbus::arg::{AppendAll, ReadAll};
8+
use dbus::arg::{AppendAll, ReadAll, Variant};
99
use dbus::blocking::SyncConnection;
1010
use dbus::strings::Member;
1111
use dbus::Path;
@@ -25,6 +25,7 @@ pub enum UnitTypes {
2525
const SYSTEMD_DESTINATION: &str = "org.freedesktop.systemd1";
2626
const SYSTEMD_NODE: &str = "/org/freedesktop/systemd1";
2727
const SYSTEMD_MANAGER_INTERFACE: &str = "org.freedesktop.systemd1.Manager";
28+
const DBUS_PROPERTIES_INTERFACE: &str = "org.freedesktop.DBus.Properties";
2829

2930
/// The main way of interacting with this module, this struct offers
3031
/// the public methods for managing service units.
@@ -171,10 +172,7 @@ impl SystemdManager {
171172
}
172173

173174
let unit_file = self.units_directory.join(&unit_name);
174-
if linked_unit_file
175-
&& unit_file.exists()
176-
&& unit_file.symlink_metadata()?.file_type().is_file()
177-
{
175+
if unit_file.exists() && unit_file.symlink_metadata()?.file_type().is_file() {
178176
// Handle the special case where we need to replace an actual file with a symlink
179177
// This only occurs when switching from writing the file
180178
// directly into the units folder to using a linked file - should not happen in practice
@@ -357,6 +355,28 @@ impl SystemdManager {
357355
}
358356
}
359357

358+
/// Reports whether the given systemd unit is currently in the `active` state.
///
/// The unit's object path is looked up with the manager's `GetUnit` call, and the
/// `ActiveState` property of the `org.freedesktop.systemd1.Unit` interface is then read
/// through the `org.freedesktop.DBus.Properties` interface on that path.
///
/// Returns `Ok(true)` only when the property value equals `"active"`; any other value
/// yields `Ok(false)`.
///
/// # Errors
///
/// Propagates the error from either D-Bus call (`GetUnit` or the property `Get`).
pub fn is_running(&self, unit: &str) -> Result<bool, anyhow::Error> {
359+
let unit_node = self
360+
.method_call("GetUnit", (&unit,))
361+
.map(|r: (Path,)| r.0)?;
362+
363+
let proxy = self
364+
.connection
365+
.with_proxy(SYSTEMD_DESTINATION, &unit_node, self.timeout);
366+
367+
let active_state = proxy
368+
.method_call(
369+
DBUS_PROPERTIES_INTERFACE,
370+
"Get",
371+
("org.freedesktop.systemd1.Unit", "ActiveState"),
372+
)
373+
.map(|r: (Variant<String>,)| r.0)?;
374+
375+
// TODO: I think this can panic, there should be a get() method on Variant that returns
376+
// an option, but I've not yet been able to get that to work
377+
Ok(active_state.0 == "active")
378+
}
379+
360380
// Symlink a unit file into the systemd unit folder
361381
// This is not public on purpose, as [create] should be the normal way to link unit files
362382
// when using this crate

0 commit comments

Comments
 (0)