3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -89,6 +89,9 @@ instead `badarg`.
- Fixed a bug where an empty atom could not be created on some platforms, which broke receiving a message for a registered process from an OTP node.
- Fix a memory leak in distribution when a BEAM node would monitor a process by name.
- Fix `list_to_integer`; it was likely buggy with integers close to INT64_MAX
- Added missing support for supervisor `one_for_all` strategy.
- Supervisor now honors period and intensity options.
- Fix supervisor crash if a child fails to restart.

## [0.6.7] - Unreleased

249 changes: 232 additions & 17 deletions libs/estdlib/src/supervisor.erl
@@ -88,13 +88,28 @@
start :: {module(), atom(), [any()] | undefined},
restart :: restart(),
shutdown :: shutdown(),
type :: child_type,
type :: child_type(),
modules = [] :: [module()] | dynamic
}).
-record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}).
-record(state, {
restart_strategy = one_for_one :: strategy(),
intensity = 3 :: non_neg_integer(),
period = 5 :: pos_integer(),
restart_count = 0 :: non_neg_integer(),
restarts = [] :: [integer()],
children = [] :: [#child{}]
}).

%% Used to trim stale restarts when the 'intensity' value is large.
%% This is the number of restarts before triggering a purge of restarts
%% older than 'period', so stale restarts do not keep consuming RAM on
%% MCUs with limited memory. In the future a function could be used to
%% set a sane default for the platform (OTP uses 1000).
-define(STALE_RESTART_LIMIT, 100).

start_link(Module, Args) ->
gen_server:start_link(?MODULE, {Module, Args}, []).

start_link(SupName, Module, Args) ->
gen_server:start_link(SupName, ?MODULE, {Module, Args}, []).

@@ -119,16 +134,27 @@ count_children(Supervisor) ->
init({Mod, Args}) ->
erlang:process_flag(trap_exit, true),
case Mod:init(Args) of
{ok, {{Strategy, _Intensity, _Period}, StartSpec}} ->
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
{ok, {{Strategy, Intensity, Period}, StartSpec}} ->
State = init_state(StartSpec, #state{
restart_strategy = Strategy,
intensity = Intensity,
period = Period
}),
NewChildren = start_children(State#state.children, []),
{ok, State#state{children = NewChildren}};
{ok, {#{strategy := Strategy}, StartSpec}} ->
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
{ok, {#{} = SupSpec, StartSpec}} ->
Strategy = maps:get(strategy, SupSpec, one_for_one),
Intensity = maps:get(intensity, SupSpec, 3),
Period = maps:get(period, SupSpec, 5),
State = init_state(StartSpec, #state{
restart_strategy = Strategy,
intensity = Intensity,
period = Period
}),
NewChildren = start_children(State#state.children, []),
{ok, State#state{children = NewChildren}};
Error ->
{stop, {bad_return, {mod, init, Error}}}
{stop, {bad_return, {Mod, init, Error}}}
end.

-spec child_spec_to_record(child_spec()) -> #child{}.
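
For context, the tuple and map forms accepted in `init/1` above follow the standard OTP `supervisor:init/1` contract. A minimal callback module exercising the map form, with intensity and period set explicitly, might look like this (module and child names are illustrative, not part of this PR):

```erlang
-module(demo_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

init([]) ->
    %% Allow at most 3 restarts within any 5-second window before giving up.
    SupFlags = #{strategy => one_for_all, intensity => 3, period => 5},
    ChildSpecs = [
        #{id => worker_a, start => {worker_a, start_link, []}},
        #{id => worker_b, start => {worker_b, start_link, []}}
    ],
    {ok, {SupFlags, ChildSpecs}}.
```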
@@ -172,7 +198,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) ->
init_state([ChildSpec | T], State) ->
Child = child_spec_to_record(ChildSpec),
NewChildren = [Child | State#state.children],
init_state(T, #state{children = NewChildren});
init_state(T, State#state{children = NewChildren});
init_state([], State) ->
State#state{children = lists:reverse(State#state.children)}.

Expand All @@ -184,7 +210,50 @@ start_children([Child | T], StartedC) ->
start_children([], StartedC) ->
StartedC.

restart_child(Pid, Reason, State) ->
restart_child(Pid, Reason, #state{restart_strategy = one_for_one} = State) ->

Review comment (Collaborator): This function's name is confusing. Can we call it something like `handle_child_exit` instead, so we won't confuse it with `restart_child/2`? Also, we should check the `restart_strategy` only in the case where the child is not being terminated, to avoid duplication.

Reply (Collaborator Author): Yes, I found the name confusing too, since `terminate/2` was added to main. I think making it clear that this is the handler for child exits is a very good idea.

(A rough sketch of that restructuring follows the one_for_one clause below.)

case lists:keyfind(Pid, #child.pid, State#state.children) of
false ->
{ok, State};
#child{restart = {terminating, temporary, From}} ->
gen_server:reply(From, ok),
NewChildren = lists:keydelete(Pid, #child.pid, State#state.children),
{ok, State#state{children = NewChildren}};
#child{restart = {terminating, Restart, From}} = Child ->
gen_server:reply(From, ok),
NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
pid = undefined, restart = Restart
}),
{ok, State#state{children = NewChildren}};
#child{} = Child ->
case should_restart(Reason, Child#child.restart) of
true ->
case add_restart(State) of
{ok, State1} ->
case try_start(Child) of
{ok, NewPid, _Result} ->
NewChild = Child#child{pid = NewPid},
Children = lists:keyreplace(
Pid, #child.pid, State1#state.children, NewChild
),
{ok, State1#state{children = Children}};
{error, _} ->
erlang:send_after(
50, self(), {try_again_restart, Child#child.id}
),
{ok, State1}
end;
{shutdown, State1} ->
RemainingChildren = lists:keydelete(
Pid, #child.pid, State#state.children
),
{shutdown, State1#state{children = RemainingChildren}}
end;
false ->
Children = lists:keydelete(Pid, #child.pid, State#state.children),
{ok, State#state{children = Children}}
end
end;
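
A rough sketch of the restructuring suggested in the thread above: dispatch on whether the child is being terminated before consulting the restart strategy, so the terminating cases are not duplicated per strategy. The name `handle_child_exit` is the reviewer's proposal, and `handle_terminating_child/3` and `handle_strategy/5` are hypothetical helpers, not functions from this PR:

```erlang
handle_child_exit(Pid, Reason, State) ->
    case lists:keyfind(Pid, #child.pid, State#state.children) of
        false ->
            {ok, State};
        #child{restart = {terminating, _Restart, _From}} = Child ->
            %% A child we are deliberately terminating is handled the same
            %% way regardless of the supervisor's restart strategy.
            handle_terminating_child(Pid, Child, State);
        #child{} = Child ->
            %% Only an unexpected exit needs to consult the strategy.
            handle_strategy(State#state.restart_strategy, Pid, Reason, Child, State)
    end.
```
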
restart_child(Pid, Reason, #state{restart_strategy = one_for_all} = State) ->
case lists:keyfind(Pid, #child.pid, State#state.children) of
false ->
{ok, State};
@@ -201,13 +270,21 @@ restart_child(Pid, Reason, State) ->
#child{} = Child ->
case should_restart(Reason, Child#child.restart) of
true ->
case try_start(Child) of
{ok, NewPid, _Result} ->
NewChild = Child#child{pid = NewPid},
Children = lists:keyreplace(
Pid, #child.pid, State#state.children, NewChild
case add_restart(State) of
{ok, State1} ->
Siblings = lists:keydelete(Pid, #child.pid, State#state.children),
case restart_many_children(Child, Siblings) of
{ok, NewChildren} ->
{ok, State1#state{children = NewChildren}};
{ok, NewChildren, RetryID} ->
erlang:send_after(50, self(), {try_again_restart, RetryID}),

Review comment (Collaborator): It's a different logic, but I would favor a timeout return for the gen_server over `send_after/3`, with a queue of restarts we need to tackle.

Reply (Collaborator Author): A timeout does make more sense here; that is closer to what I originally envisioned. I ended up implementing most of my changes differently internally than OTP does, mostly for the sake of simplicity and/or a smaller memory footprint; I just wanted the behaviors to be the same for the user. OTP does not use any delay between restart attempts. Part of the reason I used a send_after was to give a slight delay that might allow a gc or other external change to improve the chances of a successful restart of the child, but maybe we should just immediately (re)queue all restart attempts.

Reply (Collaborator Author): Sadly, our current gen_server implementation does not support `{ok, State, {timeout, Time, Message}}` returns from callbacks. I was thinking that adding hibernation support might be nice (especially for the benefit of supervisors), but I think supporting timeout actions in returns is even more important.

Reply (Collaborator): I didn't know about the timeout tuple, which didn't exist when I learnt about gen_server. We should indeed implement it, and meanwhile we could use send_after in this PR unless the timeout tuple is merged first.

Reply (Collaborator Author): It looks perfect for this situation, but I am struggling to get a test to pass on OTP using it. I did all of this work with test-driven development, writing the test for OTP first and then making sure my implementation passes as expected. It looks like a fairly simple improvement, once I understand how it's supposed to work ;-)

(A sketch of the timeout-return alternative follows this hunk.)

{ok, State1#state{children = NewChildren}}
end;
{shutdown, State1} ->
RemainingChildren = lists:keydelete(
Pid, #child.pid, State#state.children
),
{ok, State#state{children = Children}}
{shutdown, State1#state{children = RemainingChildren}}
end;
false ->
Children = lists:keydelete(Pid, #child.pid, State#state.children),
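
For reference on the thread above: stock `gen_server` already supports a plain integer timeout in callback returns; `{noreply, State, Timeout}` delivers the atom `timeout` to `handle_info/2` if no other message arrives within `Timeout` milliseconds. A minimal sketch of how the retry-queue idea could lean on that, where `pending_restarts` and `retry_one/2` are hypothetical and not part of this PR:

```erlang
%% Sketch only. Caveat: any other incoming message cancels a pending
%% inactivity timeout, so every callback return must re-arm it (return the
%% third element again) while the retry queue is non-empty.
handle_info(timeout, #state{pending_restarts = [Id | Rest]} = State) ->
    State1 = retry_one(Id, State#state{pending_restarts = Rest}),
    case State1#state.pending_restarts of
        [] -> {noreply, State1};
        _ -> {noreply, State1, 50}
    end;
handle_info(timeout, State) ->
    %% Spurious timeout with nothing queued: ignore it.
    {noreply, State}.
```
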
@@ -309,6 +386,26 @@ handle_info({ensure_killed, Pid}, State) ->
exit(Pid, kill),
{noreply, State}
end;
handle_info({try_again_restart, Id}, State) ->
Child = lists:keyfind(Id, #child.id, State#state.children),

Review comment (Collaborator): The child can be gone here.

(A guarded variant of this handler is sketched after the handle_info clauses below.)

case add_restart(State) of
{ok, State1} ->
case try_start(Child) of
{ok, NewPid, _Result} ->
UpdatedChildren = lists:keyreplace(
Id, #child.id, State1#state.children, Child#child{pid = NewPid}
),
{noreply, State1#state{children = UpdatedChildren}};
{error, {_, _}} ->
erlang:send_after(50, self(), {try_again_restart, Id}),
{noreply, State1}
end;
{shutdown, State1} ->
RemainingChildren = lists:keydelete(
Id, #child.id, State1#state.children
),
{stop, shutdown, State1#state{children = RemainingChildren}}
end;
handle_info(_Msg, State) ->
%TODO: log unexpected message
{noreply, State}.
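
The thread above points out that the child looked up in the `try_again_restart` handler can already be gone when the retry message arrives. A hedged sketch of that handler with the missing-child guard added, otherwise following the logic in this hunk:

```erlang
handle_info({try_again_restart, Id}, State) ->
    case lists:keyfind(Id, #child.id, State#state.children) of
        false ->
            %% The child was removed or terminated before the retry fired.
            {noreply, State};
        #child{} = Child ->
            case add_restart(State) of
                {ok, State1} ->
                    case try_start(Child) of
                        {ok, NewPid, _Result} ->
                            UpdatedChildren = lists:keyreplace(
                                Id, #child.id, State1#state.children, Child#child{pid = NewPid}
                            ),
                            {noreply, State1#state{children = UpdatedChildren}};
                        {error, _} ->
                            erlang:send_after(50, self(), {try_again_restart, Id}),
                            {noreply, State1}
                    end;
                {shutdown, State1} ->
                    RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children),
                    {stop, shutdown, State1#state{children = RemainingChildren}}
            end
    end;
```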
@@ -321,9 +418,16 @@ terminate(_Reason, #state{children = Children} = State) ->

loop_terminate([#child{pid = undefined} | Tail], AccRemaining) ->
loop_terminate(Tail, AccRemaining);
loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) ->

Review comment (Collaborator): If we're using restarting in the pid, we could also put terminating there as well (?). Also, we should type this.

Reply (Collaborator Author): I did not see anywhere that OTP was using {terminating, pid()}, but for our implementation we might want that for internal tracking when children are using a timeout shutdown. It wouldn't make sense for brutal_kill, since those should have pid set to undefined (or the child removed entirely) immediately. Also, yes, it looks like I forgot to type the pid in the child record, and a few of the added supervisor record fields too.

Reply (Collaborator Author): Regarding "we might want that for internal tracking when children are using timeout shutdown": actually this wouldn't make sense either; we are already tracking this in #child.restart.

loop_terminate(Tail, AccRemaining);
loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) ->
do_terminate(Child),
loop_terminate(Tail, [Pid | AccRemaining]);
case is_process_alive(Pid) of

Review comment (Collaborator): I'm not sure about this optimization. The process could be gone between here and the exit message, and it's not worth it unless we need to ensure it's gone.

Reply (Collaborator Author): At some point we do need to ensure it's gone, or we can get stuck indefinitely in loop_wait_termination/1. I encountered this while testing after adding intensity/period support. It is possible to get stuck in loop_wait_termination, preventing the supervisor from shutting down after the maximum intensity has been reached.

Reply (Collaborator Author, @UncleGrumpy, Nov 7, 2025): It's a little expensive, but because this only happens during shutdown it shouldn't matter. It would not be a terrible idea to iterate through the pids and be sure they are still alive before each tail call of loop_wait_termination.

Reply (Collaborator): What I meant is: if the child is gone, we must have received the DOWN message, so we don't need to query the process table to see if it's still there.

(A sketch of such an alive check follows the loop_terminate clauses below.)

true ->
do_terminate(Child),
loop_terminate(Tail, [Pid | AccRemaining]);
false ->
loop_terminate(Tail, AccRemaining)
end;
loop_terminate([], AccRemaining) ->
AccRemaining.
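
`loop_wait_termination/1` is not shown in this diff, so the following is only a sketch under the assumption that it waits for `'EXIT'` messages from the pid list accumulated above (the supervisor traps exits). It illustrates the periodic alive check the author describes; the reviewer's point that a received exit already removes the pid still stands:

```erlang
loop_wait_termination([]) ->
    ok;
loop_wait_termination(Pids) ->
    receive
        {'EXIT', Pid, _Reason} ->
            loop_wait_termination(lists:delete(Pid, Pids))
    after 1000 ->
        %% Drop pids that died without us consuming their exit, so a missed
        %% message cannot leave the supervisor stuck here forever.
        loop_wait_termination([P || P <- Pids, is_process_alive(P)])
    end.
```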

@@ -364,6 +468,117 @@ try_start(#child{start = {M, F, Args}} = Record) ->
{error, {{'EXIT', Error}, Record}}
end.

add_restart(
#state{
intensity = Intensity, period = Period, restart_count = RestartCount, restarts = Restarts
} = State
) ->
Now = erlang:monotonic_time(millisecond),
Threshold = Now - Period * 1000,
case can_restart(Intensity, Threshold, Restarts, RestartCount) of
{true, RestartCount1, Restarts1} ->
{ok, State#state{
restarts = Restarts1 ++ [Now], restart_count = RestartCount1 + 1
}};
{false, RestartCount1, Restarts1} ->
% TODO: log supervisor shutdown due to maximum intensity exceeded
{shutdown, State#state{
restarts = Restarts1, restart_count = RestartCount1
}}
end.

can_restart(0, _, _, _) ->
{false, 0, []};
can_restart(_, _, _, 0) ->
{true, 0, []};
can_restart(Intensity, Threshold, Restarts, RestartCount) when
RestartCount >= ?STALE_RESTART_LIMIT
->
{NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
can_restart(Intensity, Threshold, Restarts1, NewCount);
can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when
RestartCount >= Intensity andalso Restart < Threshold
->
{NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
can_restart(Intensity, Threshold, Restarts1, NewCount);
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount >= Intensity ->
{false, RestartCount, Restarts};
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount < Intensity ->
{true, RestartCount, Restarts}.

trim_expired_restarts(Threshold, [Restart | Restarts]) when Restart < Threshold ->
trim_expired_restarts(Threshold, Restarts);
trim_expired_restarts(_Threshold, Restarts) ->
{length(Restarts), Restarts}.
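
A quick worked example of the bookkeeping above: with `intensity = 3` and `period = 5`, a fourth restart attempt while three restarts are already recorded inside the five-second window makes `can_restart/4` return `{false, ...}`, so `add_restart/1` returns `{shutdown, ...}` and the supervisor shuts down. Likewise, `trim_expired_restarts(1000, [900, 950, 1100, 1200])` drops the two stale timestamps and returns `{2, [1100, 1200]}`, the new restart count together with the surviving entries.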

restart_many_children(#child{pid = Pid} = Child, Children) ->
Siblings = lists:keydelete(Pid, #child.pid, Children),
{ok, Children1} = terminate_many_children(Siblings, [Child#child{pid = {restarting, Pid}}]),
do_restart_children(Children1, [], []).

terminate_many_children([], NewChildren) ->
{ok, lists:reverse(NewChildren)};
terminate_many_children([Child | Children], NewChildren) ->
case Child of
#child{restart = {terminating, _Restart, From}} = Child when is_pid(From) ->
terminate_many_children(Children, NewChildren);
#child{pid = undefined, restart = temporary} = Child ->
terminate_many_children(Children, NewChildren);
#child{pid = Pid, restart = temporary} = Child when is_pid(Pid) ->
do_terminate(Child),
terminate_many_children(Children, NewChildren);
#child{pid = undefined} = Child ->
terminate_many_children(Children, [Child | NewChildren]);
#child{pid = Pid} = Child when is_pid(Pid) ->
do_terminate(Child),
terminate_many_children(Children, [
Child#child{pid = {restarting, Pid}} | NewChildren
])
end.

do_restart_children([], NewChildren, []) ->
{ok, lists:reverse(NewChildren)};
do_restart_children([], NewChildren, [RetryChild | T] = RetryChildren) ->
if
length(T) =:= 0 ->
{ok, lists:reverse(NewChildren), RetryChild#child.id};
true ->
ok = differed_try_again(RetryChildren),
{ok, lists:reverse(NewChildren)}
end;
do_restart_children([#child{pid = Pid} = Child | Children], NewChildren, RetryChildren) ->
case Pid of
{restarting, _} ->
case try_start(Child) of
{ok, Pid1, {ok, Pid1}} ->
do_restart_children(
Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
);
{ok, Pid1, {ok, Pid1, _Result}} ->
do_restart_children(
Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
);
{ok, undefined, {ok, undefined}} ->
do_restart_children(
Children, [Child#child{pid = undefined} | NewChildren], RetryChildren
);
{error, _} ->
do_restart_children(Children, NewChildren, [Child | RetryChildren])
end;
_ ->
% retain previous ignore children without starting them
do_restart_children(Children, [Child | NewChildren], RetryChildren)
end.

%% Schedules "try again" restarts at 50ms intervals when multiple children have failed to restart
%% on the first attempt. This is an accumulated (reverse start order) list, so the children that
%% should start last get the longest delay before sending the try_again_restart request.
differed_try_again([]) ->
ok;
differed_try_again([Child | Children] = RetryChildren) ->
erlang:send_after(50 * length(RetryChildren), self(), {try_again_restart, Child#child.id}),
differed_try_again(Children).
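
For example, if three children failed their first restart and were accumulated in reverse start order as `[C3, C2, C1]`, the scheduled delays are 150 ms for `C3`, 100 ms for `C2` and 50 ms for `C1`, so the child that should start first is also retried first.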

child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
Child =
case Pid of