Preserve state in singleton managers

This commit is contained in:
2026-03-03 14:13:01 -07:00
parent e0f2d8c4aa
commit 4ac6c09759
4 changed files with 124 additions and 49 deletions

View File

@@ -2,13 +2,16 @@ defmodule Backend.Application do
@moduledoc false
use Application
# application fully started on each node
@impl true
def start(_type, _args) do
children = [
BackendWeb.Telemetry,
{Phoenix.PubSub, name: Backend.PubSub},
Backend.Cluster,
{Backend.GlobalSingleton, Backend.GameRunner},
{Backend.GlobalSingleton,
module: Backend.GameRunner, name: "GameRunner", startup_args: %{}, pubsub_channel: "GameRunner.StartupArgs"},
BackendWeb.Endpoint
]

View File

@@ -12,13 +12,12 @@ defmodule Backend.GameRunner do
# Client API
def start_link(_opts) do
case GenServer.start_link(__MODULE__, %{}, name: @name) do
def start_link(startup_args) do
case GenServer.start_link(__MODULE__, startup_args, name: @name) do
{:ok, pid} ->
{:ok, pid}
{:error, {:already_started, _pid}} ->
# Another instance is already running globally
:ignore
end
end
@@ -50,15 +49,23 @@ defmodule Backend.GameRunner do
GenServer.call(@name, :get_node_name)
end
# Returns the pid of the globally registered GameRunner (synchronous call).
def get_pid, do: GenServer.call(@name, :get_pid)
# Asks the running GameRunner to raise, simulating a crash. Fire-and-forget:
# the cast returns :ok immediately regardless of what happens server-side.
def crash_game, do: GenServer.cast(@name, :crash)
# Server Callbacks
@impl true
def init(_) do
def init(startup_args) do
Logger.info("GameState starting on node: #{node()}")
sleep_delay = round(1000 / @fps)
:timer.send_interval(sleep_delay, :tick)
{:ok, %{players: %{}, tick_number: 0}}
{:ok, %{players: %{}, tick_number: 0} |> Map.merge(startup_args)}
end
@impl true
@@ -103,6 +110,12 @@ defmodule Backend.GameRunner do
{:noreply, broadcast_state(new_state)}
end
@impl true
# Test/ops hook: deliberately crashes the globally registered GameRunner so
# that singleton takeover on another node can be exercised.
def handle_cast(:crash, _state) do
# Log first so the node hosting the crash is recorded, then raise to
# terminate the process — this clause never returns a {:noreply, ...} tuple.
Logger.error("Simulated crash of GameRunner on node #{node()}")
raise "Simulated crash"
end
@impl true
def handle_info(:tick, state) do
if rem(state.tick_number, 100) == 0 do
@@ -146,8 +159,19 @@ defmodule Backend.GameRunner do
{:reply, node(), state}
end
# Reply with this server's own pid; state passes through unchanged.
def handle_call(:get_pid, _from, state), do: {:reply, self(), state}
defp broadcast_state(state) do
Phoenix.PubSub.broadcast(Backend.PubSub, "game_state", {:game_state_updated, state})
Phoenix.PubSub.broadcast(
Backend.PubSub,
"GameRunner.StartupArgs",
{:startup_args_updated, state}
)
state
end
end

View File

@@ -3,63 +3,88 @@ defmodule Backend.GlobalSingleton do
Supervisor that ensures a global singleton process runs across the cluster.
If the node running it crashes, another node will take over.
"""
use Supervisor
use GenServer
require Logger
def start_link(module) do
Supervisor.start_link(__MODULE__, module, name: :"#{module}.GlobalSingleton")
@doc """
Starts a singleton manager process for `module`.

Options (all required):

  * `:module` — the module whose singleton instance is managed
  * `:name` — a string used to build this manager's local registered name
  * `:startup_args` — initial args passed to `module.start_link/1`
  * `:pubsub_channel` — channel to subscribe to for startup-arg updates
    (may be nil/false to skip subscribing)

Reads the options with `Keyword.fetch!/2` instead of pattern-matching the
keyword list in the head, so callers may pass the keys in any order and a
missing key raises a descriptive `KeyError` rather than an opaque
`FunctionClauseError`. Existing callers are unaffected.
"""
def start_link(opts) do
  module = Keyword.fetch!(opts, :module)
  name = Keyword.fetch!(opts, :name)
  startup_args = Keyword.fetch!(opts, :startup_args)
  pubsub_channel = Keyword.fetch!(opts, :pubsub_channel)

  GenServer.start_link(__MODULE__, {module, name, startup_args, pubsub_channel},
    name: get_name(module, name)
  )
end
# Builds the locally registered name for this manager instance by combining
# the configured name and the managed module into one atom. Both values come
# from compile-time supervision config, so atom creation here is bounded.
defp get_name(module, name) do
  :"#{name}.#{module}.GlobalSingleton"
end
@impl true
def init({module, name, startup_args, pubsub_channel}) do
  # Kick off the first singleton check without waiting for the interval.
  send(self(), :check)

  # When a channel is configured, listen for state snapshots published by the
  # singleton so a takeover can restart it with the latest startup args.
  if pubsub_channel, do: Phoenix.PubSub.subscribe(Backend.PubSub, pubsub_channel)

  {:ok, %{module: module, name: name, startup_args: startup_args, monitor_ref: nil}}
end
@impl true
def init(module) do
children = [
%{
id: :monitor_task,
start: {Task, :start_link, [fn -> monitor_loop(module) end]},
restart: :permanent
}
]
def handle_info(
:check,
%{module: module, startup_args: startup_args} = state
) do
Process.send_after(self(), :check, 100)
Supervisor.init(children, strategy: :one_for_one)
end
defp monitor_loop(module) do
process_pid =
case :global.whereis_name(module) do
:undefined ->
# Double-check before attempting to start
Process.sleep(50)
case :global.whereis_name(module) do
:undefined ->
Logger.info("#{module} not running, attempting to start on #{node()}")
case module.start_link([]) do
{:ok, _pid} ->
Logger.info("#{module} started on #{node()}")
{:error, {:already_started, _pid}} ->
Logger.debug("#{module} already started by another node")
_ ->
:ok
end
Process.sleep(100)
monitor_loop(module)
_pid ->
# Another node won the race
monitor_loop(module)
end
attempt_to_start_child_here(%{module: module, startup_args: startup_args})
pid when is_pid(pid) ->
ref = Process.monitor(pid)
receive do
{:DOWN, ^ref, :process, ^pid, _reason} ->
Logger.warning("#{module} went down, attempting takeover")
monitor_loop(module)
pid
end
{:noreply, monitor_if_not_already(process_pid, state)}
end
@impl true
# A fresh snapshot of the singleton's state arrived over PubSub; remember it
# so a future restart hands the child its most recent state.
def handle_info({:startup_args_updated, new_args}, state) do
  Logger.info("Received updated startup args for #{state.module}: #{inspect(new_args)}")
  {:noreply, Map.put(state, :startup_args, new_args)}
end
@impl true
# Our monitored singleton died (ref pins the :DOWN message to the monitor we
# hold). Drop the stale ref and trigger an immediate :check so this node can
# race the other managers to restart it.
def handle_info({:DOWN, ref, :process, _pid, _reason}, %{module: module, monitor_ref: ref} = state) do
  Logger.warning("#{module} went down, attempting takeover on #{node()}")
  Process.send(self(), :check, [])
  {:noreply, Map.put(state, :monitor_ref, nil)}
end
# Attach a monitor the first time we learn the singleton's pid. When a
# monitor is already held, or no pid is known, the state passes through
# untouched — we never stack multiple monitors on the same process.
defp monitor_if_not_already(pid, state) when is_pid(pid) do
  case state do
    %{monitor_ref: nil} -> %{state | monitor_ref: Process.monitor(pid)}
    _ -> state
  end
end

defp monitor_if_not_already(_pid, state), do: state
# Tries to start the singleton `module` on this node with the most recent
# `startup_args`. Returns the child's pid when known, or nil when the start
# was skipped or failed (the next periodic :check will retry/resolve it).
defp attempt_to_start_child_here(%{module: module, startup_args: startup_args}) do
  case module.start_link(startup_args) do
    {:ok, pid} ->
      Logger.info("#{module} started on #{node()}")
      pid

    {:error, {:already_started, pid}} ->
      Logger.debug("#{module} already started by another node")
      pid

    # GameRunner.start_link/1 converts {:error, {:already_started, _}} into
    # :ignore, so handle that explicitly rather than letting it disappear
    # into a silent catch-all.
    :ignore ->
      Logger.debug("#{module} declined to start (already running elsewhere)")
      nil

    other ->
      # Previously any unexpected result was swallowed silently; log it so
      # repeated start failures are visible.
      Logger.warning("#{module} failed to start on #{node()}: #{inspect(other)}")
      nil
  end
end
end

View File

@@ -30,6 +30,29 @@ defmodule Backend.NodeClusterIntegrationTests do
assert length(peer_nodes -- [owner_node]) == 3,
"Expected 3 non-owner peer nodes, got #{inspect(peer_nodes -- [owner_node])}"
end
test "crashing GameRunner gets picked up on other node" do
  peers = start_cluster(2)

  # Resolve the current global GameRunner pid via one of the peer nodes.
  game_runner_pid =
    peers |> hd() |> elem(0) |> :peer.call(:global, :whereis_name, [Backend.GameRunner])

  assert is_pid(game_runner_pid), "Could not find GameRunner in :global registry"
  first_node = node(game_runner_pid)

  GameRunner.crash_game()
  # Give the GlobalSingleton managers (100ms check interval) time to notice
  # the crash and restart the singleton elsewhere. NOTE(review): a fixed
  # sleep can be flaky under load; polling with a deadline would be sturdier.
  :timer.sleep(100)

  new_pid = GameRunner.get_pid()
  # Message fixed: no trailing space, and the test performs a single crash,
  # not "10 crash attempts" as the old failure text claimed.
  assert is_pid(new_pid), "GameRunner did not restart after crash"

  restarted_node = node(new_pid)

  assert restarted_node != first_node,
         "GameRunner restarted on the same node after the crash"
end
end
defp start_cluster(count) do