preserve state in singleton managers

This commit is contained in:
2026-03-03 14:13:01 -07:00
parent e0f2d8c4aa
commit 4ac6c09759
4 changed files with 124 additions and 49 deletions

View File

@@ -2,13 +2,16 @@ defmodule Backend.Application do
@moduledoc false @moduledoc false
use Application use Application
# application fully started on each node
@impl true @impl true
def start(_type, _args) do def start(_type, _args) do
children = [ children = [
BackendWeb.Telemetry, BackendWeb.Telemetry,
{Phoenix.PubSub, name: Backend.PubSub}, {Phoenix.PubSub, name: Backend.PubSub},
Backend.Cluster, Backend.Cluster,
{Backend.GlobalSingleton, Backend.GameRunner}, {Backend.GlobalSingleton,
module: Backend.GameRunner, name: "GameRunner", startup_args: %{}, pubsub_channel: "GameRunner.StartupArgs"},
BackendWeb.Endpoint BackendWeb.Endpoint
] ]

View File

@@ -12,13 +12,12 @@ defmodule Backend.GameRunner do
# Client API # Client API
def start_link(_opts) do def start_link(startup_args) do
case GenServer.start_link(__MODULE__, %{}, name: @name) do case GenServer.start_link(__MODULE__, startup_args, name: @name) do
{:ok, pid} -> {:ok, pid} ->
{:ok, pid} {:ok, pid}
{:error, {:already_started, _pid}} -> {:error, {:already_started, _pid}} ->
# Another instance is already running globally
:ignore :ignore
end end
end end
@@ -50,15 +49,23 @@ defmodule Backend.GameRunner do
GenServer.call(@name, :get_node_name) GenServer.call(@name, :get_node_name)
end end
def get_pid do
GenServer.call(@name, :get_pid)
end
def crash_game do
GenServer.cast(@name, :crash)
end
# Server Callbacks # Server Callbacks
@impl true @impl true
def init(_) do def init(startup_args) do
Logger.info("GameState starting on node: #{node()}") Logger.info("GameState starting on node: #{node()}")
sleep_delay = round(1000 / @fps) sleep_delay = round(1000 / @fps)
:timer.send_interval(sleep_delay, :tick) :timer.send_interval(sleep_delay, :tick)
{:ok, %{players: %{}, tick_number: 0}} {:ok, %{players: %{}, tick_number: 0} |> Map.merge(startup_args)}
end end
@impl true @impl true
@@ -103,6 +110,12 @@ defmodule Backend.GameRunner do
{:noreply, broadcast_state(new_state)} {:noreply, broadcast_state(new_state)}
end end
@impl true
def handle_cast(:crash, _state) do
Logger.error("Simulated crash of GameRunner on node #{node()}")
raise "Simulated crash"
end
@impl true @impl true
def handle_info(:tick, state) do def handle_info(:tick, state) do
if rem(state.tick_number, 100) == 0 do if rem(state.tick_number, 100) == 0 do
@@ -146,8 +159,19 @@ defmodule Backend.GameRunner do
{:reply, node(), state} {:reply, node(), state}
end end
def handle_call(:get_pid, _from, state) do
{:reply, self(), state}
end
defp broadcast_state(state) do defp broadcast_state(state) do
Phoenix.PubSub.broadcast(Backend.PubSub, "game_state", {:game_state_updated, state}) Phoenix.PubSub.broadcast(Backend.PubSub, "game_state", {:game_state_updated, state})
Phoenix.PubSub.broadcast(
Backend.PubSub,
"GameRunner.StartupArgs",
{:startup_args_updated, state}
)
state state
end end
end end

View File

@@ -3,63 +3,88 @@ defmodule Backend.GlobalSingleton do
Supervisor that ensures a global singleton process runs across the cluster. Supervisor that ensures a global singleton process runs across the cluster.
If the node running it crashes, another node will take over. If the node running it crashes, another node will take over.
""" """
use Supervisor use GenServer
require Logger require Logger
def start_link(module) do def start_link(
Supervisor.start_link(__MODULE__, module, name: :"#{module}.GlobalSingleton") module: module,
name: name,
startup_args: startup_args,
pubsub_channel: pubsub_channel
) do
GenServer.start_link(__MODULE__, {module, name, startup_args, pubsub_channel},
name: get_name(module, name)
)
end
defp get_name(module, name), do: :"#{name}.#{module}.GlobalSingleton"
@impl true
def init({module, name, startup_args, pubsub_channel}) do
# immediately schedule first check
Process.send(self(), :check, [])
if pubsub_channel do
Phoenix.PubSub.subscribe(Backend.PubSub, pubsub_channel)
end
{:ok, %{module: module, name: name, startup_args: startup_args, monitor_ref: nil}}
end end
@impl true @impl true
def init(module) do def handle_info(
children = [ :check,
%{ %{module: module, startup_args: startup_args} = state
id: :monitor_task, ) do
start: {Task, :start_link, [fn -> monitor_loop(module) end]}, Process.send_after(self(), :check, 100)
restart: :permanent
}
]
Supervisor.init(children, strategy: :one_for_one) process_pid =
end
defp monitor_loop(module) do
case :global.whereis_name(module) do case :global.whereis_name(module) do
:undefined -> :undefined ->
# Double-check before attempting to start attempt_to_start_child_here(%{module: module, startup_args: startup_args})
Process.sleep(50)
case :global.whereis_name(module) do
:undefined ->
Logger.info("#{module} not running, attempting to start on #{node()}")
case module.start_link([]) do
{:ok, _pid} ->
Logger.info("#{module} started on #{node()}")
{:error, {:already_started, _pid}} ->
Logger.debug("#{module} already started by another node")
_ ->
:ok
end
Process.sleep(100)
monitor_loop(module)
_pid ->
# Another node won the race
monitor_loop(module)
end
pid when is_pid(pid) -> pid when is_pid(pid) ->
ref = Process.monitor(pid) pid
end
receive do {:noreply, monitor_if_not_already(process_pid, state)}
{:DOWN, ^ref, :process, ^pid, _reason} -> end
Logger.warning("#{module} went down, attempting takeover")
monitor_loop(module) @impl true
end def handle_info({:startup_args_updated, new_args}, state) do
Logger.info("Received updated startup args for #{state.module}: #{inspect(new_args)}")
{:noreply, %{state | startup_args: new_args}}
end
@impl true
def handle_info(
{:DOWN, ref, :process, _pid, _reason},
%{module: module, monitor_ref: ref} = state
) do
Logger.warning("#{module} went down, attempting takeover on #{node()}")
send(self(), :check)
{:noreply, %{state | monitor_ref: nil}}
end
defp monitor_if_not_already(pid, %{monitor_ref: nil} = state) when is_pid(pid) do
ref = Process.monitor(pid)
%{state | monitor_ref: ref}
end
defp monitor_if_not_already(_pid, state), do: state
defp attempt_to_start_child_here(%{module: module, startup_args: startup_args}) do
case module.start_link(startup_args) do
{:ok, pid} ->
Logger.info("#{module} started on #{node()}")
pid
{:error, {:already_started, pid}} ->
Logger.debug("#{module} already started by another node")
pid
_ ->
nil
end end
end end
end end

View File

@@ -30,6 +30,29 @@ defmodule Backend.NodeClusterIntegrationTests do
assert length(peer_nodes -- [owner_node]) == 3, assert length(peer_nodes -- [owner_node]) == 3,
"Expected 3 non-owner peer nodes, got #{inspect(peer_nodes -- [owner_node])}" "Expected 3 non-owner peer nodes, got #{inspect(peer_nodes -- [owner_node])}"
end end
test "crashing GameRunner gets picked up on other node" do
peers = start_cluster(2)
game_runner_pid =
:peer.call(hd(peers) |> elem(0), :global, :whereis_name, [Backend.GameRunner])
assert is_pid(game_runner_pid), "Could not find GameRunner in :global registry"
first_node = node(game_runner_pid)
GameRunner.crash_game()
:timer.sleep(100)
new_pid = GameRunner.get_pid()
assert is_pid(new_pid), "GameRunner did not restart after crash"
restarted_node = node(new_pid)
assert restarted_node != first_node,
"GameRunner restarted on the same node after crash"
end
end end
defp start_cluster(count) do defp start_cluster(count) do