updates
Some checks failed
CI/CD Pipeline / build (push) Failing after 5s

This commit is contained in:
2026-03-25 15:13:43 -06:00
parent 0041c25f19
commit 62f16b2bde
7 changed files with 318 additions and 179 deletions

View File

@@ -21,6 +21,8 @@ defmodule ElixirAi.ConversationManager do
def init(_) do
Logger.info("ConversationManager initializing...")
:pg.join(ElixirAi.SingletonPG, {:singleton, __MODULE__}, self())
# Mitigation 4: receive :nodedown when a cluster peer disappears (sleep/wake, crash)
:net_kernel.monitor_nodes(true)
send(self(), :load_conversations)
{:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}}
end
@@ -102,14 +104,57 @@ defmodule ElixirAi.ConversationManager do
end
def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do
runners =
Enum.reject(runners, fn {_name, info} -> info.pid == pid end)
# Find the name before removing so we can check for a Horde redistribution replacement
{name, _} = Enum.find(runners, {nil, nil}, fn {_n, info} -> info.pid == pid end)
new_runners =
runners
|> Enum.reject(fn {_n, info} -> info.pid == pid end)
|> Map.new()
Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})")
{:noreply, %{state | runners: runners}}
# Mitigation 2: Horde may have already restarted the runner on another node; re-monitor it
new_runners =
if name do
case :pg.get_members(ElixirAi.RunnerPG, {:runner, name}) do
[new_pid | _] when new_pid != pid ->
Logger.info(
"ConversationManager: re-monitoring redistributed runner for #{name} at #{inspect(new_pid)}"
)
Process.monitor(new_pid)
Map.put(new_runners, name, %{pid: new_pid, node: node(new_pid)})
_ ->
new_runners
end
else
new_runners
end
{:noreply, %{state | runners: new_runners}}
end
# Mitigation 4: node went down — evict all cached runners on that node immediately,
# before the individual :DOWN messages for each pid arrive.
def handle_info({:nodedown, down_node}, %{runners: runners} = state) do
  # Partition once instead of scanning the map twice with the same predicate
  # (the old code ran Enum.filter/2 and then Map.reject/2 over `runners`).
  {stale, live} =
    Enum.split_with(runners, fn {_name, info} -> info.node == down_node end)

  if stale != [] do
    names = Enum.map(stale, &elem(&1, 0))

    Logger.info(
      "ConversationManager: node #{down_node} down, clearing stale runners: #{inspect(names)}"
    )
  end

  {:noreply, %{state | runners: Map.new(live)}}
end

# Mitigation 4 counterpart: nothing to do on :nodeup — runners on the returning
# node are re-cached lazily the next time a request for them arrives.
def handle_info({:nodeup, _node}, state), do: {:noreply, state}
def handle_info({:error, {:db_error, reason}}, state) do
Logger.error("ConversationManager received db_error: #{inspect(reason)}")
{:noreply, state}
@@ -142,7 +187,33 @@ defmodule ElixirAi.ConversationManager do
conversations = Map.new(conversation_list, fn %{name: name} -> {name, []} end)
Logger.info("Conversation map keys: #{inspect(Map.keys(conversations))}")
{:noreply, %{state | conversations: conversations}}
# Mitigation 3: after a ConversationManager restart, re-establish monitors for any
# ChatRunners that are still alive in Horde — they carry on running but we lost
# all monitor refs when this process restarted.
runners =
:pg.which_groups(ElixirAi.RunnerPG)
|> Enum.flat_map(fn
{:runner, name} ->
case :pg.get_members(ElixirAi.RunnerPG, {:runner, name}) do
[pid | _] ->
Process.monitor(pid)
[{name, %{pid: pid, node: node(pid)}}]
_ ->
[]
end
_ ->
[]
end)
|> Map.new()
Logger.info(
"ConversationManager: re-established monitors for #{map_size(runners)} live runners"
)
{:noreply, %{state | conversations: conversations, runners: runners}}
end
def handle_info({:retry_call, message, from}, state) do
@@ -195,9 +266,32 @@ defmodule ElixirAi.ConversationManager do
ElixirAi.ChatRunnerSupervisor,
{ElixirAi.ChatRunner, name: name}
) do
{:ok, pid} -> {:ok, pid}
{:error, {:already_started, pid}} -> {:ok, pid}
error -> error
{:ok, pid} ->
{:ok, pid}
{:error, {:already_started, pid}} ->
# Mitigation 6: the returned pid may be on a node that just went down but whose
# :DOWN message hasn't been processed yet; verify the node is still reachable.
if node_alive?(node(pid)) do
{:ok, pid}
else
# Node is gone; Horde will redistribute — wait briefly for the new registration.
case registry_lookup_with_retry(name) do
nil -> {:error, :runner_unavailable}
new_pid -> {:ok, new_pid}
end
end
# Mitigation 1: :already_present means Horde knows the child spec but the process
# is mid-redistribution and not yet registered. Retry the registry until it appears.
{:error, :already_present} ->
case registry_lookup_with_retry(name) do
nil -> {:error, :runner_unavailable}
pid -> {:ok, pid}
end
error ->
error
end
case result do
@@ -210,14 +304,21 @@ defmodule ElixirAi.ConversationManager do
MapSet.put(state.subscriptions, name)
end
existing_runners = Map.get(state, :runners, %{})
new_runners =
if Map.has_key?(existing_runners, name) do
existing_runners
else
Process.monitor(pid)
Map.put(existing_runners, name, %{pid: pid, node: node(pid)})
case Map.get(state.runners, name) do
nil ->
Process.monitor(pid)
Map.put(state.runners, name, %{pid: pid, node: node(pid)})
%{pid: ^pid} ->
# Same pid — nothing to update
state.runners
%{pid: old_pid} ->
# Pid changed (redistribution raced ahead of :DOWN) — swap the monitor
Process.demonitor(old_pid, [:flush])
Process.monitor(pid)
Map.put(state.runners, name, %{pid: pid, node: node(pid)})
end
{:ok, pid, new_subscriptions, new_runners}
@@ -226,4 +327,22 @@ defmodule ElixirAi.ConversationManager do
error
end
end
# Mitigation 5: Horde registry syncs via delta-CRDT with up to ~100ms lag after a
# process moves nodes. Retry with exponential backoff before concluding it doesn't exist.
#
# Looks up `name` in ElixirAi.ChatRegistry up to `retries` times, sleeping
# `delay_ms` (doubled on each attempt) between tries. Returns the registered
# pid, or nil once all attempts fail.
# NOTE(review): Process.sleep/1 blocks this GenServer while retrying — with the
# defaults that is up to 150ms of unresponsiveness; acceptable per the comment
# above, but worth confirming under load.
defp registry_lookup_with_retry(name, retries \\ 3, delay_ms \\ 50)

defp registry_lookup_with_retry(_name, 0, _delay_ms), do: nil

defp registry_lookup_with_retry(name, retries, delay_ms) do
  case Horde.Registry.lookup(ElixirAi.ChatRegistry, name) do
    [{pid, _} | _] when is_pid(pid) ->
      pid

    _ ->
      # Bug fix: don't sleep after the FINAL failed lookup. The old code slept
      # one extra backoff period (200ms with the defaults) and then immediately
      # returned nil from the `retries == 0` clause — pure added latency.
      if retries > 1, do: Process.sleep(delay_ms)
      registry_lookup_with_retry(name, retries - 1, delay_ms * 2)
  end
end
# True when `n` is this node itself or one of the currently-connected peers.
defp node_alive?(n), do: n in [Node.self() | Node.list()]
end