better process tracking for admin dashboard
Some checks failed
CI/CD Pipeline / build (push) Failing after 5s

This commit is contained in:
2026-03-20 12:07:16 -06:00
parent b2f53942a2
commit 6138d71d29
21 changed files with 910 additions and 106 deletions

View File

@@ -27,9 +27,16 @@ defmodule ElixirAi.ChatUtils do
}
run_function = fn current_message_id, tool_call_id, args ->
Task.start(fn ->
result = function.(args)
send(server, {:tool_response, current_message_id, tool_call_id, result})
Task.start_link(fn ->
try do
result = function.(args)
send(server, {:tool_response, current_message_id, tool_call_id, result})
rescue
e ->
reason = Exception.format(:error, e, __STACKTRACE__)
Logger.error("Tool task crashed: #{reason}")
send(server, {:tool_response, current_message_id, tool_call_id, {:error, reason}})
end
end)
end
@@ -41,7 +48,7 @@ defmodule ElixirAi.ChatUtils do
end
def request_ai_response(server, messages, tools, provider) do
Task.start(fn ->
Task.start_link(fn ->
api_url = provider.completions_url
api_key = provider.api_token
model = provider.model_name
@@ -82,7 +89,8 @@ defmodule ElixirAi.ChatUtils do
:ok
{:error, reason} ->
IO.warn("AI request failed: #{inspect(reason)} for #{api_url}")
Logger.warning("AI request failed: #{inspect(reason)} for #{api_url}")
send(server, {:ai_request_error, reason})
end
end)
end

View File

@@ -12,6 +12,7 @@ defmodule ElixirAi.Application do
{Cluster.Supervisor,
[Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
{Phoenix.PubSub, name: ElixirAi.PubSub},
{ElixirAi.LiveViewPG, []},
ElixirAi.ToolTesting,
ElixirAiWeb.Endpoint,
{Horde.Registry,
@@ -55,7 +56,7 @@ defmodule ElixirAi.Application do
if Application.get_env(:elixir_ai, :env) == :test do
Supervisor.child_spec({Task, fn -> :ok end}, id: :skip_default_provider)
else
{Task, fn -> ElixirAi.AiProvider.ensure_default_provider() end}
{Task, fn -> ElixirAi.AiProvider.ensure_configured_providers() end}
end
end

View File

@@ -49,6 +49,7 @@ defmodule ElixirAi.ChatRunner do
"Last message role was #{last_message.role}, requesting AI response for conversation #{name}"
)
broadcast_ui(name, :recovery_restart)
ElixirAi.ChatUtils.request_ai_response(self(), messages, tools(self(), name), provider)
end

View File

@@ -3,17 +3,33 @@ defmodule ElixirAi.ClusterSingleton do
require Logger
@sync_delay_ms 200
@retry_delay_ms 500
@singletons [ElixirAi.ConversationManager]
# Starts the ClusterSingleton GenServer, registered locally under this module's name.
def start_link(opts) do
  GenServer.start_link(__MODULE__, opts, name: __MODULE__)
end
# Synchronously asks the locally registered ClusterSingleton process for its
# current state (answered by the `:status` handle_call clause).
def status do
  GenServer.call(__MODULE__, :status)
end
# Returns the compile-time list of modules managed as cluster singletons.
def configured_singletons do
  @singletons
end
# Starts in the :pending state and schedules the first :start_singletons
# attempt after @sync_delay_ms.
def init(_opts) do
  _timer_ref = Process.send_after(self(), :start_singletons, @sync_delay_ms)

  {:ok, :pending}
end
def handle_info(:start_singletons, _state) do
# Starts the singletons once at least one peer node is connected; otherwise
# re-schedules itself after @retry_delay_ms and keeps the current state.
# NOTE(review): while Node.list/0 stays empty this retries indefinitely —
# confirm that a single-node deployment is not expected to start singletons.
def handle_info(:start_singletons, state) do
  case Node.list() do
    [] ->
      Logger.debug("ClusterSingleton: no peer nodes yet, retrying in #{@retry_delay_ms}ms")
      Process.send_after(self(), :start_singletons, @retry_delay_ms)
      {:noreply, state}

    _peers ->
      start_singletons()
      {:noreply, :started}
  end
end
defp start_singletons do
for module <- @singletons do
if singleton_exists?(module) do
Logger.debug(
@@ -37,10 +53,10 @@ defmodule ElixirAi.ClusterSingleton do
end
end
end
{:noreply, :started}
end
# Replies with the server state itself; the state is left unchanged.
def handle_call(:status, _from, state) do
  {:reply, state, state}
end
defp singleton_exists?(module) do
case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
[{pid, _metadata} | _] when is_pid(pid) ->

View File

@@ -21,7 +21,7 @@ defmodule ElixirAi.ConversationManager do
def init(_) do
Logger.info("ConversationManager initializing...")
send(self(), :load_conversations)
{:ok, %{conversations: :loading, subscriptions: MapSet.new()}}
{:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}}
end
def create_conversation(name, ai_provider_id) do
@@ -40,6 +40,10 @@ defmodule ElixirAi.ConversationManager do
GenServer.call(@name, {:get_messages, name})
end
# Synchronously asks the process registered as @name for its tracked runners
# (answered by the `:list_runners` handle_call clause).
def list_runners, do: GenServer.call(@name, :list_runners)
def handle_call(message, from, %{conversations: :loading} = state) do
Logger.warning(
"Received call #{inspect(message)} from #{inspect(from)} while loading conversations. Retrying after delay."
@@ -75,7 +79,7 @@ defmodule ElixirAi.ConversationManager do
%{conversations: conversations} = state
) do
if Map.has_key?(conversations, name) do
reply_with_started(name, state)
reply_with_conversation(name, state)
else
{:reply, {:error, :not_found}, state}
end
@@ -84,10 +88,6 @@ defmodule ElixirAi.ConversationManager do
def handle_call(:list, _from, %{conversations: conversations} = state) do
keys = Map.keys(conversations)
Logger.debug(
"list_conversations returning: #{inspect(keys, limit: :infinity, printable_limit: :infinity, binaries: :as_binaries)}"
)
{:reply, keys, state}
end
@@ -95,6 +95,19 @@ defmodule ElixirAi.ConversationManager do
{:reply, Map.get(conversations, name, []), state}
end
# Replies with the tracked runners map, or an empty map when the state has no
# :runners key. The state is returned unchanged.
def handle_call(:list_runners, _from, state) do
  tracked =
    case Map.fetch(state, :runners) do
      {:ok, runners} -> runners
      :error -> %{}
    end

  {:reply, tracked, state}
end
# Monitor notification: drops every tracked runner whose pid matches the
# process that went down, logs the event, and stores the remaining runners.
def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do
  remaining =
    for {name, info} <- runners, info.pid != pid, into: %{} do
      {name, info}
    end

  Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})")

  {:noreply, %{state | runners: remaining}}
end
def handle_info({:db_error, reason}, state) do
Logger.error("ConversationManager received db_error: #{inspect(reason)}")
{:noreply, state}
@@ -138,10 +151,13 @@ defmodule ElixirAi.ConversationManager do
end
end
# Returns {pid} to callers that only need to know the process started (e.g. create).
defp reply_with_started(name, state, update_state \\ fn s -> s end) do
case start_and_subscribe(name, state.subscriptions) do
{:ok, pid, new_subscriptions} ->
new_state = update_state.(%{state | subscriptions: new_subscriptions})
case start_and_subscribe(name, state) do
{:ok, pid, new_subscriptions, new_runners} ->
new_state =
update_state.(%{state | subscriptions: new_subscriptions, runners: new_runners})
{:reply, {:ok, pid}, new_state}
{:error, _reason} = error ->
@@ -149,7 +165,21 @@ defmodule ElixirAi.ConversationManager do
end
end
defp start_and_subscribe(name, subscriptions) do
# Ensures the runner for `name` is started and tracked, then fetches the full
# conversation from that pid directly instead of going through the Horde
# registry (which may not have synced yet on the calling node).
# Replies {:ok, conversation} with the updated subscriptions/runners, or passes
# an {:error, reason} through with the state untouched.
defp reply_with_conversation(name, state) do
  started = start_and_subscribe(name, state)

  case started do
    {:ok, runner_pid, subscriptions, runners} ->
      updated_state = %{state | subscriptions: subscriptions, runners: runners}
      {:reply, {:ok, GenServer.call(runner_pid, :get_conversation)}, updated_state}

    {:error, _reason} = failure ->
      {:reply, failure, state}
  end
end
defp start_and_subscribe(name, state) do
result =
case Horde.DynamicSupervisor.start_child(
ElixirAi.ChatRunnerSupervisor,
@@ -163,14 +193,24 @@ defmodule ElixirAi.ConversationManager do
case result do
{:ok, pid} ->
new_subscriptions =
if MapSet.member?(subscriptions, name) do
subscriptions
if MapSet.member?(state.subscriptions, name) do
state.subscriptions
else
Phoenix.PubSub.subscribe(ElixirAi.PubSub, conversation_message_topic(name))
MapSet.put(subscriptions, name)
MapSet.put(state.subscriptions, name)
end
{:ok, pid, new_subscriptions}
existing_runners = Map.get(state, :runners, %{})
new_runners =
if Map.has_key?(existing_runners, name) do
existing_runners
else
Process.monitor(pid)
Map.put(existing_runners, name, %{pid: pid, node: node(pid)})
end
{:ok, pid, new_subscriptions, new_runners}
error ->
error

View File

@@ -138,28 +138,80 @@ defmodule ElixirAi.AiProvider do
end
def ensure_default_provider do
sql = "SELECT COUNT(*) FROM ai_providers"
params = %{}
endpoint = Application.get_env(:elixir_ai, :ai_endpoint)
token = Application.get_env(:elixir_ai, :ai_token)
model = Application.get_env(:elixir_ai, :ai_model)
case DbHelpers.run_sql(sql, params, providers_topic()) do
{:error, :db_error} ->
{:error, :db_error}
if endpoint && token && model do
case find_by_name("default") do
{:error, :not_found} ->
attrs = %{
name: "default",
model_name: model,
api_token: token,
completions_url: endpoint
}
rows ->
case rows do
[%{"count" => 0}] ->
attrs = %{
name: "default",
model_name: Application.fetch_env!(:elixir_ai, :ai_model),
api_token: Application.fetch_env!(:elixir_ai, :ai_token),
completions_url: Application.fetch_env!(:elixir_ai, :ai_endpoint)
}
create(attrs)
create(attrs)
{:ok, _} ->
:ok
_ ->
:ok
{:error, reason} ->
{:error, reason}
end
else
Logger.info("AI env vars not configured, skipping default provider creation")
:ok
end
end
# Reads the YAML file named by the PROVIDERS_CONFIG_PATH environment variable
# (if set) and ensures each entry of its top-level "providers" list exists.
# Does nothing when the variable is unset; logs a warning when the file cannot
# be read or has no "providers" list.
def ensure_providers_from_file do
  "PROVIDERS_CONFIG_PATH"
  |> System.get_env()
  |> ensure_providers_from_path()
end

# No config path configured: nothing to load.
defp ensure_providers_from_path(nil), do: :ok

defp ensure_providers_from_path(path) do
  case YamlElixir.read_from_file(path) do
    {:ok, %{"providers" => entries}} when is_list(entries) ->
      Enum.each(entries, &ensure_provider_from_yaml/1)

    {:ok, _} ->
      Logger.warning("providers.yml: expected a top-level 'providers' list, skipping")

    {:error, reason} ->
      Logger.warning("Could not read providers config from #{path}: #{inspect(reason)}")
  end
end
# Ensures the env-configured default provider first, then the providers from
# the optional config file. Only the file step's result is returned.
def ensure_configured_providers do
  _default_result = ensure_default_provider()

  ensure_providers_from_file()
end
# Ensures a provider described by one entry of the providers config file exists.
#
# A valid entry is a map with the string keys "name", "model",
# "responses_endpoint" and "api_key". The provider is created only when
# find_by_name/1 reports {:error, :not_found}; an existing provider is left
# untouched, and any other lookup error is logged.
defp ensure_provider_from_yaml(%{
       "name" => name,
       "model" => model,
       "responses_endpoint" => endpoint,
       "api_key" => api_key
     }) do
  case find_by_name(name) do
    {:error, :not_found} ->
      Logger.info("Creating provider '#{name}' from providers config file")
      create(%{name: name, model_name: model, api_token: api_key, completions_url: endpoint})

    {:ok, _} ->
      Logger.debug("Provider '#{name}' already exists, skipping")

    {:error, reason} ->
      Logger.warning("Could not check existence of provider '#{name}': #{inspect(reason)}")
  end
end

# Anything else is skipped with a warning. The entry is redacted first: a
# malformed entry (e.g. one missing "model") can still carry a real "api_key",
# and that secret must not end up in the logs.
defp ensure_provider_from_yaml(entry) do
  Logger.warning(
    "Skipping invalid provider entry in providers config file (must have name, model, responses_endpoint, api_key): #{inspect(redact_api_key(entry))}"
  )
end

# Replaces the value of a top-level "api_key" with a placeholder; other terms pass through.
defp redact_api_key(%{"api_key" => _secret} = entry), do: %{entry | "api_key" => "[REDACTED]"}
defp redact_api_key(entry), do: entry
end

View File

@@ -0,0 +1,16 @@
defmodule ElixirAi.LiveViewPG do
  @moduledoc """
  Named `:pg` scope used to track LiveView processes across the cluster.

  LiveViews join the group `{:liveview, ViewModule}` in this scope once they
  are connected. `:pg` propagates membership between nodes and drops processes
  from their groups when they die, so no explicit cleanup is needed.
  """

  # The :pg scope is named after this module.
  @scope __MODULE__

  # Supervisor child spec that starts the :pg scope as a permanent worker.
  def child_spec(_opts) do
    start = {:pg, :start_link, [@scope]}

    %{id: @scope, start: start, type: :worker, restart: :permanent}
  end
end

View File

@@ -0,0 +1,245 @@
defmodule ElixirAiWeb.AdminLive do
  @moduledoc """
  Cluster admin dashboard.

  For every connected node it shows the `ElixirAi.ClusterSingleton` status, the
  configured singletons located on that node, the chat runners supervised
  there, and the LiveView processes registered in the `ElixirAi.LiveViewPG`
  scope. The snapshot is rebuilt on a timer and on node up/down events.
  """
  use ElixirAiWeb, :live_view
  require Logger

  @refresh_ms 1_000
  @rpc_timeout_ms 3_000

  @impl true
  def mount(_params, _session, socket) do
    # Subscriptions and timers only make sense for the connected (websocket) mount.
    if connected?(socket) do
      :net_kernel.monitor_nodes(true)
      :pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
      schedule_refresh()
    end

    {:ok, assign(socket, cluster_info: gather_info())}
  end

  @impl true
  def handle_info({event, _node}, socket) when event in [:nodeup, :nodedown] do
    {:noreply, assign(socket, cluster_info: gather_info())}
  end

  def handle_info(:refresh, socket) do
    # Gather first, re-arm afterwards: gathering can block on remote calls, and
    # arming the timer up front would let :refresh messages pile up meanwhile.
    cluster_info = gather_info()
    schedule_refresh()
    {:noreply, assign(socket, cluster_info: cluster_info)}
  end

  defp schedule_refresh, do: Process.send_after(self(), :refresh, @refresh_ms)

  # Builds the snapshot rendered by the dashboard.
  defp gather_info do
    configured = ElixirAi.ClusterSingleton.configured_singletons()

    %{
      nodes: Enum.map([Node.self() | Node.list()], &{&1, singleton_status(&1)}),
      configured_singletons: configured,
      singleton_locations: Enum.map(configured, &{&1, singleton_location(&1)}),
      chat_runners: chat_runners(),
      liveviews: liveviews()
    }
  end

  # ClusterSingleton status of `target`, or :unreachable when it cannot be obtained.
  defp singleton_status(target) do
    if target == Node.self() do
      try do
        ElixirAi.ClusterSingleton.status()
      catch
        _kind, _reason -> :unreachable
      end
    else
      case :rpc.call(target, ElixirAi.ClusterSingleton, :status, [], @rpc_timeout_ms) do
        {:badrpc, _reason} -> :unreachable
        status -> status
      end
    end
  end

  # Node hosting the singleton `module`, or nil when it is not registered.
  defp singleton_location(module) do
    case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
      [{pid, _value}] -> node(pid)
      _ -> nil
    end
  end

  # Children of the ChatRunner supervisor that are registered under exactly one
  # binary key in the distributed registry, as {name, node, pid} tuples sorted
  # by name. The registry is read once and indexed by pid instead of being
  # queried once per child.
  defp chat_runners do
    keys_by_pid =
      ElixirAi.ChatRegistry
      |> Horde.Registry.select([{{:"$1", :"$2", :_}, [], [{{:"$1", :"$2"}}]}])
      |> Enum.group_by(fn {_key, pid} -> pid end, fn {key, _pid} -> key end)

    ElixirAi.ChatRunnerSupervisor
    |> Horde.DynamicSupervisor.which_children()
    |> Enum.flat_map(fn
      {_id, pid, _type, _modules} when is_pid(pid) ->
        case keys_by_pid do
          %{^pid => [name]} when is_binary(name) -> [{name, node(pid), pid}]
          _ -> []
        end

      _ ->
        []
    end)
    |> Enum.sort_by(&elem(&1, 0))
  end

  # {view_module, node} for every member of a {:liveview, view} group.
  # :pg is cluster-wide — one local call returns members from all nodes, and
  # processes are removed from their group automatically when they die.
  defp liveviews do
    ElixirAi.LiveViewPG
    |> :pg.which_groups()
    |> Enum.flat_map(fn
      {:liveview, view} = group ->
        ElixirAi.LiveViewPG
        |> :pg.get_members(group)
        |> Enum.map(fn pid -> {view, node(pid)} end)

      _ ->
        []
    end)
  end

  @impl true
  def render(assigns) do
    ~H"""
    <div class="p-6 space-y-4">
      <h1 class="text-lg font-semibold text-cyan-200 tracking-wide">Cluster Admin</h1>
      <div class="grid gap-4 grid-cols-1 lg:grid-cols-2 xl:grid-cols-3">
        <%= for {node, status} <- @cluster_info.nodes do %>
          <% node_singletons =
            Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> loc == node end) %>
          <% node_runners =
            Enum.filter(@cluster_info.chat_runners, fn {_, rnode, _} -> rnode == node end) %>
          <% node_liveviews =
            @cluster_info.liveviews
            |> Enum.filter(fn {_, n} -> n == node end)
            |> Enum.group_by(fn {view, _} -> view end) %>
          <div class="rounded-lg border border-cyan-800/50 bg-cyan-950/30 overflow-hidden">
            <div class="flex items-center justify-between px-4 py-3 bg-cyan-900/40 border-b border-cyan-800/50">
              <div class="flex items-center gap-2">
                <span class="font-mono text-sm font-semibold text-cyan-200">{node}</span>
                <%= if node == Node.self() do %>
                  <span class="text-xs bg-cyan-800/50 text-cyan-400 px-1.5 py-0.5 rounded">self</span>
                <% end %>
              </div>
              <.status_badge status={status} />
            </div>
            <div class="p-4 space-y-4">
              <%= if node_singletons != [] do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    Singletons
                  </p>
                  <div class="space-y-1">
                    <%= for {module, _} <- node_singletons do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 font-mono text-xs text-cyan-300">
                        {inspect(module)}
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_runners != [] do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    Chat Runners
                    <span class="normal-case font-normal text-cyan-700 ml-1">
                      {length(node_runners)}
                    </span>
                  </p>
                  <div class="space-y-1">
                    <%= for {name, _, _} <- node_runners do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 font-mono text-xs text-cyan-200">
                        {name}
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_liveviews != %{} do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    LiveViews
                  </p>
                  <div class="space-y-1">
                    <%= for {view, instances} <- node_liveviews do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 flex justify-between items-center gap-2">
                        <span class="font-mono text-xs text-cyan-200">{short_module(view)}</span>
                        <span class="text-xs text-cyan-600">×{length(instances)}</span>
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_singletons == [] and node_runners == [] and node_liveviews == %{} do %>
                <p class="text-xs text-cyan-700 italic">No active processes</p>
              <% end %>
            </div>
          </div>
        <% end %>
      </div>
      <% unlocated =
        Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> is_nil(loc) end) %>
      <%= if unlocated != [] do %>
        <section>
          <h2 class="text-xs font-semibold uppercase tracking-widest text-red-500 mb-2">
            Singletons Not Running
          </h2>
          <div class="flex flex-wrap gap-2">
            <%= for {module, _} <- unlocated do %>
              <span class="px-2 py-1 rounded bg-red-900/20 border border-red-800/40 font-mono text-xs text-red-400">
                {inspect(module)}
              </span>
            <% end %>
          </div>
        </section>
      <% end %>
      <p class="text-xs text-cyan-800">Refreshes every 1s or on node events.</p>
    </div>
    """
  end

  # Last segment of a module name, e.g. ElixirAiWeb.ChatLive -> "ChatLive".
  defp short_module(module) when is_atom(module) do
    module
    |> Atom.to_string()
    |> String.replace_prefix("Elixir.", "")
    |> String.split(".")
    |> List.last()
  end

  # Function component: coloured badge for a node's ClusterSingleton status.
  # Statuses other than :started, :pending and :unreachable are shown inspected.
  defp status_badge(assigns) do
    ~H"""
    <%= case @status do %>
      <% :started -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-green-900 text-green-300">
          started
        </span>
      <% :pending -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-yellow-900 text-yellow-300">
          pending
        </span>
      <% :unreachable -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-red-900 text-red-300">
          unreachable
        </span>
      <% other -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-cyan-900 text-cyan-300">
          {inspect(other)}
        </span>
    <% end %>
    """
  end
end

View File

@@ -10,13 +10,12 @@ defmodule ElixirAiWeb.ChatLive do
def mount(%{"name" => name}, _session, socket) do
case ConversationManager.open_conversation(name) do
{:ok, _pid} ->
{:ok, conversation} ->
if connected?(socket) do
Phoenix.PubSub.subscribe(ElixirAi.PubSub, chat_topic(name))
:pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
end
conversation = ChatRunner.get_conversation(name)
{:ok,
socket
|> assign(conversation_name: name)
@@ -25,7 +24,8 @@ defmodule ElixirAiWeb.ChatLive do
|> assign(streaming_response: conversation.streaming_response)
|> assign(background_color: "bg-cyan-950/30")
|> assign(provider: conversation.provider)
|> assign(db_error: nil)}
|> assign(db_error: nil)
|> assign(ai_error: nil)}
{:error, :not_found} ->
{:ok, push_navigate(socket, to: "/")}
@@ -41,7 +41,8 @@ defmodule ElixirAiWeb.ChatLive do
|> assign(streaming_response: nil)
|> assign(background_color: "bg-cyan-950/30")
|> assign(provider: nil)
|> assign(db_error: Exception.format(:error, reason))}
|> assign(db_error: Exception.format(:error, reason))
|> assign(ai_error: nil)}
end
end
@@ -60,6 +61,11 @@ defmodule ElixirAiWeb.ChatLive do
Database error: {@db_error}
</div>
<% end %>
<%= if @ai_error do %>
<div class="mx-4 mt-2 px-3 py-2 rounded text-sm text-red-400 bg-red-950/40" role="alert">
AI error: {@ai_error}
</div>
<% end %>
<div
id="chat-messages"
phx-hook="ScrollBottom"
@@ -118,6 +124,10 @@ defmodule ElixirAiWeb.ChatLive do
{:noreply, assign(socket, user_input: "")}
end
# A recovery restart was announced: clear any in-flight streaming response and
# any previously shown AI error.
def handle_info(:recovery_restart, socket) do
  cleared =
    socket
    |> assign(:streaming_response, nil)
    |> assign(:ai_error, nil)

  {:noreply, cleared}
end
def handle_info({:user_chat_message, message}, socket) do
{:noreply,
socket
@@ -210,6 +220,16 @@ defmodule ElixirAiWeb.ChatLive do
{:noreply, assign(socket, db_error: reason)}
end
# An AI request failed: show a readable error and drop the streaming response.
# Struct-like reasons carrying a :reason field are rendered as
# "Module: inner_reason"; anything else is shown inspected.
def handle_info({:ai_request_error, reason}, socket) do
  description =
    case reason do
      %{__struct__: struct_module, reason: inner} ->
        "#{inspect(struct_module)}: #{inspect(inner)}"

      _ ->
        inspect(reason)
    end

  updated =
    socket
    |> assign(:ai_error, description)
    |> assign(:streaming_response, nil)

  {:noreply, updated}
end
def handle_info({:set_background_color, color}, socket) do
Logger.info("setting background color to #{color}")
{:noreply, assign(socket, background_color: color)}

View File

@@ -8,6 +8,7 @@ defmodule ElixirAiWeb.HomeLive do
def mount(_params, _session, socket) do
if connected?(socket) do
Phoenix.PubSub.subscribe(ElixirAi.PubSub, providers_topic())
:pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
send(self(), :load_data)
end

View File

@@ -19,6 +19,7 @@ defmodule ElixirAiWeb.Router do
live "/", HomeLive
live "/chat/:name", ChatLive
live "/admin", AdminLive
end
# Other scopes may use custom stacks.