better process tracking for admin dashboard
Some checks failed
CI/CD Pipeline / build (push) Failing after 5s

This commit is contained in:
2026-03-20 12:07:16 -06:00
parent b2f53942a2
commit 6138d71d29
21 changed files with 910 additions and 106 deletions

4
.gitignore vendored
View File

@@ -38,4 +38,6 @@ npm-debug.log
elixir_ls/
.env
*.tmp
*.tmp
providers.yml

View File

@@ -1,17 +1,10 @@
# This file is responsible for configuring your application
# and its dependencies with the aid of the Config module.
#
# This configuration file is loaded before any dependency and
# is restricted to this project.
# General application configuration
# General config, overridden by other files in this directory.
import Config
config :elixir_ai,
ecto_repos: [ElixirAi.Repo],
generators: [timestamp_type: :utc_datetime]
# Configures the endpoint
config :elixir_ai, ElixirAiWeb.Endpoint,
url: [host: "localhost"],
adapter: Bandit.PhoenixAdapter,
@@ -22,7 +15,6 @@ config :elixir_ai, ElixirAiWeb.Endpoint,
pubsub_server: ElixirAi.PubSub,
live_view: [signing_salt: "4UG1IVt+"]
# Configure esbuild (the version is required)
config :esbuild,
version: "0.17.11",
elixir_ai: [
@@ -32,7 +24,6 @@ config :esbuild,
env: %{"NODE_PATH" => Path.expand("../deps", __DIR__)}
]
# Configure tailwind (the version is required)
config :tailwind,
version: "4.0.9",
elixir_ai: [
@@ -43,17 +34,12 @@ config :tailwind,
cd: Path.expand("../assets", __DIR__)
]
# Configures Elixir's Logger
config :logger, :console,
format: "$time $metadata[$level] $message\n",
metadata: [:request_id]
# Use Jason for JSON parsing in Phoenix
config :phoenix, :json_library, Jason
# Lower the BEAM node-down detection window from the default 60s.
# Nodes send ticks every (net_ticktime / 4)s; a node is declared down
# after 4 missed ticks (net_ticktime total). 2s means detection in ≤2s.
if System.get_env("RELEASE_MODE") do
config :kernel, net_ticktime: 2
end
@@ -67,6 +53,4 @@ config :libcluster,
]
]
# Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above.
import_config "#{config_env()}.exs"

View File

@@ -4,9 +4,9 @@ import Dotenvy
source!([".env", System.get_env()])
config :elixir_ai,
ai_endpoint: env!("AI_RESPONSES_ENDPOINT", :string!),
ai_token: env!("AI_TOKEN", :string!),
ai_model: env!("AI_MODEL", :string!)
ai_endpoint: System.get_env("AI_RESPONSES_ENDPOINT"),
ai_token: System.get_env("AI_TOKEN"),
ai_model: System.get_env("AI_MODEL")
# config/runtime.exs is executed for all environments, including
# during releases. It is executed after compilation and before the
@@ -72,7 +72,7 @@ if config_env() == :prod do
]
end
host = System.get_env("PHX_HOST") || "example.com"
host = System.get_env("PHX_HOST") || raise "environment variable PHX_HOST is missing."
port = String.to_integer(System.get_env("PORT") || "4000")
config :elixir_ai, :dns_cluster_query, System.get_env("DNS_CLUSTER_QUERY")
@@ -88,36 +88,4 @@ if config_env() == :prod do
port: port
],
secret_key_base: secret_key_base
# ## SSL Support
#
# To get SSL working, you will need to add the `https` key
# to your endpoint configuration:
#
# config :elixir_ai, ElixirAiWeb.Endpoint,
# https: [
# ...,
# port: 443,
# cipher_suite: :strong,
# keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"),
# certfile: System.get_env("SOME_APP_SSL_CERT_PATH")
# ]
#
# The `cipher_suite` is set to `:strong` to support only the
# latest and more secure SSL ciphers. This means old browsers
# and clients may not be supported. You can set it to
# `:compatible` for wider support.
#
# `:keyfile` and `:certfile` expect an absolute path to the key
# and cert in disk or a relative path inside priv, for example
# "priv/ssl/server.key". For all supported SSL configuration
# options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1
#
# We also recommend setting `force_ssl` in your config/prod.exs,
# ensuring no data is ever sent via http, always redirecting to https:
#
# config :elixir_ai, ElixirAiWeb.Endpoint,
# force_ssl: [hsts: true]
#
# Check `Plug.SSL` for all available options in `force_ssl`.
end

View File

@@ -31,6 +31,7 @@ services:
RELEASE_NODE: elixir_ai@node1
RELEASE_COOKIE: secret_cluster_cookie
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
PROVIDERS_CONFIG_PATH: /app/providers.yml
user: root
command: |
sh -c '
@@ -41,6 +42,7 @@ services:
volumes:
- .:/app
- /app/_build
- ./providers.yml:/app/providers.yml:ro
ports:
- "4001:4000"
depends_on:
@@ -68,6 +70,7 @@ services:
RELEASE_NODE: elixir_ai@node2
RELEASE_COOKIE: secret_cluster_cookie
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
PROVIDERS_CONFIG_PATH: /app/providers.yml
user: root
command: |
sh -c '
@@ -78,6 +81,7 @@ services:
volumes:
- .:/app
- /app/_build
- ./providers.yml:/app/providers.yml:ro
ports:
- "4002:4000"
depends_on:

9
example.providers.yml Normal file
View File

@@ -0,0 +1,9 @@
providers:
- name: provider name
model: gpt-oss-20b
responses_endpoint: http://example.com/api/responses
api_key: your_api_key_here
- name: provider name 2
model: gpt-oss-20b
responses_endpoint: http://example.com/api/responses
api_key: your_api_key_here

View File

@@ -4,7 +4,7 @@ metadata:
name: ai-ha-elixir
namespace: ai-ha-elixir
spec:
serviceName: ai-ha-elixir-headless
serviceName: ai-ha-elixir-headless # replica1.ai-ha-elixir-headless.svc.cluster.local
replicas: 2
podManagementPolicy: Parallel
updateStrategy:

View File

@@ -27,9 +27,16 @@ defmodule ElixirAi.ChatUtils do
}
run_function = fn current_message_id, tool_call_id, args ->
Task.start(fn ->
result = function.(args)
send(server, {:tool_response, current_message_id, tool_call_id, result})
Task.start_link(fn ->
try do
result = function.(args)
send(server, {:tool_response, current_message_id, tool_call_id, result})
rescue
e ->
reason = Exception.format(:error, e, __STACKTRACE__)
Logger.error("Tool task crashed: #{reason}")
send(server, {:tool_response, current_message_id, tool_call_id, {:error, reason}})
end
end)
end
@@ -41,7 +48,7 @@ defmodule ElixirAi.ChatUtils do
end
def request_ai_response(server, messages, tools, provider) do
Task.start(fn ->
Task.start_link(fn ->
api_url = provider.completions_url
api_key = provider.api_token
model = provider.model_name
@@ -82,7 +89,8 @@ defmodule ElixirAi.ChatUtils do
:ok
{:error, reason} ->
IO.warn("AI request failed: #{inspect(reason)} for #{api_url}")
Logger.warning("AI request failed: #{inspect(reason)} for #{api_url}")
send(server, {:ai_request_error, reason})
end
end)
end

View File

@@ -12,6 +12,7 @@ defmodule ElixirAi.Application do
{Cluster.Supervisor,
[Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
{Phoenix.PubSub, name: ElixirAi.PubSub},
{ElixirAi.LiveViewPG, []},
ElixirAi.ToolTesting,
ElixirAiWeb.Endpoint,
{Horde.Registry,
@@ -55,7 +56,7 @@ defmodule ElixirAi.Application do
if Application.get_env(:elixir_ai, :env) == :test do
Supervisor.child_spec({Task, fn -> :ok end}, id: :skip_default_provider)
else
{Task, fn -> ElixirAi.AiProvider.ensure_default_provider() end}
{Task, fn -> ElixirAi.AiProvider.ensure_configured_providers() end}
end
end

View File

@@ -49,6 +49,7 @@ defmodule ElixirAi.ChatRunner do
"Last message role was #{last_message.role}, requesting AI response for conversation #{name}"
)
broadcast_ui(name, :recovery_restart)
ElixirAi.ChatUtils.request_ai_response(self(), messages, tools(self(), name), provider)
end

View File

@@ -3,17 +3,33 @@ defmodule ElixirAi.ClusterSingleton do
require Logger
@sync_delay_ms 200
@retry_delay_ms 500
@singletons [ElixirAi.ConversationManager]
def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__)
def status, do: GenServer.call(__MODULE__, :status)
def configured_singletons, do: @singletons
def init(_opts) do
Process.send_after(self(), :start_singletons, @sync_delay_ms)
{:ok, :pending}
end
def handle_info(:start_singletons, _state) do
# Bootstraps the configured singletons once at least one peer node is
# visible. While `Node.list/0` is empty the start is deferred and retried
# every @retry_delay_ms — presumably so singletons are only started after
# the node can see the cluster (TODO confirm intent). On success the
# GenServer state transitions to :started (reported via `status/0`).
def handle_info(:start_singletons, state) do
  if Node.list() == [] do
    Logger.debug("ClusterSingleton: no peer nodes yet, retrying in #{@retry_delay_ms}ms")
    Process.send_after(self(), :start_singletons, @retry_delay_ms)
    {:noreply, state}
  else
    start_singletons()
    {:noreply, :started}
  end
end
defp start_singletons do
for module <- @singletons do
if singleton_exists?(module) do
Logger.debug(
@@ -37,10 +53,10 @@ defmodule ElixirAi.ClusterSingleton do
end
end
end
{:noreply, :started}
end
def handle_call(:status, _from, state), do: {:reply, state, state}
defp singleton_exists?(module) do
case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
[{pid, _metadata} | _] when is_pid(pid) ->

View File

@@ -21,7 +21,7 @@ defmodule ElixirAi.ConversationManager do
def init(_) do
Logger.info("ConversationManager initializing...")
send(self(), :load_conversations)
{:ok, %{conversations: :loading, subscriptions: MapSet.new()}}
{:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}}
end
def create_conversation(name, ai_provider_id) do
@@ -40,6 +40,10 @@ defmodule ElixirAi.ConversationManager do
GenServer.call(@name, {:get_messages, name})
end
# Returns the ChatRunner processes currently tracked by the manager,
# as a map (see the :list_runners handle_call clause).
def list_runners do
  GenServer.call(@name, :list_runners)
end
def handle_call(message, from, %{conversations: :loading} = state) do
Logger.warning(
"Received call #{inspect(message)} from #{inspect(from)} while loading conversations. Retrying after delay."
@@ -75,7 +79,7 @@ defmodule ElixirAi.ConversationManager do
%{conversations: conversations} = state
) do
if Map.has_key?(conversations, name) do
reply_with_started(name, state)
reply_with_conversation(name, state)
else
{:reply, {:error, :not_found}, state}
end
@@ -84,10 +88,6 @@ defmodule ElixirAi.ConversationManager do
def handle_call(:list, _from, %{conversations: conversations} = state) do
keys = Map.keys(conversations)
Logger.debug(
"list_conversations returning: #{inspect(keys, limit: :infinity, printable_limit: :infinity, binaries: :as_binaries)}"
)
{:reply, keys, state}
end
@@ -95,6 +95,19 @@ defmodule ElixirAi.ConversationManager do
{:reply, Map.get(conversations, name, []), state}
end
# Replies with the tracked runner map (%{name => %{pid: pid, node: node}},
# populated in start_and_subscribe/2). Defaults to %{} for states created
# before the :runners key existed.
def handle_call(:list_runners, _from, state) do
  {:reply, Map.get(state, :runners, %{}), state}
end
# Drops a dead ChatRunner from the tracked `runners` map. The monitor that
# produces this :DOWN message is set up when the runner is first tracked
# (Process.monitor/1 in start_and_subscribe/2).
def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do
  # Map.reject/2 replaces the previous Enum.reject/2 |> Map.new/1 round-trip.
  runners = Map.reject(runners, fn {_name, info} -> info.pid == pid end)

  Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})")
  {:noreply, %{state | runners: runners}}
end
def handle_info({:db_error, reason}, state) do
Logger.error("ConversationManager received db_error: #{inspect(reason)}")
{:noreply, state}
@@ -138,10 +151,13 @@ defmodule ElixirAi.ConversationManager do
end
end
# Returns {:ok, pid} to callers that only need to know the process started (e.g. create).
defp reply_with_started(name, state, update_state \\ fn s -> s end) do
case start_and_subscribe(name, state.subscriptions) do
{:ok, pid, new_subscriptions} ->
new_state = update_state.(%{state | subscriptions: new_subscriptions})
case start_and_subscribe(name, state) do
{:ok, pid, new_subscriptions, new_runners} ->
new_state =
update_state.(%{state | subscriptions: new_subscriptions, runners: new_runners})
{:reply, {:ok, pid}, new_state}
{:error, _reason} = error ->
@@ -149,7 +165,21 @@ defmodule ElixirAi.ConversationManager do
end
end
defp start_and_subscribe(name, subscriptions) do
# Returns the full conversation state using the pid directly, bypassing the
# Horde registry (which may not have synced yet on the calling node).
#
# NOTE(review): GenServer.call(pid, :get_conversation) runs inside this
# manager's own handle_call, so a slow or stuck runner blocks the manager
# for the call's duration — confirm the runner always replies promptly.
defp reply_with_conversation(name, state) do
  case start_and_subscribe(name, state) do
    {:ok, pid, new_subscriptions, new_runners} ->
      new_state = %{state | subscriptions: new_subscriptions, runners: new_runners}
      conversation = GenServer.call(pid, :get_conversation)
      {:reply, {:ok, conversation}, new_state}

    {:error, _reason} = error ->
      {:reply, error, state}
  end
end
defp start_and_subscribe(name, state) do
result =
case Horde.DynamicSupervisor.start_child(
ElixirAi.ChatRunnerSupervisor,
@@ -163,14 +193,24 @@ defmodule ElixirAi.ConversationManager do
case result do
{:ok, pid} ->
new_subscriptions =
if MapSet.member?(subscriptions, name) do
subscriptions
if MapSet.member?(state.subscriptions, name) do
state.subscriptions
else
Phoenix.PubSub.subscribe(ElixirAi.PubSub, conversation_message_topic(name))
MapSet.put(subscriptions, name)
MapSet.put(state.subscriptions, name)
end
{:ok, pid, new_subscriptions}
existing_runners = Map.get(state, :runners, %{})
new_runners =
if Map.has_key?(existing_runners, name) do
existing_runners
else
Process.monitor(pid)
Map.put(existing_runners, name, %{pid: pid, node: node(pid)})
end
{:ok, pid, new_subscriptions, new_runners}
error ->
error

View File

@@ -138,28 +138,80 @@ defmodule ElixirAi.AiProvider do
end
def ensure_default_provider do
sql = "SELECT COUNT(*) FROM ai_providers"
params = %{}
endpoint = Application.get_env(:elixir_ai, :ai_endpoint)
token = Application.get_env(:elixir_ai, :ai_token)
model = Application.get_env(:elixir_ai, :ai_model)
case DbHelpers.run_sql(sql, params, providers_topic()) do
{:error, :db_error} ->
{:error, :db_error}
if endpoint && token && model do
case find_by_name("default") do
{:error, :not_found} ->
attrs = %{
name: "default",
model_name: model,
api_token: token,
completions_url: endpoint
}
rows ->
case rows do
[%{"count" => 0}] ->
attrs = %{
name: "default",
model_name: Application.fetch_env!(:elixir_ai, :ai_model),
api_token: Application.fetch_env!(:elixir_ai, :ai_token),
completions_url: Application.fetch_env!(:elixir_ai, :ai_endpoint)
}
create(attrs)
create(attrs)
{:ok, _} ->
:ok
_ ->
:ok
{:error, reason} ->
{:error, reason}
end
else
Logger.info("AI env vars not configured, skipping default provider creation")
:ok
end
end
@doc """
Loads provider definitions from the YAML file named by the
PROVIDERS_CONFIG_PATH environment variable and creates any that are
missing. When the variable is unset this is a no-op; read or shape
problems are logged as warnings rather than raised.
"""
def ensure_providers_from_file do
  path = System.get_env("PROVIDERS_CONFIG_PATH")

  if path do
    path
    |> YamlElixir.read_from_file()
    |> case do
      {:ok, %{"providers" => providers}} when is_list(providers) ->
        Enum.each(providers, &ensure_provider_from_yaml/1)

      {:ok, _} ->
        Logger.warning("providers.yml: expected a top-level 'providers' list, skipping")

      {:error, reason} ->
        Logger.warning("Could not read providers config from #{path}: #{inspect(reason)}")
    end
  else
    :ok
  end
end
# Ensures every configured provider exists: first the env-var-based
# default provider, then any providers declared in the YAML file pointed
# to by PROVIDERS_CONFIG_PATH.
def ensure_configured_providers do
  ensure_default_provider()
  ensure_providers_from_file()
end
# Creates one provider from a YAML map entry unless a provider with the
# same name already exists. Entries missing any required key fall through
# to the catch-all clause below.
defp ensure_provider_from_yaml(%{
       "name" => name,
       "model" => model,
       "responses_endpoint" => endpoint,
       "api_key" => api_key
     }) do
  case find_by_name(name) do
    {:error, :not_found} ->
      Logger.info("Creating provider '#{name}' from providers config file")
      create(%{name: name, model_name: model, api_token: api_key, completions_url: endpoint})

    {:ok, _} ->
      Logger.debug("Provider '#{name}' already exists, skipping")

    {:error, reason} ->
      Logger.warning("Could not check existence of provider '#{name}': #{inspect(reason)}")
  end
end

# Malformed entries are logged and skipped so one bad entry cannot break
# application startup.
defp ensure_provider_from_yaml(entry) do
  Logger.warning(
    "Skipping invalid provider entry in providers config file (must have name, model, responses_endpoint, api_key): #{inspect(entry)}"
  )
end
end

View File

@@ -0,0 +1,16 @@
defmodule ElixirAi.LiveViewPG do
  @moduledoc """
  Named :pg scope used to track LiveView processes across the cluster.

  A LiveView joins the group {:liveview, ViewModule} when it connects;
  :pg replicates group membership between nodes and drops dead members
  on its own, so no explicit cleanup is required.
  """

  @doc """
  Supervisor child spec that starts a :pg scope named after this module
  as a permanent worker.
  """
  def child_spec(_opts) do
    %{
      restart: :permanent,
      type: :worker,
      start: {:pg, :start_link, [__MODULE__]},
      id: __MODULE__
    }
  end
end

View File

@@ -0,0 +1,245 @@
defmodule ElixirAiWeb.AdminLive do
  @moduledoc """
  Cluster admin dashboard.

  Shows, per node: the ElixirAi.ClusterSingleton status, which singleton
  processes run where, the ChatRunner processes, and connected LiveView
  processes. Refreshes every second and on node up/down events.
  """
  use ElixirAiWeb, :live_view
  require Logger

  # Poll interval for the periodic dashboard refresh, in milliseconds.
  @refresh_ms 1_000

  def mount(_params, _session, socket) do
    if connected?(socket) do
      # Deliver {:nodeup, node}/{:nodedown, node} messages so the view can
      # re-render immediately on cluster membership changes.
      :net_kernel.monitor_nodes(true)
      # Register this LiveView in the cluster-wide :pg scope (see
      # ElixirAi.LiveViewPG) so it shows up in its own listing.
      :pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
      schedule_refresh()
    end

    {:ok, assign(socket, cluster_info: gather_info())}
  end

  def handle_info({:nodeup, _node}, socket) do
    {:noreply, assign(socket, cluster_info: gather_info())}
  end

  def handle_info({:nodedown, _node}, socket) do
    {:noreply, assign(socket, cluster_info: gather_info())}
  end

  def handle_info(:refresh, socket) do
    schedule_refresh()
    {:noreply, assign(socket, cluster_info: gather_info())}
  end

  defp schedule_refresh, do: Process.send_after(self(), :refresh, @refresh_ms)

  # Gathers the full cluster snapshot rendered by the dashboard: per-node
  # ClusterSingleton status, singleton locations, ChatRunner processes and
  # LiveView processes. (The previously unused `import ElixirAi.PubsubTopics`
  # was removed — nothing in this function referenced it.)
  defp gather_info do
    all_nodes = [Node.self() | Node.list()]
    configured = ElixirAi.ClusterSingleton.configured_singletons()

    node_statuses =
      Enum.map(all_nodes, fn node ->
        status =
          if node == Node.self() do
            # Local call; catch covers the GenServer not being started yet.
            try do
              ElixirAi.ClusterSingleton.status()
            catch
              _, _ -> :unreachable
            end
          else
            case :rpc.call(node, ElixirAi.ClusterSingleton, :status, [], 3_000) do
              {:badrpc, _} -> :unreachable
              result -> result
            end
          end

        {node, status}
      end)

    singleton_locations =
      Enum.map(configured, fn module ->
        location =
          case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
            # Head match (consistent with singleton_exists?/1 elsewhere):
            # robust to extra registry entries during handoff.
            [{pid, _} | _] -> node(pid)
            _ -> nil
          end

        {module, location}
      end)

    # All ChatRunner entries in the distributed registry, keyed by conversation name.
    # Each entry is a {name, node, pid} tuple.
    chat_runners =
      Horde.DynamicSupervisor.which_children(ElixirAi.ChatRunnerSupervisor)
      |> Enum.flat_map(fn
        {_, pid, _, _} when is_pid(pid) ->
          case Horde.Registry.select(ElixirAi.ChatRegistry, [
                 {{:"$1", pid, :"$2"}, [], [{{:"$1", pid, :"$2"}}]}
               ]) do
            [{name, ^pid, _}] when is_binary(name) -> [{name, node(pid), pid}]
            _ -> []
          end

        _ ->
          []
      end)
      |> Enum.sort_by(&elem(&1, 0))

    # :pg is cluster-wide — one local call returns members from all nodes.
    # Processes are automatically removed from their group when they die.
    liveviews =
      :pg.which_groups(ElixirAi.LiveViewPG)
      |> Enum.flat_map(fn
        {:liveview, view} ->
          :pg.get_members(ElixirAi.LiveViewPG, {:liveview, view})
          |> Enum.map(fn pid -> {view, node(pid)} end)

        _ ->
          []
      end)

    %{
      nodes: node_statuses,
      configured_singletons: configured,
      singleton_locations: singleton_locations,
      chat_runners: chat_runners,
      liveviews: liveviews
    }
  end

  def render(assigns) do
    ~H"""
    <div class="p-6 space-y-4">
      <h1 class="text-lg font-semibold text-cyan-200 tracking-wide">Cluster Admin</h1>
      <div class="grid gap-4 grid-cols-1 lg:grid-cols-2 xl:grid-cols-3">
        <%= for {node, status} <- @cluster_info.nodes do %>
          <% node_singletons =
            Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> loc == node end) %>
          <% node_runners =
            Enum.filter(@cluster_info.chat_runners, fn {_, rnode, _} -> rnode == node end) %>
          <% node_liveviews =
            @cluster_info.liveviews
            |> Enum.filter(fn {_, n} -> n == node end)
            |> Enum.group_by(fn {view, _} -> view end) %>
          <div class="rounded-lg border border-cyan-800/50 bg-cyan-950/30 overflow-hidden">
            <div class="flex items-center justify-between px-4 py-3 bg-cyan-900/40 border-b border-cyan-800/50">
              <div class="flex items-center gap-2">
                <span class="font-mono text-sm font-semibold text-cyan-200">{node}</span>
                <%= if node == Node.self() do %>
                  <span class="text-xs bg-cyan-800/50 text-cyan-400 px-1.5 py-0.5 rounded">self</span>
                <% end %>
              </div>
              <.status_badge status={status} />
            </div>
            <div class="p-4 space-y-4">
              <%= if node_singletons != [] do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    Singletons
                  </p>
                  <div class="space-y-1">
                    <%= for {module, _} <- node_singletons do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 font-mono text-xs text-cyan-300">
                        {inspect(module)}
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_runners != [] do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    Chat Runners
                    <span class="normal-case font-normal text-cyan-700 ml-1">
                      {length(node_runners)}
                    </span>
                  </p>
                  <div class="space-y-1">
                    <%= for {name, _, _} <- node_runners do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 font-mono text-xs text-cyan-200">
                        {name}
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_liveviews != %{} do %>
                <div>
                  <p class="text-xs font-semibold uppercase tracking-widest text-cyan-600 mb-1.5">
                    LiveViews
                  </p>
                  <div class="space-y-1">
                    <%= for {view, instances} <- node_liveviews do %>
                      <div class="px-2 py-1.5 rounded bg-cyan-900/30 flex justify-between items-center gap-2">
                        <span class="font-mono text-xs text-cyan-200">{short_module(view)}</span>
                        <span class="text-xs text-cyan-600">×{length(instances)}</span>
                      </div>
                    <% end %>
                  </div>
                </div>
              <% end %>
              <%= if node_singletons == [] and node_runners == [] and node_liveviews == %{} do %>
                <p class="text-xs text-cyan-700 italic">No active processes</p>
              <% end %>
            </div>
          </div>
        <% end %>
      </div>
      <% unlocated =
        Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> is_nil(loc) end) %>
      <%= if unlocated != [] do %>
        <section>
          <h2 class="text-xs font-semibold uppercase tracking-widest text-red-500 mb-2">
            Singletons Not Running
          </h2>
          <div class="flex flex-wrap gap-2">
            <%= for {module, _} <- unlocated do %>
              <span class="px-2 py-1 rounded bg-red-900/20 border border-red-800/40 font-mono text-xs text-red-400">
                {inspect(module)}
              </span>
            <% end %>
          </div>
        </section>
      <% end %>
      <p class="text-xs text-cyan-800">Refreshes every 1s or on node events.</p>
    </div>
    """
  end

  # "ElixirAiWeb.ChatLive" -> "ChatLive": keeps the LiveView list compact by
  # showing only the last segment of the module name.
  defp short_module(module) when is_atom(module) do
    module
    |> Atom.to_string()
    |> String.replace_prefix("Elixir.", "")
    |> String.split(".")
    |> List.last()
  end

  # Colored pill for a node's ClusterSingleton status
  # (:started / :pending / :unreachable / anything else).
  defp status_badge(assigns) do
    ~H"""
    <%= case @status do %>
      <% :started -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-green-900 text-green-300">
          started
        </span>
      <% :pending -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-yellow-900 text-yellow-300">
          pending
        </span>
      <% :unreachable -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-red-900 text-red-300">
          unreachable
        </span>
      <% other -> %>
        <span class="inline-block px-2 py-0.5 rounded text-xs font-semibold bg-cyan-900 text-cyan-300">
          {inspect(other)}
        </span>
    <% end %>
    """
  end
end

View File

@@ -10,13 +10,12 @@ defmodule ElixirAiWeb.ChatLive do
def mount(%{"name" => name}, _session, socket) do
case ConversationManager.open_conversation(name) do
{:ok, _pid} ->
{:ok, conversation} ->
if connected?(socket) do
Phoenix.PubSub.subscribe(ElixirAi.PubSub, chat_topic(name))
:pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
end
conversation = ChatRunner.get_conversation(name)
{:ok,
socket
|> assign(conversation_name: name)
@@ -25,7 +24,8 @@ defmodule ElixirAiWeb.ChatLive do
|> assign(streaming_response: conversation.streaming_response)
|> assign(background_color: "bg-cyan-950/30")
|> assign(provider: conversation.provider)
|> assign(db_error: nil)}
|> assign(db_error: nil)
|> assign(ai_error: nil)}
{:error, :not_found} ->
{:ok, push_navigate(socket, to: "/")}
@@ -41,7 +41,8 @@ defmodule ElixirAiWeb.ChatLive do
|> assign(streaming_response: nil)
|> assign(background_color: "bg-cyan-950/30")
|> assign(provider: nil)
|> assign(db_error: Exception.format(:error, reason))}
|> assign(db_error: Exception.format(:error, reason))
|> assign(ai_error: nil)}
end
end
@@ -60,6 +61,11 @@ defmodule ElixirAiWeb.ChatLive do
Database error: {@db_error}
</div>
<% end %>
<%= if @ai_error do %>
<div class="mx-4 mt-2 px-3 py-2 rounded text-sm text-red-400 bg-red-950/40" role="alert">
AI error: {@ai_error}
</div>
<% end %>
<div
id="chat-messages"
phx-hook="ScrollBottom"
@@ -118,6 +124,10 @@ defmodule ElixirAiWeb.ChatLive do
{:noreply, assign(socket, user_input: "")}
end
def handle_info(:recovery_restart, socket) do
{:noreply, assign(socket, streaming_response: nil, ai_error: nil)}
end
def handle_info({:user_chat_message, message}, socket) do
{:noreply,
socket
@@ -210,6 +220,16 @@ defmodule ElixirAiWeb.ChatLive do
{:noreply, assign(socket, db_error: reason)}
end
def handle_info({:ai_request_error, reason}, socket) do
error_message =
case reason do
%{__struct__: mod, reason: r} -> "#{inspect(mod)}: #{inspect(r)}"
_ -> inspect(reason)
end
{:noreply, assign(socket, ai_error: error_message, streaming_response: nil)}
end
def handle_info({:set_background_color, color}, socket) do
Logger.info("setting background color to #{color}")
{:noreply, assign(socket, background_color: color)}

View File

@@ -8,6 +8,7 @@ defmodule ElixirAiWeb.HomeLive do
def mount(_params, _session, socket) do
if connected?(socket) do
Phoenix.PubSub.subscribe(ElixirAi.PubSub, providers_topic())
:pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self())
send(self(), :load_data)
end

View File

@@ -19,6 +19,7 @@ defmodule ElixirAiWeb.Router do
live "/", HomeLive
live "/chat/:name", ChatLive
live "/admin", AdminLive
end
# Other scopes may use custom stacks.

View File

@@ -5,7 +5,7 @@ defmodule ElixirAi.MixProject do
[
app: :elixir_ai,
version: "0.1.0",
elixir: "~> 1.18",
elixir: "~> 1.19",
elixirc_paths: elixirc_paths(Mix.env()),
start_permanent: Mix.env() == :prod,
aliases: aliases(),
@@ -61,7 +61,8 @@ defmodule ElixirAi.MixProject do
{:horde, "~> 0.9"},
{:credo, "~> 1.7", only: [:dev, :test], runtime: false},
{:mimic, "~> 2.3.0"},
{:zoi, "~> 0.17"}
{:zoi, "~> 0.17"},
{:yaml_elixir, "~> 2.12"}
]
end

View File

@@ -68,5 +68,7 @@
"unicode_util_compat": {:hex, :unicode_util_compat, "0.7.1", "a48703a25c170eedadca83b11e88985af08d35f37c6f664d6dcfb106a97782fc", [:rebar3], [], "hexpm", "b3a917854ce3ae233619744ad1e0102e05673136776fb2fa76234f3e03b23642"},
"websock": {:hex, :websock, "0.5.3", "2f69a6ebe810328555b6fe5c831a851f485e303a7c8ce6c5f675abeb20ebdadc", [:mix], [], "hexpm", "6105453d7fac22c712ad66fab1d45abdf049868f253cf719b625151460b8b453"},
"websock_adapter": {:hex, :websock_adapter, "0.5.9", "43dc3ba6d89ef5dec5b1d0a39698436a1e856d000d84bf31a3149862b01a287f", [:mix], [{:bandit, ">= 0.6.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.6", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:websock, "~> 0.5", [hex: :websock, repo: "hexpm", optional: false]}], "hexpm", "5534d5c9adad3c18a0f58a9371220d75a803bf0b9a3d87e6fe072faaeed76a08"},
"yamerl": {:hex, :yamerl, "0.10.0", "4ff81fee2f1f6a46f1700c0d880b24d193ddb74bd14ef42cb0bcf46e81ef2f8e", [:rebar3], [], "hexpm", "346adb2963f1051dc837a2364e4acf6eb7d80097c0f53cbdc3046ec8ec4b4e6e"},
"yaml_elixir": {:hex, :yaml_elixir, "2.12.1", "d74f2d82294651b58dac849c45a82aaea639766797359baff834b64439f6b3f4", [:mix], [{:yamerl, "~> 0.10", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", "d9ac16563c737d55f9bfeed7627489156b91268a3a21cd55c54eb2e335207fed"},
"zoi": {:hex, :zoi, "0.17.1", "406aa87bb4181f41dee64336b75434367b7d3e88db813b0e6db0ae2d0f81f743", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.1", [hex: :phoenix_html, repo: "hexpm", optional: true]}], "hexpm", "3a11bf3bc9189f988ac74e81b5d7ca0c689b2a20eed220746a7043aa528e2aab"},
}

View File

@@ -3,10 +3,8 @@ defmodule ElixirAiWeb.ChatLiveTest do
import ElixirAi.PubsubTopics, only: [chat_topic: 1]
setup do
stub(ElixirAi.ConversationManager, :open_conversation, fn _name -> {:ok, self()} end)
stub(ElixirAi.ChatRunner, :get_conversation, fn _name ->
%{messages: [], streaming_response: nil}
stub(ElixirAi.ConversationManager, :open_conversation, fn _name ->
{:ok, %{messages: [], streaming_response: nil, provider: nil}}
end)
:ok

435
tool-calling-outline.md Normal file
View File

@@ -0,0 +1,435 @@
Below is a **structured design document** intended for another LLM (or engineer) to implement a **persistent BEAM-backed CLI execution system inside a long-lived Docker container using `docker exec`**.
---
# Design Document: Persistent BEAM Tool Runner in Docker
## 1. Objective
Build a system where:
* A **single long-lived Docker container** hosts:
* a **persistent Elixir (BEAM) daemon**
* a standard **bash execution environment**
* All commands are executed via:
```bash
docker exec <container> bash -lc "<command>"
```
* Common tools (`cat`, `grep`, etc.) are **intercepted via PATH shims**
* Shims delegate execution to the **persistent BEAM daemon**
* The daemon:
* executes real system commands
* truncates output deterministically
* returns `{stdout, stderr, exit_code}`
---
## 2. Non-Goals
* No AI/model integration
* No streaming output (batch only)
* No advanced sandboxing (seccomp/cgroups optional later)
* No distributed execution
---
## 3. System Overview
```text
Host
└─ docker exec
└─ Container (long-lived)
├─ bash
│ └─ cat / grep / etc → shim (shell script)
│ └─ Unix socket request
│ └─ BEAM daemon
│ └─ System.cmd("cat", ...)
│ └─ truncate output
│ └─ return response
```
---
## 4. Key Design Decisions
### 4.1 Persistent Container
* Container is started once and reused
* Avoid `docker run` per command
### 4.2 Persistent BEAM Process
* Avoid BEAM startup per command
* Centralize execution + truncation
### 4.3 Bash as Execution Engine
* Do not reimplement shell parsing
* Support pipes, redirects, chaining
### 4.4 PATH Interception
* Replace selected binaries with shims
* Keep system binaries available underneath
---
## 5. Container Specification
### 5.1 Base Image
* `debian:bookworm-slim`
### 5.2 Required Packages
```bash
elixir
erlang
bash
socat
coreutils
grep
```
---
### 5.3 Filesystem Layout
```text
/app
daemon.exs
shims/
cat
grep
```
---
### 5.4 PATH Configuration
```bash
PATH=/app/shims:/usr/bin:/bin
```
---
### 5.5 Container Startup Command
```bash
elixir daemon.exs & exec bash
```
Requirements:
* daemon must start before shell usage
* shell must remain interactive/alive
---
## 6. BEAM Daemon Specification
### 6.1 Transport
* Unix domain socket:
```text
/tmp/tool_runner.sock
```
* Protocol:
* request: single line
* response: Erlang binary (`:erlang.term_to_binary/1`)
---
### 6.2 Request Format (v1)
```text
<command>\t<arg1>\t<arg2>\n
```
Example:
```text
cat\tfile.txt\n
```
---
### 6.3 Response Format
```elixir
{stdout :: binary, stderr :: binary, exit_code :: integer}
```
Encoded via:
```elixir
:erlang.term_to_binary/1
```
---
### 6.4 Execution Logic
For each request:
1. Parse command + args
2. Call:
```elixir
System.cmd(cmd, args, stderr_to_stdout: false)
```
3. Apply truncation (see below)
4. Return encoded response
---
### 6.5 Truncation Rules
Configurable constants:
```elixir
@max_bytes 4000
@max_lines 200
```
Apply in order:
1. truncate by bytes
2. truncate by lines
Append:
```text
...[truncated]
```
---
### 6.6 Concurrency Model
* Accept loop via `:gen_tcp.accept`
* Each client handled in separate lightweight process (`spawn`)
* No shared mutable state required
---
### 6.7 Error Handling
* Unknown command → return exit_code 127
* Exceptions → return exit_code 1 + error message
* Socket failure → ignore safely
---
## 7. Shim Specification
### 7.1 Purpose
* Replace system binaries (`cat`, `grep`)
* Forward calls to daemon
* Reproduce exact CLI behavior:
* stdout
* stderr
* exit code
---
### 7.2 Implementation Language
* Bash (fast startup, no BEAM overhead)
---
### 7.3 Behavior
For command:
```bash
cat file.txt
```
Shim must:
1. Build request string
2. Send to socket via `socat`
3. Receive binary response
4. Decode response
5. Write:
* stdout → STDOUT
* stderr → STDERR
6. Exit with correct code
---
### 7.4 Request Construction (in-memory)
No temp files.
```bash
{
printf "cat"
for arg in "$@"; do
printf "\t%s" "$arg"
done
printf "\n"
} | socat - UNIX-CONNECT:/tmp/tool_runner.sock
```
---
### 7.5 Response Decoding
Temporary approach:
```bash
elixir -e '
{out, err, code} = :erlang.binary_to_term(IO.read(:stdio, :all))
IO.write(out)
if err != "", do: IO.write(:stderr, err)
System.halt(code)
'
```
---
### 7.6 Known Limitation
* Arguments containing tabs/newlines will break protocol
* Acceptable for v1
* Future: switch to JSON protocol
---
## 8. Execution Flow Example
```bash
docker exec container bash -lc "cat file.txt | grep foo"
```
Inside container:
1. `cat` → shim
2. shim → daemon → real `cat`
3. truncated output returned
4. piped to `grep`
5. `grep` → shim → daemon → real `grep`
---
## 9. Performance Expectations
| Component     | Latency   |
| ------------- | --------- |
| docker exec   | 10–40 ms  |
| shim + socket | 1–5 ms    |
| System.cmd    | 1–5 ms    |
| total         | ~15–50 ms |
---
## 10. Security Considerations
Minimal (v1):
* No command filtering
* Full shell access inside container
Future:
* allowlist commands
* resource limits
* seccomp profile
---
## 11. Extensibility
### 11.1 Add new tools
* create shim in `/app/shims`
* no daemon change required
---
### 11.2 Central policies
Implement in daemon:
* timeouts
* logging
* output shaping
* auditing
---
### 11.3 Protocol upgrade path
Replace tab protocol with:
```json
{ "cmd": "...", "args": [...] }
```
---
## 12. Failure Modes
| Failure | Behavior |
| ------------------ | ----------------------------- |
| daemon not running | shim fails (connection error) |
| socket missing | immediate error |
| malformed response | decode failure |
| command not found | exit 127 |
---
## 13. Implementation Checklist
* [ ] Dockerfile builds successfully
* [ ] daemon starts on container launch
* [ ] socket created at `/tmp/tool_runner.sock`
* [ ] shim intercepts commands via PATH
* [ ] shim communicates with daemon
* [ ] stdout/stderr preserved
* [ ] exit codes preserved
* [ ] truncation enforced
---
## 14. Minimal Acceptance Test
```bash
docker exec container bash -lc "echo hello"
docker exec container bash -lc "cat /etc/passwd | grep root"
docker exec container bash -lc "cat large_file.txt"
```
Verify:
* correct output
* truncated when large
* no noticeable delay beyond ~50ms
---
## 15. Summary
This system:
* avoids BEAM startup overhead
* preserves Unix execution semantics
* centralizes control in Elixir
* remains simple and composable
It matches the intended pattern:
> “Use the real environment, intercept selectively, and control outputs centrally.”