better process tracking for admin dashboard

2026-03-20 12:07:16 -06:00
parent b2f53942a2
commit 6138d71d29
21 changed files with 910 additions and 106 deletions
--- a/lib/elixir_ai/ai_utils/chat_utils.ex
+++ b/lib/elixir_ai/ai_utils/chat_utils.ex
@@ -27,9 +27,16 @@ defmodule ElixirAi.ChatUtils do
    }

    run_function = fn current_message_id, tool_call_id, args ->
-      Task.start(fn ->
-        result = function.(args)
-        send(server, {:tool_response, current_message_id, tool_call_id, result})
+      Task.start_link(fn ->
+        try do
+          result = function.(args)
+          send(server, {:tool_response, current_message_id, tool_call_id, result})
+        rescue
+          e ->
+            reason = Exception.format(:error, e, __STACKTRACE__)
+            Logger.error("Tool task crashed: #{reason}")
+            send(server, {:tool_response, current_message_id, tool_call_id, {:error, reason}})
+        end
      end)
    end

@@ -41,7 +48,7 @@ defmodule ElixirAi.ChatUtils do
  end

  def request_ai_response(server, messages, tools, provider) do
-    Task.start(fn ->
+    Task.start_link(fn ->
      api_url = provider.completions_url
      api_key = provider.api_token
      model = provider.model_name
@@ -82,7 +89,8 @@ defmodule ElixirAi.ChatUtils do
          :ok

        {:error, reason} ->
-          IO.warn("AI request failed: #{inspect(reason)} for #{api_url}")
+          Logger.warning("AI request failed: #{inspect(reason)} for #{api_url}")
+          send(server, {:ai_request_error, reason})
      end
    end)
  end
--- a/lib/elixir_ai/application.ex
+++ b/lib/elixir_ai/application.ex
@@ -12,6 +12,7 @@ defmodule ElixirAi.Application do
      {Cluster.Supervisor,
       [Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
      {Phoenix.PubSub, name: ElixirAi.PubSub},
+      {ElixirAi.LiveViewPG, []},
      ElixirAi.ToolTesting,
      ElixirAiWeb.Endpoint,
      {Horde.Registry,
@@ -55,7 +56,7 @@ defmodule ElixirAi.Application do
    if Application.get_env(:elixir_ai, :env) == :test do
      Supervisor.child_spec({Task, fn -> :ok end}, id: :skip_default_provider)
    else
-      {Task, fn -> ElixirAi.AiProvider.ensure_default_provider() end}
+      {Task, fn -> ElixirAi.AiProvider.ensure_configured_providers() end}
    end
  end

--- a/lib/elixir_ai/chat_runner.ex
+++ b/lib/elixir_ai/chat_runner.ex
@@ -49,6 +49,7 @@ defmodule ElixirAi.ChatRunner do
        "Last message role was #{last_message.role}, requesting AI response for conversation #{name}"
      )

+      broadcast_ui(name, :recovery_restart)
      ElixirAi.ChatUtils.request_ai_response(self(), messages, tools(self(), name), provider)
    end

--- a/lib/elixir_ai/cluster_singleton.ex
+++ b/lib/elixir_ai/cluster_singleton.ex
@@ -3,17 +3,33 @@ defmodule ElixirAi.ClusterSingleton do
  require Logger

  @sync_delay_ms 200
+  @retry_delay_ms 500

  @singletons [ElixirAi.ConversationManager]

  def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__)

+  def status, do: GenServer.call(__MODULE__, :status)
+
+  def configured_singletons, do: @singletons
+
  def init(_opts) do
    Process.send_after(self(), :start_singletons, @sync_delay_ms)
    {:ok, :pending}
  end

-  def handle_info(:start_singletons, _state) do
+  def handle_info(:start_singletons, state) do
+    if Node.list() == [] do
+      Logger.debug("ClusterSingleton: no peer nodes yet, retrying in #{@retry_delay_ms}ms")
+      Process.send_after(self(), :start_singletons, @retry_delay_ms)
+      {:noreply, state}
+    else
+      start_singletons()
+      {:noreply, :started}
+    end
+  end
+
+  defp start_singletons do
    for module <- @singletons do
      if singleton_exists?(module) do
        Logger.debug(
@@ -37,10 +53,10 @@ defmodule ElixirAi.ClusterSingleton do
        end
      end
    end
-
-    {:noreply, :started}
  end

+  def handle_call(:status, _from, state), do: {:reply, state, state}
+
  defp singleton_exists?(module) do
    case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
      [{pid, _metadata} | _] when is_pid(pid) ->
--- a/lib/elixir_ai/conversation_manager.ex
+++ b/lib/elixir_ai/conversation_manager.ex
@@ -21,7 +21,7 @@ defmodule ElixirAi.ConversationManager do
  def init(_) do
    Logger.info("ConversationManager initializing...")
    send(self(), :load_conversations)
-    {:ok, %{conversations: :loading, subscriptions: MapSet.new()}}
+    {:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}}
  end

  def create_conversation(name, ai_provider_id) do
@@ -40,6 +40,10 @@ defmodule ElixirAi.ConversationManager do
    GenServer.call(@name, {:get_messages, name})
  end

+  def list_runners do
+    GenServer.call(@name, :list_runners)
+  end
+
  def handle_call(message, from, %{conversations: :loading} = state) do
    Logger.warning(
      "Received call #{inspect(message)} from #{inspect(from)} while loading conversations. Retrying after delay."
@@ -75,7 +79,7 @@ defmodule ElixirAi.ConversationManager do
        %{conversations: conversations} = state
      ) do
    if Map.has_key?(conversations, name) do
-      reply_with_started(name, state)
+      reply_with_conversation(name, state)
    else
      {:reply, {:error, :not_found}, state}
    end
@@ -84,10 +88,6 @@ defmodule ElixirAi.ConversationManager do
  def handle_call(:list, _from, %{conversations: conversations} = state) do
    keys = Map.keys(conversations)

-    Logger.debug(
-      "list_conversations returning: #{inspect(keys, limit: :infinity, printable_limit: :infinity, binaries: :as_binaries)}"
-    )
-
    {:reply, keys, state}
  end

@@ -95,6 +95,19 @@ defmodule ElixirAi.ConversationManager do
    {:reply, Map.get(conversations, name, []), state}
  end

+  def handle_call(:list_runners, _from, state) do
+    {:reply, Map.get(state, :runners, %{}), state}
+  end
+
+  def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do
+    runners =
+      Enum.reject(runners, fn {_name, info} -> info.pid == pid end)
+      |> Map.new()
+
+    Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})")
+    {:noreply, %{state | runners: runners}}
+  end
+
  def handle_info({:db_error, reason}, state) do
    Logger.error("ConversationManager received db_error: #{inspect(reason)}")
    {:noreply, state}
@@ -138,10 +151,13 @@ defmodule ElixirAi.ConversationManager do
    end
  end

+  # Replies with {:ok, pid} on success, for callers that only need to know the process started (e.g. create).
  defp reply_with_started(name, state, update_state \\ fn s -> s end) do
-    case start_and_subscribe(name, state.subscriptions) do
-      {:ok, pid, new_subscriptions} ->
-        new_state = update_state.(%{state | subscriptions: new_subscriptions})
+    case start_and_subscribe(name, state) do
+      {:ok, pid, new_subscriptions, new_runners} ->
+        new_state =
+          update_state.(%{state | subscriptions: new_subscriptions, runners: new_runners})
+
        {:reply, {:ok, pid}, new_state}

      {:error, _reason} = error ->
@@ -149,7 +165,21 @@ defmodule ElixirAi.ConversationManager do
    end
  end

-  defp start_and_subscribe(name, subscriptions) do
+  # Returns the full conversation state using the pid directly, bypassing the
+  # Horde registry (which may not have synced yet on the calling node).
+  defp reply_with_conversation(name, state) do
+    case start_and_subscribe(name, state) do
+      {:ok, pid, new_subscriptions, new_runners} ->
+        new_state = %{state | subscriptions: new_subscriptions, runners: new_runners}
+        conversation = GenServer.call(pid, :get_conversation)
+        {:reply, {:ok, conversation}, new_state}
+
+      {:error, _reason} = error ->
+        {:reply, error, state}
+    end
+  end
+
+  defp start_and_subscribe(name, state) do
    result =
      case Horde.DynamicSupervisor.start_child(
             ElixirAi.ChatRunnerSupervisor,
@@ -163,14 +193,24 @@ defmodule ElixirAi.ConversationManager do
    case result do
      {:ok, pid} ->
        new_subscriptions =
-          if MapSet.member?(subscriptions, name) do
-            subscriptions
+          if MapSet.member?(state.subscriptions, name) do
+            state.subscriptions
          else
            Phoenix.PubSub.subscribe(ElixirAi.PubSub, conversation_message_topic(name))
-            MapSet.put(subscriptions, name)
+            MapSet.put(state.subscriptions, name)
          end

-        {:ok, pid, new_subscriptions}
+        existing_runners = Map.get(state, :runners, %{})
+
+        new_runners =
+          if Map.has_key?(existing_runners, name) do
+            existing_runners
+          else
+            Process.monitor(pid)
+            Map.put(existing_runners, name, %{pid: pid, node: node(pid)})
+          end
+
+        {:ok, pid, new_subscriptions, new_runners}

      error ->
        error
--- a/lib/elixir_ai/data/ai_provider.ex
+++ b/lib/elixir_ai/data/ai_provider.ex
@@ -138,28 +138,80 @@ defmodule ElixirAi.AiProvider do
  end

  def ensure_default_provider do
-    sql = "SELECT COUNT(*) FROM ai_providers"
-    params = %{}
+    endpoint = Application.get_env(:elixir_ai, :ai_endpoint)
+    token = Application.get_env(:elixir_ai, :ai_token)
+    model = Application.get_env(:elixir_ai, :ai_model)

-    case DbHelpers.run_sql(sql, params, providers_topic()) do
-      {:error, :db_error} ->
-        {:error, :db_error}
+    if endpoint && token && model do
+      case find_by_name("default") do
+        {:error, :not_found} ->
+          attrs = %{
+            name: "default",
+            model_name: model,
+            api_token: token,
+            completions_url: endpoint
+          }

-      rows ->
-        case rows do
-          [%{"count" => 0}] ->
-            attrs = %{
-              name: "default",
-              model_name: Application.fetch_env!(:elixir_ai, :ai_model),
-              api_token: Application.fetch_env!(:elixir_ai, :ai_token),
-              completions_url: Application.fetch_env!(:elixir_ai, :ai_endpoint)
-            }
+          create(attrs)

-            create(attrs)
+        {:ok, _} ->
+          :ok

-          _ ->
-            :ok
+        {:error, reason} ->
+          {:error, reason}
+      end
+    else
+      Logger.info("AI env vars not configured, skipping default provider creation")
+      :ok
+    end
+  end
+
+  def ensure_providers_from_file do
+    case System.get_env("PROVIDERS_CONFIG_PATH") do
+      nil ->
+        :ok
+
+      path ->
+        case YamlElixir.read_from_file(path) do
+          {:ok, %{"providers" => providers}} when is_list(providers) ->
+            Enum.each(providers, &ensure_provider_from_yaml/1)
+
+          {:ok, _} ->
+            Logger.warning("providers.yml: expected a top-level 'providers' list, skipping")
+
+          {:error, reason} ->
+            Logger.warning("Could not read providers config from #{path}: #{inspect(reason)}")
        end
    end
  end
+
+  def ensure_configured_providers do
+    ensure_default_provider()
+    ensure_providers_from_file()
+  end
+
+  defp ensure_provider_from_yaml(%{
+         "name" => name,
+         "model" => model,
+         "responses_endpoint" => endpoint,
+         "api_key" => api_key
+       }) do
+    case find_by_name(name) do
+      {:error, :not_found} ->
+        Logger.info("Creating provider '#{name}' from providers config file")
+        create(%{name: name, model_name: model, api_token: api_key, completions_url: endpoint})
+
+      {:ok, _} ->
+        Logger.debug("Provider '#{name}' already exists, skipping")
+
+      {:error, reason} ->
+        Logger.warning("Could not check existence of provider '#{name}': #{inspect(reason)}")
+    end
+  end
+
+  defp ensure_provider_from_yaml(entry) do
+    Logger.warning(
+      "Skipping invalid provider entry in providers config file (must have name, model, responses_endpoint, api_key): #{inspect(entry)}"
+    )
+  end
 end
--- a/lib/elixir_ai/live_view_pg.ex
+++ b/lib/elixir_ai/live_view_pg.ex
@@ -0,0 +1,16 @@
+defmodule ElixirAi.LiveViewPG do
+  @moduledoc """
+  Named :pg scope for tracking LiveView processes across the cluster.
+  Each LiveView joins {:liveview, ViewModule} on connect; :pg syncs membership
+  automatically and removes dead processes without any additional cleanup.
+  """
+
+  def child_spec(_opts) do
+    %{
+      id: __MODULE__,
+      start: {:pg, :start_link, [__MODULE__]},
+      type: :worker,
+      restart: :permanent
+    }
+  end
+end