updates

2026-03-25 15:13:43 -06:00
parent 0041c25f19
commit 62f16b2bde
7 changed files with 318 additions and 179 deletions
--- a/lib/elixir_ai/ai_tools/ai_tools.ex
+++ b/lib/elixir_ai/ai_tools/ai_tools.ex
@@ -18,7 +18,7 @@ defmodule ElixirAi.AiTools do

  import ElixirAi.ChatUtils, only: [ai_tool: 1]

-  @server_tool_names ["store_thing", "read_thing"]
+  @server_tool_names ["store_thing", "read_thing", "list_conversations"]
  @liveview_tool_names ["set_background_color", "navigate_to"]
  @all_tool_names @server_tool_names ++ @liveview_tool_names

@@ -29,7 +29,7 @@ defmodule ElixirAi.AiTools do
  def all_tool_names, do: @all_tool_names

  def build_server_tools(server, allowed_names) do
-    [store_thing(server), read_thing(server)]
+    [store_thing(server), read_thing(server), list_conversations(server)]
    |> Enum.filter(&(&1.name in allowed_names))
  end

@@ -67,6 +67,23 @@ defmodule ElixirAi.AiTools do
    )
  end

+  # Server-side tool that lets the model discover which conversations exist.
+  # The tool takes no arguments (empty JSON-schema object) and replies with
+  # {:ok, names}, where names is whatever
+  # ElixirAi.ConversationManager.list_conversations/0 returns.
+  def list_conversations(server) do
+    ai_tool(
+      name: "list_conversations",
+      description: """
+      Returns a list of all conversation names in the application.
+      Always call this tool before navigating to a conversation page (e.g. /chat/:name)
+      to ensure the conversation exists and to obtain the exact name to use in the path.
+      """,
+      function: fn _args -> {:ok, ElixirAi.ConversationManager.list_conversations()} end,
+      parameters: %{"type" => "object", "properties" => %{}},
+      server: server
+    )
+  end
+
  # ---------------------------------------------------------------------------
  # LiveView tools
  # ---------------------------------------------------------------------------
--- a/lib/elixir_ai/application.ex
+++ b/lib/elixir_ai/application.ex
@@ -3,39 +3,40 @@ defmodule ElixirAi.Application do
  use Application

  def start(_type, _args) do
-    children = [
-      ElixirAiWeb.Telemetry,
-      # Conditionally start Repo (skip in test environment)
-      repo_child_spec(),
-      default_provider_task(),
-      {Cluster.Supervisor,
-       [Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
-      {Phoenix.PubSub, name: ElixirAi.PubSub},
-      {ElixirAi.LiveViewPG, []},
-      {ElixirAi.RunnerPG, []},
-      {ElixirAi.SingletonPG, []},
-      {ElixirAi.PageToolsPG, []},
-      {ElixirAi.AudioProcessingPG, []},
-      {DynamicSupervisor, name: ElixirAi.AudioWorkerSupervisor, strategy: :one_for_one},
-      ElixirAi.ToolTesting,
-      ElixirAiWeb.Endpoint,
-      {Horde.Registry,
-       [
-         name: ElixirAi.ChatRegistry,
-         keys: :unique,
-         members: :auto,
-         delta_crdt_options: [sync_interval: 100]
-       ]},
-      {Horde.DynamicSupervisor,
-       [
-         name: ElixirAi.ChatRunnerSupervisor,
-         strategy: :one_for_one,
-         members: :auto,
-         delta_crdt_options: [sync_interval: 100],
-         process_redistribution: :active
-       ]},
-      cluster_singleton_child_spec()
-    ]
+    children =
+      [
+        ElixirAiWeb.Telemetry,
+        # Conditionally start Repo (skip in test environment)
+        repo_child_spec(),
+        default_provider_task(),
+        {Cluster.Supervisor,
+         [Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
+        {Phoenix.PubSub, name: ElixirAi.PubSub},
+        {ElixirAi.LiveViewPG, []},
+        {ElixirAi.RunnerPG, []},
+        {ElixirAi.SingletonPG, []},
+        {ElixirAi.PageToolsPG, []},
+        {ElixirAi.AudioProcessingPG, []},
+        {DynamicSupervisor, name: ElixirAi.AudioWorkerSupervisor, strategy: :one_for_one},
+        ElixirAi.ToolTesting,
+        ElixirAiWeb.Endpoint,
+        {Horde.Registry,
+         [
+           name: ElixirAi.ChatRegistry,
+           keys: :unique,
+           members: :auto,
+           delta_crdt_options: [sync_interval: 100]
+         ]},
+        {Horde.DynamicSupervisor,
+         [
+           name: ElixirAi.ChatRunnerSupervisor,
+           strategy: :one_for_one,
+           members: :auto,
+           delta_crdt_options: [sync_interval: 100],
+           process_redistribution: :active
+         ]},
+        cluster_singleton_child_spec(ElixirAi.ConversationManager)
+      ]

    opts = [strategy: :one_for_one, name: ElixirAi.Supervisor]
    Supervisor.start_link(children, opts)
@@ -63,11 +64,11 @@ defmodule ElixirAi.Application do
    end
  end

-  defp cluster_singleton_child_spec do
+  defp cluster_singleton_child_spec(module) do
    if Application.get_env(:elixir_ai, :env) == :test do
-      Supervisor.child_spec({Task, fn -> :ok end}, id: :skip_cluster_singleton)
+      Supervisor.child_spec({Task, fn -> :ok end}, id: {:skip_cluster_singleton, module})
    else
-      ElixirAi.ClusterSingleton
+      {ElixirAi.ClusterSingletonLauncher, module: module}
    end
  end
 end
--- a/lib/elixir_ai/cluster_singleton/cluster_singleton.ex
+++ b/lib/elixir_ai/cluster_singleton/cluster_singleton.ex
@@ -1,69 +0,0 @@
-defmodule ElixirAi.ClusterSingleton do
-  use GenServer
-  require Logger
-
-  @sync_delay_ms 200
-  @retry_delay_ms 500
-
-  @singletons [ElixirAi.ConversationManager]
-
-  def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__)
-
-  def status, do: GenServer.call(__MODULE__, :status)
-
-  def configured_singletons, do: @singletons
-
-  def init(_opts) do
-    Process.send_after(self(), :start_singletons, @sync_delay_ms)
-    {:ok, :pending}
-  end
-
-  def handle_info(:start_singletons, state) do
-    if Node.list() == [] do
-      Logger.debug("ClusterSingleton: no peer nodes yet, retrying in #{@retry_delay_ms}ms")
-      Process.send_after(self(), :start_singletons, @retry_delay_ms)
-      {:noreply, state}
-    else
-      start_singletons()
-      {:noreply, :started}
-    end
-  end
-
-  defp start_singletons do
-    for module <- @singletons do
-      if singleton_exists?(module) do
-        Logger.debug(
-          "ClusterSingleton: singleton already exists, skipping start for #{inspect(module)}"
-        )
-      else
-        case Horde.DynamicSupervisor.start_child(ElixirAi.ChatRunnerSupervisor, module) do
-          {:ok, _pid} ->
-            :ok
-
-          {:error, {:already_started, _pid}} ->
-            :ok
-
-          {:error, :already_present} ->
-            :ok
-
-          {:error, reason} ->
-            Logger.warning(
-              "ClusterSingleton: failed to start #{inspect(module)}: #{inspect(reason)}"
-            )
-        end
-      end
-    end
-  end
-
-  def handle_call(:status, _from, state), do: {:reply, state, state}
-
-  defp singleton_exists?(module) do
-    case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
-      [{pid, _metadata} | _] when is_pid(pid) ->
-        true
-
-      _ ->
-        false
-    end
-  end
-end
--- a/lib/elixir_ai/cluster_singleton/cluster_singleton_launcher.ex
+++ b/lib/elixir_ai/cluster_singleton/cluster_singleton_launcher.ex
@@ -0,0 +1,78 @@
+defmodule ElixirAi.ClusterSingletonLauncher do
+  # One-shot, linked task that asks Horde to start a cluster-wide singleton.
+  #
+  # The task polls until at least one peer node is connected, then starts the given
+  # module under ElixirAi.ChatRunnerSupervisor unless ElixirAi.ChatRegistry already
+  # has an entry for it. A failed start is logged rather than raised, so the task
+  # exits normally once it has made its attempt.
+  require Logger
+
+  @retry_delay_ms 500
+
+  # Child spec whose id includes the singleton module, so launchers for different
+  # modules get distinct ids. `restart: :transient` restarts the task only after an
+  # abnormal exit. Raises KeyError when `opts` has no `:module` entry.
+  def child_spec(opts) do
+    singleton = Keyword.fetch!(opts, :module)
+
+    %{
+      id: {__MODULE__, singleton},
+      start: {__MODULE__, :start_link, [opts]},
+      restart: :transient
+    }
+  end
+
+  # Spawns the linked launcher task. `opts` must contain `:module`; it is read
+  # inside the task, so a missing key crashes the task rather than the caller.
+  def start_link(opts), do: Task.start_link(fn -> run(opts) end)
+
+  # Returns [{module, node_or_nil}] for all configured singletons.
+  # node_or_nil is nil when the singleton is not currently running anywhere.
+  def singleton_locations do
+    for module <- [ElixirAi.ConversationManager], do: {module, hosting_node(module)}
+  end
+
+  # Node of the first pid in the singleton's :pg group, or nil when the group is empty.
+  defp hosting_node(module) do
+    case :pg.get_members(ElixirAi.SingletonPG, {:singleton, module}) do
+      [pid | _] -> node(pid)
+      _ -> nil
+    end
+  end
+
+  # Re-checks Node.list/0 every @retry_delay_ms until a peer is connected, then launches.
+  defp run(opts) do
+    module = Keyword.fetch!(opts, :module)
+
+    case Node.list() do
+      [] ->
+        Logger.debug(
+          "ClusterSingletonLauncher: no peer nodes yet, retrying in #{@retry_delay_ms}ms"
+        )
+
+        Process.sleep(@retry_delay_ms)
+        run(opts)
+
+      _peers ->
+        launch(module)
+    end
+  end
+
+  # Starts the singleton only when the registry has no live entry keyed by the module.
+  defp launch(module) do
+    case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do
+      [{pid, _metadata} | _] when is_pid(pid) ->
+        Logger.debug(
+          "ClusterSingletonLauncher: singleton already exists, skipping start for #{inspect(module)}"
+        )
+
+      _ ->
+        start_under_horde(module)
+    end
+  end
+
+  # "Already started" / "already present" replies count as success; any other
+  # error is logged as a warning and otherwise ignored.
+  defp start_under_horde(module) do
+    case Horde.DynamicSupervisor.start_child(ElixirAi.ChatRunnerSupervisor, module) do
+      {:ok, _pid} ->
+        :ok
+
+      {:error, {:already_started, _pid}} ->
+        :ok
+
+      {:error, :already_present} ->
+        :ok
+
+      {:error, reason} ->
+        Logger.warning(
+          "ClusterSingletonLauncher: failed to start #{inspect(module)}: #{inspect(reason)}"
+        )
+    end
+  end
+end
--- a/lib/elixir_ai/conversation_manager.ex
+++ b/lib/elixir_ai/conversation_manager.ex
@@ -21,6 +21,8 @@ defmodule ElixirAi.ConversationManager do
  def init(_) do
    Logger.info("ConversationManager initializing...")
    :pg.join(ElixirAi.SingletonPG, {:singleton, __MODULE__}, self())
+    # Mitigation 4: receive :nodedown when a cluster peer disappears (sleep/wake, crash)
+    :net_kernel.monitor_nodes(true)
    send(self(), :load_conversations)
    {:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}}
  end
@@ -102,14 +104,57 @@ defmodule ElixirAi.ConversationManager do
  end

  def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do
-    runners =
-      Enum.reject(runners, fn {_name, info} -> info.pid == pid end)
+    # Find the name before removing so we can check for a Horde redistribution replacement
+    {name, _} = Enum.find(runners, {nil, nil}, fn {_n, info} -> info.pid == pid end)
+
+    new_runners =
+      runners
+      |> Enum.reject(fn {_n, info} -> info.pid == pid end)
      |> Map.new()

    Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})")
-    {:noreply, %{state | runners: runners}}
+
+    # Mitigation 2: Horde may have already restarted the runner on another node; re-monitor it
+    new_runners =
+      if name do
+        case :pg.get_members(ElixirAi.RunnerPG, {:runner, name}) do
+          [new_pid | _] when new_pid != pid ->
+            Logger.info(
+              "ConversationManager: re-monitoring redistributed runner for #{name} at #{inspect(new_pid)}"
+            )
+
+            Process.monitor(new_pid)
+            Map.put(new_runners, name, %{pid: new_pid, node: node(new_pid)})
+
+          _ ->
+            new_runners
+        end
+      else
+        new_runners
+      end
+
+    {:noreply, %{state | runners: new_runners}}
  end

+  # Mitigation 4: a peer node disappeared. Drop every cached runner recorded as living
+  # on that node right away, without waiting for the individual :DOWN message of each
+  # pid to arrive. The dropped names are logged only when there is something to drop.
+  def handle_info({:nodedown, down_node}, %{runners: runners} = state) do
+    {stale, remaining} =
+      Enum.split_with(runners, fn {_name, info} -> info.node == down_node end)
+
+    case stale do
+      [] ->
+        :ok
+
+      [_ | _] ->
+        names = for {name, _info} <- stale, do: name
+
+        Logger.info(
+          "ConversationManager: node #{down_node} down, clearing stale runners: #{inspect(names)}"
+        )
+    end
+
+    {:noreply, %{state | runners: Map.new(remaining)}}
+  end
+
+  # Node monitoring (enabled in init/1) delivers :nodeup as well as :nodedown;
+  # a joining node needs no bookkeeping, so the message is simply acknowledged.
+  def handle_info({:nodeup, _node}, state) do
+    {:noreply, state}
+  end
+
  def handle_info({:error, {:db_error, reason}}, state) do
    Logger.error("ConversationManager received db_error: #{inspect(reason)}")
    {:noreply, state}
@@ -142,7 +187,33 @@ defmodule ElixirAi.ConversationManager do

    conversations = Map.new(conversation_list, fn %{name: name} -> {name, []} end)
    Logger.info("Conversation map keys: #{inspect(Map.keys(conversations))}")
-    {:noreply, %{state | conversations: conversations}}
+
+    # Mitigation 3: after a ConversationManager restart, re-establish monitors for any
+    # ChatRunners that are still alive in Horde — they carry on running but we lost
+    # all monitor refs when this process restarted.
+    runners =
+      :pg.which_groups(ElixirAi.RunnerPG)
+      |> Enum.flat_map(fn
+        {:runner, name} ->
+          case :pg.get_members(ElixirAi.RunnerPG, {:runner, name}) do
+            [pid | _] ->
+              Process.monitor(pid)
+              [{name, %{pid: pid, node: node(pid)}}]
+
+            _ ->
+              []
+          end
+
+        _ ->
+          []
+      end)
+      |> Map.new()
+
+    Logger.info(
+      "ConversationManager: re-established monitors for #{map_size(runners)} live runners"
+    )
+
+    {:noreply, %{state | conversations: conversations, runners: runners}}
  end

  def handle_info({:retry_call, message, from}, state) do
@@ -195,9 +266,32 @@ defmodule ElixirAi.ConversationManager do
             ElixirAi.ChatRunnerSupervisor,
             {ElixirAi.ChatRunner, name: name}
           ) do
-        {:ok, pid} -> {:ok, pid}
-        {:error, {:already_started, pid}} -> {:ok, pid}
-        error -> error
+        {:ok, pid} ->
+          {:ok, pid}
+
+        {:error, {:already_started, pid}} ->
+          # Mitigation 6: the returned pid may be on a node that just went down but whose
+          # :DOWN message hasn't been processed yet; verify the node is still reachable.
+          if node_alive?(node(pid)) do
+            {:ok, pid}
+          else
+            # Node is gone; Horde will redistribute — wait briefly for the new registration.
+            case registry_lookup_with_retry(name) do
+              nil -> {:error, :runner_unavailable}
+              new_pid -> {:ok, new_pid}
+            end
+          end
+
+        # Mitigation 1: :already_present means Horde knows the child spec but the process
+        # is mid-redistribution and not yet registered. Retry the registry until it appears.
+        {:error, :already_present} ->
+          case registry_lookup_with_retry(name) do
+            nil -> {:error, :runner_unavailable}
+            pid -> {:ok, pid}
+          end
+
+        error ->
+          error
      end

    case result do
@@ -210,14 +304,21 @@ defmodule ElixirAi.ConversationManager do
            MapSet.put(state.subscriptions, name)
          end

-        existing_runners = Map.get(state, :runners, %{})
-
        new_runners =
-          if Map.has_key?(existing_runners, name) do
-            existing_runners
-          else
-            Process.monitor(pid)
-            Map.put(existing_runners, name, %{pid: pid, node: node(pid)})
+          case Map.get(state.runners, name) do
+            %{pid: ^pid} ->
+              # Same pid — it is already monitored, nothing to update
+              state.runners
+
+            _absent_or_replaced ->
+              # Either the runner is new to us, or its pid changed (redistribution raced
+              # ahead of :DOWN). Runner entries hold only the pid, not the monitor
+              # reference, and Process.demonitor/2 raises ArgumentError when handed a
+              # pid, so the previous monitor cannot be removed here. Leaving it is safe:
+              # its eventual :DOWN carries the old pid, which no longer matches any
+              # entry, so the :DOWN handler leaves the runner map untouched.
+              Process.monitor(pid)
+              Map.put(state.runners, name, %{pid: pid, node: node(pid)})
          end

        {:ok, pid, new_subscriptions, new_runners}
@@ -226,4 +327,22 @@ defmodule ElixirAi.ConversationManager do
        error
    end
  end
+
+  # Mitigation 5: Horde registry syncs via delta-CRDT with up to ~100ms lag after a
+  # process moves nodes, so a single miss is not conclusive.
+  #
+  # Looks `name` up in ElixirAi.ChatRegistry at most `retries` times, sleeping between
+  # attempts with exponential backoff (`delay_ms` first, doubled each time). Returns
+  # the registered pid, or nil once the attempts are used up (or `retries` is not
+  # positive). There is deliberately no sleep after the final attempt: nothing is
+  # looked up afterwards, so sleeping would only block the calling process longer
+  # before it gets the same nil.
+  defp registry_lookup_with_retry(name, retries \\ 3, delay_ms \\ 50)
+  defp registry_lookup_with_retry(_name, retries, _delay_ms) when retries <= 0, do: nil
+
+  defp registry_lookup_with_retry(name, retries, delay_ms) do
+    case Horde.Registry.lookup(ElixirAi.ChatRegistry, name) do
+      [{pid, _value} | _] when is_pid(pid) ->
+        pid
+
+      _ when retries > 1 ->
+        Process.sleep(delay_ms)
+        registry_lookup_with_retry(name, retries - 1, delay_ms * 2)
+
+      _ ->
+        nil
+    end
+  end
+
+  # True when `n` is the local node or appears in Node.list/0.
+  defp node_alive?(n) do
+    n in [Node.self() | Node.list()]
+  end
 end