diff --git a/.gitignore b/.gitignore index 2496090..4c52eec 100644 --- a/.gitignore +++ b/.gitignore @@ -38,4 +38,6 @@ npm-debug.log elixir_ls/ .env -*.tmp \ No newline at end of file +*.tmp + +providers.yml \ No newline at end of file diff --git a/config/config.exs b/config/config.exs index 98ba630..eeb77cd 100644 --- a/config/config.exs +++ b/config/config.exs @@ -1,17 +1,10 @@ -# This file is responsible for configuring your application -# and its dependencies with the aid of the Config module. -# -# This configuration file is loaded before any dependency and -# is restricted to this project. - -# General application configuration +# General config, overridden by other files in this directory. import Config config :elixir_ai, ecto_repos: [ElixirAi.Repo], generators: [timestamp_type: :utc_datetime] -# Configures the endpoint config :elixir_ai, ElixirAiWeb.Endpoint, url: [host: "localhost"], adapter: Bandit.PhoenixAdapter, @@ -22,7 +15,6 @@ config :elixir_ai, ElixirAiWeb.Endpoint, pubsub_server: ElixirAi.PubSub, live_view: [signing_salt: "4UG1IVt+"] -# Configure esbuild (the version is required) config :esbuild, version: "0.17.11", elixir_ai: [ @@ -32,7 +24,6 @@ config :esbuild, env: %{"NODE_PATH" => Path.expand("../deps", __DIR__)} ] -# Configure tailwind (the version is required) config :tailwind, version: "4.0.9", elixir_ai: [ @@ -43,17 +34,12 @@ config :tailwind, cd: Path.expand("../assets", __DIR__) ] -# Configures Elixir's Logger config :logger, :console, format: "$time $metadata[$level] $message\n", metadata: [:request_id] -# Use Jason for JSON parsing in Phoenix config :phoenix, :json_library, Jason -# Lower the BEAM node-down detection window from the default 60s. -# Nodes send ticks every (net_ticktime / 4)s; a node is declared down -# after 4 missed ticks (net_ticktime total). 5s means detection in ≤5s. if System.get_env("RELEASE_MODE") do config :kernel, net_ticktime: 2 end @@ -67,6 +53,4 @@ config :libcluster, ] ] -# Import environment specific config.
This must remain at the bottom -# of this file so it overrides the configuration defined above. import_config "#{config_env()}.exs" diff --git a/config/runtime.exs b/config/runtime.exs index c837ed5..428ba83 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -4,9 +4,9 @@ import Dotenvy source!([".env", System.get_env()]) config :elixir_ai, - ai_endpoint: env!("AI_RESPONSES_ENDPOINT", :string!), - ai_token: env!("AI_TOKEN", :string!), - ai_model: env!("AI_MODEL", :string!) + ai_endpoint: System.get_env("AI_RESPONSES_ENDPOINT"), + ai_token: System.get_env("AI_TOKEN"), + ai_model: System.get_env("AI_MODEL") # config/runtime.exs is executed for all environments, including # during releases. It is executed after compilation and before the @@ -72,7 +72,7 @@ if config_env() == :prod do ] end - host = System.get_env("PHX_HOST") || "example.com" + host = System.get_env("PHX_HOST") || raise "environment variable PHX_HOST is missing." port = String.to_integer(System.get_env("PORT") || "4000") config :elixir_ai, :dns_cluster_query, System.get_env("DNS_CLUSTER_QUERY") @@ -88,36 +88,4 @@ if config_env() == :prod do port: port ], secret_key_base: secret_key_base - - # ## SSL Support - # - # To get SSL working, you will need to add the `https` key - # to your endpoint configuration: - # - # config :elixir_ai, ElixirAiWeb.Endpoint, - # https: [ - # ..., - # port: 443, - # cipher_suite: :strong, - # keyfile: System.get_env("SOME_APP_SSL_KEY_PATH"), - # certfile: System.get_env("SOME_APP_SSL_CERT_PATH") - # ] - # - # The `cipher_suite` is set to `:strong` to support only the - # latest and more secure SSL ciphers. This means old browsers - # and clients may not be supported. You can set it to - # `:compatible` for wider support. - # - # `:keyfile` and `:certfile` expect an absolute path to the key - # and cert in disk or a relative path inside priv, for example - # "priv/ssl/server.key". 
For all supported SSL configuration - # options, see https://hexdocs.pm/plug/Plug.SSL.html#configure/1 - # - # We also recommend setting `force_ssl` in your config/prod.exs, - # ensuring no data is ever sent via http, always redirecting to https: - # - # config :elixir_ai, ElixirAiWeb.Endpoint, - # force_ssl: [hsts: true] - # - # Check `Plug.SSL` for all available options in `force_ssl`. end diff --git a/docker-compose.yml b/docker-compose.yml index 002ec69..aa69d4b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -31,6 +31,7 @@ services: RELEASE_NODE: elixir_ai@node1 RELEASE_COOKIE: secret_cluster_cookie SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh + PROVIDERS_CONFIG_PATH: /app/providers.yml user: root command: | sh -c ' @@ -41,6 +42,7 @@ services: volumes: - .:/app - /app/_build + - ./providers.yml:/app/providers.yml:ro ports: - "4001:4000" depends_on: @@ -68,6 +70,7 @@ services: RELEASE_NODE: elixir_ai@node2 RELEASE_COOKIE: secret_cluster_cookie SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh + PROVIDERS_CONFIG_PATH: /app/providers.yml user: root command: | sh -c ' @@ -78,6 +81,7 @@ services: volumes: - .:/app - /app/_build + - ./providers.yml:/app/providers.yml:ro ports: - "4002:4000" depends_on: diff --git a/example.providers.yml b/example.providers.yml new file mode 100644 index 0000000..e717661 --- /dev/null +++ b/example.providers.yml @@ -0,0 +1,9 @@ +providers: +- name: provider name + model: gpt-oss-20b + responses_endpoint: http://example.com/api/responses + api_key: your_api_key_here +- name: provider name 2 + model: gpt-oss-20b + responses_endpoint: http://example.com/api/responses + api_key: your_api_key_here \ No newline at end of file diff --git a/kubernetes/statefulset.yml b/kubernetes/statefulset.yml index 3b4c26e..ab2442d 100644 --- a/kubernetes/statefulset.yml +++ b/kubernetes/statefulset.yml @@ -4,7 +4,7 @@ metadata: name: ai-ha-elixir namespace: ai-ha-elixir 
spec: - serviceName: ai-ha-elixir-headless + serviceName: ai-ha-elixir-headless # pods resolve as ai-ha-elixir-<ordinal>.ai-ha-elixir-headless.ai-ha-elixir.svc.cluster.local replicas: 2 podManagementPolicy: Parallel updateStrategy: diff --git a/lib/elixir_ai/ai_utils/chat_utils.ex b/lib/elixir_ai/ai_utils/chat_utils.ex index 31ed406..30652e2 100644 --- a/lib/elixir_ai/ai_utils/chat_utils.ex +++ b/lib/elixir_ai/ai_utils/chat_utils.ex @@ -27,9 +27,16 @@ defmodule ElixirAi.ChatUtils do } run_function = fn current_message_id, tool_call_id, args -> - Task.start(fn -> - result = function.(args) - send(server, {:tool_response, current_message_id, tool_call_id, result}) + Task.start_link(fn -> + try do + result = function.(args) + send(server, {:tool_response, current_message_id, tool_call_id, result}) + rescue + e -> + reason = Exception.format(:error, e, __STACKTRACE__) + Logger.error("Tool task crashed: #{reason}") + send(server, {:tool_response, current_message_id, tool_call_id, {:error, reason}}) + end end) end @@ -41,7 +48,7 @@ defmodule ElixirAi.ChatUtils do end def request_ai_response(server, messages, tools, provider) do - Task.start(fn -> + Task.start_link(fn -> api_url = provider.completions_url api_key = provider.api_token model = provider.model_name @@ -82,7 +89,8 @@ defmodule ElixirAi.ChatUtils do :ok {:error, reason} -> - IO.warn("AI request failed: #{inspect(reason)} for #{api_url}") + Logger.warning("AI request failed: #{inspect(reason)} for #{api_url}") + send(server, {:ai_request_error, reason}) end end) end diff --git a/lib/elixir_ai/application.ex b/lib/elixir_ai/application.ex index 4981f57..b6ceae3 100644 --- a/lib/elixir_ai/application.ex +++ b/lib/elixir_ai/application.ex @@ -12,6 +12,7 @@ defmodule ElixirAi.Application do {Cluster.Supervisor, [Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]}, {Phoenix.PubSub, name: ElixirAi.PubSub}, + {ElixirAi.LiveViewPG, []}, ElixirAi.ToolTesting, ElixirAiWeb.Endpoint, {Horde.Registry, @@ -55,7 +56,7 @@ defmodule
ElixirAi.Application do if Application.get_env(:elixir_ai, :env) == :test do Supervisor.child_spec({Task, fn -> :ok end}, id: :skip_default_provider) else - {Task, fn -> ElixirAi.AiProvider.ensure_default_provider() end} + {Task, fn -> ElixirAi.AiProvider.ensure_configured_providers() end} end end diff --git a/lib/elixir_ai/chat_runner.ex b/lib/elixir_ai/chat_runner.ex index 82ed7b5..d133cbb 100644 --- a/lib/elixir_ai/chat_runner.ex +++ b/lib/elixir_ai/chat_runner.ex @@ -49,6 +49,7 @@ defmodule ElixirAi.ChatRunner do "Last message role was #{last_message.role}, requesting AI response for conversation #{name}" ) + broadcast_ui(name, :recovery_restart) ElixirAi.ChatUtils.request_ai_response(self(), messages, tools(self(), name), provider) end diff --git a/lib/elixir_ai/cluster_singleton.ex b/lib/elixir_ai/cluster_singleton.ex index e8c2b45..874413b 100644 --- a/lib/elixir_ai/cluster_singleton.ex +++ b/lib/elixir_ai/cluster_singleton.ex @@ -3,17 +3,33 @@ defmodule ElixirAi.ClusterSingleton do require Logger @sync_delay_ms 200 + @retry_delay_ms 500 @singletons [ElixirAi.ConversationManager] def start_link(opts), do: GenServer.start_link(__MODULE__, opts, name: __MODULE__) + def status, do: GenServer.call(__MODULE__, :status) + + def configured_singletons, do: @singletons + def init(_opts) do Process.send_after(self(), :start_singletons, @sync_delay_ms) {:ok, :pending} end - def handle_info(:start_singletons, _state) do + def handle_info(:start_singletons, state) do + if Node.list() == [] do + Logger.debug("ClusterSingleton: no peer nodes yet, retrying in #{@retry_delay_ms}ms") + Process.send_after(self(), :start_singletons, @retry_delay_ms) + {:noreply, state} + else + start_singletons() + {:noreply, :started} + end + end + + defp start_singletons do for module <- @singletons do if singleton_exists?(module) do Logger.debug( @@ -37,10 +53,10 @@ defmodule ElixirAi.ClusterSingleton do end end end - - {:noreply, :started} end + def handle_call(:status, _from, state), do: 
{:reply, state, state} + defp singleton_exists?(module) do case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do [{pid, _metadata} | _] when is_pid(pid) -> diff --git a/lib/elixir_ai/conversation_manager.ex b/lib/elixir_ai/conversation_manager.ex index 8da6f68..51de04c 100644 --- a/lib/elixir_ai/conversation_manager.ex +++ b/lib/elixir_ai/conversation_manager.ex @@ -21,7 +21,7 @@ defmodule ElixirAi.ConversationManager do def init(_) do Logger.info("ConversationManager initializing...") send(self(), :load_conversations) - {:ok, %{conversations: :loading, subscriptions: MapSet.new()}} + {:ok, %{conversations: :loading, subscriptions: MapSet.new(), runners: %{}}} end def create_conversation(name, ai_provider_id) do @@ -40,6 +40,10 @@ defmodule ElixirAi.ConversationManager do GenServer.call(@name, {:get_messages, name}) end + def list_runners do + GenServer.call(@name, :list_runners) + end + def handle_call(message, from, %{conversations: :loading} = state) do Logger.warning( "Received call #{inspect(message)} from #{inspect(from)} while loading conversations. Retrying after delay." 
@@ -75,7 +79,7 @@ defmodule ElixirAi.ConversationManager do %{conversations: conversations} = state ) do if Map.has_key?(conversations, name) do - reply_with_started(name, state) + reply_with_conversation(name, state) else {:reply, {:error, :not_found}, state} end @@ -84,10 +88,6 @@ defmodule ElixirAi.ConversationManager do def handle_call(:list, _from, %{conversations: conversations} = state) do keys = Map.keys(conversations) - Logger.debug( - "list_conversations returning: #{inspect(keys, limit: :infinity, printable_limit: :infinity, binaries: :as_binaries)}" - ) - {:reply, keys, state} end @@ -95,6 +95,19 @@ defmodule ElixirAi.ConversationManager do {:reply, Map.get(conversations, name, []), state} end + def handle_call(:list_runners, _from, state) do + {:reply, Map.get(state, :runners, %{}), state} + end + + def handle_info({:DOWN, _ref, :process, pid, reason}, %{runners: runners} = state) do + runners = + Enum.reject(runners, fn {_name, info} -> info.pid == pid end) + |> Map.new() + + Logger.info("ConversationManager: runner #{inspect(pid)} went down (#{inspect(reason)})") + {:noreply, %{state | runners: runners}} + end + def handle_info({:db_error, reason}, state) do Logger.error("ConversationManager received db_error: #{inspect(reason)}") {:noreply, state} @@ -138,10 +151,13 @@ defmodule ElixirAi.ConversationManager do end end + # Returns {pid} to callers that only need to know the process started (e.g. create). 
defp reply_with_started(name, state, update_state \\ fn s -> s end) do - case start_and_subscribe(name, state.subscriptions) do - {:ok, pid, new_subscriptions} -> - new_state = update_state.(%{state | subscriptions: new_subscriptions}) + case start_and_subscribe(name, state) do + {:ok, pid, new_subscriptions, new_runners} -> + new_state = + update_state.(%{state | subscriptions: new_subscriptions, runners: new_runners}) + {:reply, {:ok, pid}, new_state} {:error, _reason} = error -> @@ -149,7 +165,21 @@ defmodule ElixirAi.ConversationManager do end end - defp start_and_subscribe(name, subscriptions) do + # Returns the full conversation state using the pid directly, bypassing the + # Horde registry (which may not have synced yet on the calling node). + defp reply_with_conversation(name, state) do + case start_and_subscribe(name, state) do + {:ok, pid, new_subscriptions, new_runners} -> + new_state = %{state | subscriptions: new_subscriptions, runners: new_runners} + conversation = GenServer.call(pid, :get_conversation) + {:reply, {:ok, conversation}, new_state} + + {:error, _reason} = error -> + {:reply, error, state} + end + end + + defp start_and_subscribe(name, state) do result = case Horde.DynamicSupervisor.start_child( ElixirAi.ChatRunnerSupervisor, @@ -163,14 +193,24 @@ defmodule ElixirAi.ConversationManager do case result do {:ok, pid} -> new_subscriptions = - if MapSet.member?(subscriptions, name) do - subscriptions + if MapSet.member?(state.subscriptions, name) do + state.subscriptions else Phoenix.PubSub.subscribe(ElixirAi.PubSub, conversation_message_topic(name)) - MapSet.put(subscriptions, name) + MapSet.put(state.subscriptions, name) end - {:ok, pid, new_subscriptions} + existing_runners = Map.get(state, :runners, %{}) + + new_runners = + if Map.has_key?(existing_runners, name) do + existing_runners + else + Process.monitor(pid) + Map.put(existing_runners, name, %{pid: pid, node: node(pid)}) + end + + {:ok, pid, new_subscriptions, new_runners} error -> 
error diff --git a/lib/elixir_ai/data/ai_provider.ex b/lib/elixir_ai/data/ai_provider.ex index 271db62..3b3fa5c 100644 --- a/lib/elixir_ai/data/ai_provider.ex +++ b/lib/elixir_ai/data/ai_provider.ex @@ -138,28 +138,80 @@ defmodule ElixirAi.AiProvider do end def ensure_default_provider do - sql = "SELECT COUNT(*) FROM ai_providers" - params = %{} + endpoint = Application.get_env(:elixir_ai, :ai_endpoint) + token = Application.get_env(:elixir_ai, :ai_token) + model = Application.get_env(:elixir_ai, :ai_model) - case DbHelpers.run_sql(sql, params, providers_topic()) do - {:error, :db_error} -> - {:error, :db_error} + if endpoint && token && model do + case find_by_name("default") do + {:error, :not_found} -> + attrs = %{ + name: "default", + model_name: model, + api_token: token, + completions_url: endpoint + } - rows -> - case rows do - [%{"count" => 0}] -> - attrs = %{ - name: "default", - model_name: Application.fetch_env!(:elixir_ai, :ai_model), - api_token: Application.fetch_env!(:elixir_ai, :ai_token), - completions_url: Application.fetch_env!(:elixir_ai, :ai_endpoint) - } + create(attrs) - create(attrs) + {:ok, _} -> + :ok - _ -> - :ok + {:error, reason} -> + {:error, reason} + end + else + Logger.info("AI env vars not configured, skipping default provider creation") + :ok + end + end + + def ensure_providers_from_file do + case System.get_env("PROVIDERS_CONFIG_PATH") do + nil -> + :ok + + path -> + case YamlElixir.read_from_file(path) do + {:ok, %{"providers" => providers}} when is_list(providers) -> + Enum.each(providers, &ensure_provider_from_yaml/1) + + {:ok, _} -> + Logger.warning("providers.yml: expected a top-level 'providers' list, skipping") + + {:error, reason} -> + Logger.warning("Could not read providers config from #{path}: #{inspect(reason)}") end end end + + def ensure_configured_providers do + ensure_default_provider() + ensure_providers_from_file() + end + + defp ensure_provider_from_yaml(%{ + "name" => name, + "model" => model, + 
"responses_endpoint" => endpoint, + "api_key" => api_key + }) do + case find_by_name(name) do + {:error, :not_found} -> + Logger.info("Creating provider '#{name}' from providers config file") + create(%{name: name, model_name: model, api_token: api_key, completions_url: endpoint}) + + {:ok, _} -> + Logger.debug("Provider '#{name}' already exists, skipping") + + {:error, reason} -> + Logger.warning("Could not check existence of provider '#{name}': #{inspect(reason)}") + end + end + + defp ensure_provider_from_yaml(entry) do + Logger.warning( + "Skipping invalid provider entry in providers config file (must have name, model, responses_endpoint, api_key): #{inspect(entry)}" + ) + end end diff --git a/lib/elixir_ai/live_view_pg.ex b/lib/elixir_ai/live_view_pg.ex new file mode 100644 index 0000000..b9f19ea --- /dev/null +++ b/lib/elixir_ai/live_view_pg.ex @@ -0,0 +1,16 @@ +defmodule ElixirAi.LiveViewPG do + @moduledoc """ + Named :pg scope for tracking LiveView processes across the cluster. + Each LiveView joins {:liveview, ViewModule} on connect; :pg syncs membership + automatically and removes dead processes without any additional cleanup. 
+ """ + + def child_spec(_opts) do + %{ + id: __MODULE__, + start: {:pg, :start_link, [__MODULE__]}, + type: :worker, + restart: :permanent + } + end +end diff --git a/lib/elixir_ai_web/admin/admin_live.ex b/lib/elixir_ai_web/admin/admin_live.ex new file mode 100644 index 0000000..285d614 --- /dev/null +++ b/lib/elixir_ai_web/admin/admin_live.ex @@ -0,0 +1,245 @@ +defmodule ElixirAiWeb.AdminLive do + use ElixirAiWeb, :live_view + require Logger + + @refresh_ms 1_000 + + def mount(_params, _session, socket) do + if connected?(socket) do + :net_kernel.monitor_nodes(true) + :pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self()) + schedule_refresh() + end + + {:ok, assign(socket, cluster_info: gather_info())} + end + + def handle_info({:nodeup, _node}, socket) do + {:noreply, assign(socket, cluster_info: gather_info())} + end + + def handle_info({:nodedown, _node}, socket) do + {:noreply, assign(socket, cluster_info: gather_info())} + end + + def handle_info(:refresh, socket) do + schedule_refresh() + {:noreply, assign(socket, cluster_info: gather_info())} + end + + defp schedule_refresh, do: Process.send_after(self(), :refresh, @refresh_ms) + + defp gather_info do + import ElixirAi.PubsubTopics + + all_nodes = [Node.self() | Node.list()] + configured = ElixirAi.ClusterSingleton.configured_singletons() + + node_statuses = + Enum.map(all_nodes, fn node -> + status = + if node == Node.self() do + try do + ElixirAi.ClusterSingleton.status() + catch + _, _ -> :unreachable + end + else + case :rpc.call(node, ElixirAi.ClusterSingleton, :status, [], 3_000) do + {:badrpc, _} -> :unreachable + result -> result + end + end + + {node, status} + end) + + singleton_locations = + Enum.map(configured, fn module -> + location = + case Horde.Registry.lookup(ElixirAi.ChatRegistry, module) do + [{pid, _}] -> node(pid) + _ -> nil + end + + {module, location} + end) + + # All ChatRunner entries in the distributed registry, keyed by conversation name. 
+ # Each entry is a {name, node, pid, supervisor_node} tuple. + chat_runners = + Horde.DynamicSupervisor.which_children(ElixirAi.ChatRunnerSupervisor) + |> Enum.flat_map(fn + {_, pid, _, _} when is_pid(pid) -> + case Horde.Registry.select(ElixirAi.ChatRegistry, [ + {{:"$1", pid, :"$2"}, [], [{{:"$1", pid, :"$2"}}]} + ]) do + [{name, ^pid, _}] when is_binary(name) -> [{name, node(pid), pid}] + _ -> [] + end + + _ -> + [] + end) + |> Enum.sort_by(&elem(&1, 0)) + + # :pg is cluster-wide — one local call returns members from all nodes. + # Processes are automatically removed from their group when they die. + liveviews = + :pg.which_groups(ElixirAi.LiveViewPG) + |> Enum.flat_map(fn + {:liveview, view} -> + :pg.get_members(ElixirAi.LiveViewPG, {:liveview, view}) + |> Enum.map(fn pid -> {view, node(pid)} end) + + _ -> + [] + end) + + %{ + nodes: node_statuses, + configured_singletons: configured, + singleton_locations: singleton_locations, + chat_runners: chat_runners, + liveviews: liveviews + } + end + + def render(assigns) do + ~H""" +
+

Cluster Admin

+ +
+ <%= for {node, status} <- @cluster_info.nodes do %> + <% node_singletons = + Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> loc == node end) %> + <% node_runners = + Enum.filter(@cluster_info.chat_runners, fn {_, rnode, _} -> rnode == node end) %> + <% node_liveviews = + @cluster_info.liveviews + |> Enum.filter(fn {_, n} -> n == node end) + |> Enum.group_by(fn {view, _} -> view end) %> + +
+
+
+ {node} + <%= if node == Node.self() do %> + self + <% end %> +
+ <.status_badge status={status} /> +
+ +
+ <%= if node_singletons != [] do %> +
+

+ Singletons +

+
+ <%= for {module, _} <- node_singletons do %> +
+ {inspect(module)} +
+ <% end %> +
+
+ <% end %> + + <%= if node_runners != [] do %> +
+

+ Chat Runners + + {length(node_runners)} + +

+
+ <%= for {name, _, _} <- node_runners do %> +
+ {name} +
+ <% end %> +
+
+ <% end %> + + <%= if node_liveviews != %{} do %> +
+

+ LiveViews +

+
+ <%= for {view, instances} <- node_liveviews do %> +
+ {short_module(view)} + ×{length(instances)} +
+ <% end %> +
+
+ <% end %> + + <%= if node_singletons == [] and node_runners == [] and node_liveviews == %{} do %> +

No active processes

+ <% end %> +
+
+ <% end %> +
+ + <% unlocated = + Enum.filter(@cluster_info.singleton_locations, fn {_, loc} -> is_nil(loc) end) %> + <%= if unlocated != [] do %> +
+

+ Singletons Not Running +

+
+ <%= for {module, _} <- unlocated do %> + + {inspect(module)} + + <% end %> +
+
+ <% end %> + +

Refreshes every 1s or on node events.

+
+ """ + end + + defp short_module(module) when is_atom(module) do + module + |> Atom.to_string() + |> String.replace_prefix("Elixir.", "") + |> String.split(".") + |> List.last() + end + + defp status_badge(assigns) do + ~H""" + <%= case @status do %> + <% :started -> %> + + started + + <% :pending -> %> + + pending + + <% :unreachable -> %> + + unreachable + + <% other -> %> + + {inspect(other)} + + <% end %> + """ + end +end diff --git a/lib/elixir_ai_web/chat/chat_live.ex b/lib/elixir_ai_web/chat/chat_live.ex index 5bb3c84..1f739ea 100644 --- a/lib/elixir_ai_web/chat/chat_live.ex +++ b/lib/elixir_ai_web/chat/chat_live.ex @@ -10,13 +10,12 @@ defmodule ElixirAiWeb.ChatLive do def mount(%{"name" => name}, _session, socket) do case ConversationManager.open_conversation(name) do - {:ok, _pid} -> + {:ok, conversation} -> if connected?(socket) do Phoenix.PubSub.subscribe(ElixirAi.PubSub, chat_topic(name)) + :pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self()) end - conversation = ChatRunner.get_conversation(name) - {:ok, socket |> assign(conversation_name: name) @@ -25,7 +24,8 @@ defmodule ElixirAiWeb.ChatLive do |> assign(streaming_response: conversation.streaming_response) |> assign(background_color: "bg-cyan-950/30") |> assign(provider: conversation.provider) - |> assign(db_error: nil)} + |> assign(db_error: nil) + |> assign(ai_error: nil)} {:error, :not_found} -> {:ok, push_navigate(socket, to: "/")} @@ -41,7 +41,8 @@ defmodule ElixirAiWeb.ChatLive do |> assign(streaming_response: nil) |> assign(background_color: "bg-cyan-950/30") |> assign(provider: nil) - |> assign(db_error: Exception.format(:error, reason))} + |> assign(db_error: Exception.format(:error, reason)) + |> assign(ai_error: nil)} end end @@ -60,6 +61,11 @@ defmodule ElixirAiWeb.ChatLive do Database error: {@db_error} <% end %> + <%= if @ai_error do %> + + <% end %>
"#{inspect(mod)}: #{inspect(r)}" + _ -> inspect(reason) + end + + {:noreply, assign(socket, ai_error: error_message, streaming_response: nil)} + end + def handle_info({:set_background_color, color}, socket) do Logger.info("setting background color to #{color}") {:noreply, assign(socket, background_color: color)} diff --git a/lib/elixir_ai_web/home/home_live.ex b/lib/elixir_ai_web/home/home_live.ex index ae27986..02fb21b 100644 --- a/lib/elixir_ai_web/home/home_live.ex +++ b/lib/elixir_ai_web/home/home_live.ex @@ -8,6 +8,7 @@ defmodule ElixirAiWeb.HomeLive do def mount(_params, _session, socket) do if connected?(socket) do Phoenix.PubSub.subscribe(ElixirAi.PubSub, providers_topic()) + :pg.join(ElixirAi.LiveViewPG, {:liveview, __MODULE__}, self()) send(self(), :load_data) end diff --git a/lib/elixir_ai_web/router.ex b/lib/elixir_ai_web/router.ex index d091cb3..0cfb23e 100644 --- a/lib/elixir_ai_web/router.ex +++ b/lib/elixir_ai_web/router.ex @@ -19,6 +19,7 @@ defmodule ElixirAiWeb.Router do live "/", HomeLive live "/chat/:name", ChatLive + live "/admin", AdminLive end # Other scopes may use custom stacks. 
diff --git a/mix.exs b/mix.exs index ed92bcd..248e3f8 100644 --- a/mix.exs +++ b/mix.exs @@ -5,7 +5,7 @@ defmodule ElixirAi.MixProject do [ app: :elixir_ai, version: "0.1.0", - elixir: "~> 1.18", + elixir: "~> 1.19", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, aliases: aliases(), @@ -61,7 +61,8 @@ defmodule ElixirAi.MixProject do {:horde, "~> 0.9"}, {:credo, "~> 1.7", only: [:dev, :test], runtime: false}, {:mimic, "~> 2.3.0"}, - {:zoi, "~> 0.17"} + {:zoi, "~> 0.17"}, + {:yaml_elixir, "~> 2.12"} ] end diff --git a/mix.lock b/mix.lock index 4d25f2f..a61bd01 100644 --- a/mix.lock +++ b/mix.lock @@ -68,5 +68,7 @@ "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.1", "a48703a25c170eedadca83b11e88985af08d35f37c6f664d6dcfb106a97782fc", [:rebar3], [], "hexpm", "b3a917854ce3ae233619744ad1e0102e05673136776fb2fa76234f3e03b23642"}, "websock": {:hex, :websock, "0.5.3", "2f69a6ebe810328555b6fe5c831a851f485e303a7c8ce6c5f675abeb20ebdadc", [:mix], [], "hexpm", "6105453d7fac22c712ad66fab1d45abdf049868f253cf719b625151460b8b453"}, "websock_adapter": {:hex, :websock_adapter, "0.5.9", "43dc3ba6d89ef5dec5b1d0a39698436a1e856d000d84bf31a3149862b01a287f", [:mix], [{:bandit, ">= 0.6.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:plug, "~> 1.14", [hex: :plug, repo: "hexpm", optional: false]}, {:plug_cowboy, "~> 2.6", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:websock, "~> 0.5", [hex: :websock, repo: "hexpm", optional: false]}], "hexpm", "5534d5c9adad3c18a0f58a9371220d75a803bf0b9a3d87e6fe072faaeed76a08"}, + "yamerl": {:hex, :yamerl, "0.10.0", "4ff81fee2f1f6a46f1700c0d880b24d193ddb74bd14ef42cb0bcf46e81ef2f8e", [:rebar3], [], "hexpm", "346adb2963f1051dc837a2364e4acf6eb7d80097c0f53cbdc3046ec8ec4b4e6e"}, + "yaml_elixir": {:hex, :yaml_elixir, "2.12.1", "d74f2d82294651b58dac849c45a82aaea639766797359baff834b64439f6b3f4", [:mix], [{:yamerl, "~> 0.10", [hex: :yamerl, repo: "hexpm", optional: false]}], "hexpm", 
"d9ac16563c737d55f9bfeed7627489156b91268a3a21cd55c54eb2e335207fed"}, "zoi": {:hex, :zoi, "0.17.1", "406aa87bb4181f41dee64336b75434367b7d3e88db813b0e6db0ae2d0f81f743", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.14.2 or ~> 3.0 or ~> 4.1", [hex: :phoenix_html, repo: "hexpm", optional: true]}], "hexpm", "3a11bf3bc9189f988ac74e81b5d7ca0c689b2a20eed220746a7043aa528e2aab"}, } diff --git a/test/elixir_ai_web/live/chat_live_test.exs b/test/elixir_ai_web/live/chat_live_test.exs index 7c95c06..556edc1 100644 --- a/test/elixir_ai_web/live/chat_live_test.exs +++ b/test/elixir_ai_web/live/chat_live_test.exs @@ -3,10 +3,8 @@ defmodule ElixirAiWeb.ChatLiveTest do import ElixirAi.PubsubTopics, only: [chat_topic: 1] setup do - stub(ElixirAi.ConversationManager, :open_conversation, fn _name -> {:ok, self()} end) - - stub(ElixirAi.ChatRunner, :get_conversation, fn _name -> - %{messages: [], streaming_response: nil} + stub(ElixirAi.ConversationManager, :open_conversation, fn _name -> + {:ok, %{messages: [], streaming_response: nil, provider: nil}} end) :ok diff --git a/tool-calling-outline.md b/tool-calling-outline.md new file mode 100644 index 0000000..1f80c5a --- /dev/null +++ b/tool-calling-outline.md @@ -0,0 +1,435 @@ +Below is a **structured design document** intended for another LLM (or engineer) to implement a **persistent BEAM-backed CLI execution system inside a long-lived Docker container using `docker exec`**. + +--- + +# Design Document: Persistent BEAM Tool Runner in Docker + +## 1. Objective + +Build a system where: + +* A **single long-lived Docker container** hosts: + + * a **persistent Elixir (BEAM) daemon** + * a standard **bash execution environment** +* All commands are executed via: + + ```bash + docker exec bash -lc "" + ``` +* Common tools (`cat`, `grep`, etc.) 
are **intercepted via PATH shims** +* Shims delegate execution to the **persistent BEAM daemon** +* The daemon: + + * executes real system commands + * truncates output deterministically + * returns `{stdout, stderr, exit_code}` + +--- + +## 2. Non-Goals + +* No AI/model integration +* No streaming output (batch only) +* No advanced sandboxing (seccomp/cgroups optional later) +* No distributed execution + +--- + +## 3. System Overview + +```text +Host + └─ docker exec + └─ Container (long-lived) + ├─ bash + │ └─ cat / grep / etc → shim (shell script) + │ └─ Unix socket request + │ └─ BEAM daemon + │ └─ System.cmd("cat", ...) + │ └─ truncate output + │ └─ return response +``` + +--- + +## 4. Key Design Decisions + +### 4.1 Persistent Container + +* Container is started once and reused +* Avoid `docker run` per command + +### 4.2 Persistent BEAM Process + +* Avoid BEAM startup per command +* Centralize execution + truncation + +### 4.3 Bash as Execution Engine + +* Do not reimplement shell parsing +* Support pipes, redirects, chaining + +### 4.4 PATH Interception + +* Replace selected binaries with shims +* Keep system binaries available underneath + +--- + +## 5. Container Specification + +### 5.1 Base Image + +* `debian:bookworm-slim` + +### 5.2 Required Packages + +```bash +elixir +erlang +bash +socat +coreutils +grep +``` + +--- + +### 5.3 Filesystem Layout + +```text +/app + daemon.exs + shims/ + cat + grep +``` + +--- + +### 5.4 PATH Configuration + +```bash +PATH=/app/shims:/usr/bin:/bin +``` + +--- + +### 5.5 Container Startup Command + +```bash +elixir daemon.exs & exec bash +``` + +Requirements: + +* daemon must start before shell usage +* shell must remain interactive/alive + +--- + +## 6. 
BEAM Daemon Specification + +### 6.1 Transport + +* Unix domain socket: + + ```text + /tmp/tool_runner.sock + ``` + +* Protocol: + + * request: single line + * response: Erlang binary (`:erlang.term_to_binary/1`) + +--- + +### 6.2 Request Format (v1) + +```text +<cmd>\t<arg1>\t<arg2>\n +``` + +Example: + +```text +cat\tfile.txt\n +``` + +--- + +### 6.3 Response Format + +```elixir +{stdout :: binary, stderr :: binary, exit_code :: integer} +``` + +Encoded via: + +```elixir +:erlang.term_to_binary/1 +``` + +--- + +### 6.4 Execution Logic + +For each request: + +1. Parse command + args +2. Call: + +```elixir +System.cmd(cmd, args, stderr_to_stdout: false) +``` + +3. Apply truncation (see below) +4. Return encoded response + +--- + +### 6.5 Truncation Rules + +Configurable constants: + +```elixir +@max_bytes 4000 +@max_lines 200 +``` + +Apply in order: + +1. truncate by bytes +2. truncate by lines + +Append: + +```text +...[truncated] +``` + +--- + +### 6.6 Concurrency Model + +* Accept loop via `:gen_tcp.accept` +* Each client handled in separate lightweight process (`spawn`) +* No shared mutable state required + +--- + +### 6.7 Error Handling + +* Unknown command → return exit_code 127 +* Exceptions → return exit_code 1 + error message +* Socket failure → ignore safely + +--- + +## 7. Shim Specification + +### 7.1 Purpose + +* Replace system binaries (`cat`, `grep`) +* Forward calls to daemon +* Reproduce exact CLI behavior: + + * stdout + * stderr + * exit code + +--- + +### 7.2 Implementation Language + +* Bash (fast startup, no BEAM overhead) + +--- + +### 7.3 Behavior + +For command: + +```bash +cat file.txt +``` + +Shim must: + +1. Build request string +2. Send to socket via `socat` +3. Receive binary response +4. Decode response +5. Write: + + * stdout → STDOUT + * stderr → STDERR +6. Exit with correct code + +--- + +### 7.4 Request Construction (in-memory) + +No temp files.
+ +```bash +{ + printf "cat" + for arg in "$@"; do + printf "\t%s" "$arg" + done + printf "\n" +} | socat - UNIX-CONNECT:/tmp/tool_runner.sock +``` + +--- + +### 7.5 Response Decoding + +Temporary approach: + +```bash +elixir -e ' + {out, err, code} = :erlang.binary_to_term(IO.read(:stdio, :all)) + IO.write(out) + if err != "", do: IO.write(:stderr, err) + System.halt(code) +' +``` + +--- + +### 7.6 Known Limitation + +* Arguments containing tabs/newlines will break protocol +* Acceptable for v1 +* Future: switch to JSON protocol + +--- + +## 8. Execution Flow Example + +```bash +docker exec container bash -lc "cat file.txt | grep foo" +``` + +Inside container: + +1. `cat` → shim +2. shim → daemon → real `cat` +3. truncated output returned +4. piped to `grep` +5. `grep` → shim → daemon → real `grep` + +--- + +## 9. Performance Expectations + +| Component | Latency | +| ------------- | --------- | +| docker exec | 10–40 ms | +| shim + socket | 1–5 ms | +| System.cmd | 1–5 ms | +| total | ~15–50 ms | + +--- + +## 10. Security Considerations + +Minimal (v1): + +* No command filtering +* Full shell access inside container + +Future: + +* allowlist commands +* resource limits +* seccomp profile + +--- + +## 11. Extensibility + +### 11.1 Add new tools + +* create shim in `/app/shims` +* no daemon change required + +--- + +### 11.2 Central policies + +Implement in daemon: + +* timeouts +* logging +* output shaping +* auditing + +--- + +### 11.3 Protocol upgrade path + +Replace tab protocol with: + +```json +{ "cmd": "...", "args": [...] } +``` + +--- + +## 12. Failure Modes + +| Failure | Behavior | +| ------------------ | ----------------------------- | +| daemon not running | shim fails (connection error) | +| socket missing | immediate error | +| malformed response | decode failure | +| command not found | exit 127 | + +--- + +## 13. 
Implementation Checklist + +* [ ] Dockerfile builds successfully +* [ ] daemon starts on container launch +* [ ] socket created at `/tmp/tool_runner.sock` +* [ ] shim intercepts commands via PATH +* [ ] shim communicates with daemon +* [ ] stdout/stderr preserved +* [ ] exit codes preserved +* [ ] truncation enforced + +--- + +## 14. Minimal Acceptance Test + +```bash +docker exec container bash -lc "echo hello" +docker exec container bash -lc "cat /etc/passwd | grep root" +docker exec container bash -lc "cat large_file.txt" +``` + +Verify: + +* correct output +* truncated when large +* no noticeable delay beyond ~50ms + +--- + +## 15. Summary + +This system: + +* avoids BEAM startup overhead +* preserves Unix execution semantics +* centralizes control in Elixir +* remains simple and composable + +It matches the intended pattern: + +> “Use the real environment, intercept selectively, and control outputs centrally.”