Can transcribe in the UI
Some checks failed
CI/CD Pipeline / build (push) Failing after 4s

This commit is contained in:
2026-03-20 14:49:59 -06:00
parent 85eb8bcefa
commit 6fc4a686f8
13 changed files with 587 additions and 4 deletions

View File

@@ -13,6 +13,8 @@ defmodule ElixirAi.Application do
[Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
{Phoenix.PubSub, name: ElixirAi.PubSub},
{ElixirAi.LiveViewPG, []},
{ElixirAi.AudioProcessingPG, []},
{DynamicSupervisor, name: ElixirAi.AudioWorkerSupervisor, strategy: :one_for_one},
ElixirAi.ToolTesting,
ElixirAiWeb.Endpoint,
{Horde.Registry,

View File

@@ -0,0 +1,47 @@
defmodule ElixirAi.AudioProcessing do
  @moduledoc """
  Public API for the demand-driven audio transcription pool.

  Dispatch strategy:
  1. Pick a random idle worker from the :available pg group.
  2. If none are idle and the pool is below @max_workers, spawn a fresh worker
     under AudioWorkerSupervisor and route the job directly to it.
  3. If already at @max_workers, queue the job to a random existing worker via
     its Erlang mailbox — it will process it when its current job finishes.

  If spawning a new worker fails, the job falls back to an existing worker when
  one exists; otherwise the failure is reported asynchronously to `caller_pid`
  instead of crashing the caller.

  Scale-up is fully automatic (on demand). Scale-down is handled by each worker's
  idle-timeout logic; workers exit after idling and the pool can reach 0.
  """
  @max_workers 10
  @all_group :all
  @available_group :available

  @doc """
  Submit audio for transcription. The result is delivered asynchronously to
  `caller_pid` as:

      {:transcription_result, {:ok, text} | {:error, reason}}

  Always returns `:ok`; all outcomes (including dispatch failures) arrive as
  messages.
  """
  def submit(audio_binary, mime_type, caller_pid) do
    job = {:transcribe, caller_pid, audio_binary, mime_type}

    case :pg.get_members(ElixirAi.AudioProcessingPG, @available_group) do
      [] ->
        dispatch_without_idle_worker(job, caller_pid)

      available ->
        GenServer.cast(Enum.random(available), job)
    end

    :ok
  end

  # No idle worker: grow the pool if below the cap, otherwise overflow to a
  # random busy worker's mailbox.
  defp dispatch_without_idle_worker(job, caller_pid) do
    all = :pg.get_members(ElixirAi.AudioProcessingPG, @all_group)

    if length(all) < @max_workers do
      case DynamicSupervisor.start_child(ElixirAi.AudioWorkerSupervisor, ElixirAi.AudioWorker) do
        {:ok, pid} ->
          GenServer.cast(pid, job)

        {:error, reason} ->
          # Don't crash the caller (a LiveView): reuse an existing worker when
          # possible, otherwise report the failure asynchronously.
          case all do
            [] -> send(caller_pid, {:transcription_result, {:error, {:worker_start_failed, reason}}})
            workers -> GenServer.cast(Enum.random(workers), job)
          end
      end
    else
      # At max capacity — overflow to a random worker's mailbox
      GenServer.cast(Enum.random(all), job)
    end
  end
end

View File

@@ -0,0 +1,20 @@
defmodule ElixirAi.AudioProcessingPG do
  @moduledoc """
  Dedicated :pg scope used to track audio transcription workers cluster-wide.

  Each worker belongs to up to two groups in this scope:

  - :all — joined for the worker's entire lifetime; used to count pool size
  - :available — joined only while idle; used for dispatch and left during a job

  Dead members are pruned by :pg itself, so no explicit cleanup is required.
  """

  # Child spec that starts the :pg scope process under this module's name,
  # suitable for placement in a supervision tree.
  def child_spec(_opts) do
    scope = __MODULE__

    %{
      id: scope,
      type: :worker,
      restart: :permanent,
      start: {:pg, :start_link, [scope]}
    }
  end
end

View File

@@ -0,0 +1,114 @@
defmodule ElixirAi.AudioWorker do
  @moduledoc """
  GenServer that transcribes audio by posting to a Whisper-compatible HTTP endpoint.

  Pool membership in AudioProcessingPG:
  - :all — joined on init; left only on exit
  - :available — joined on init and after each job; left while processing

  This join/leave pattern lets the AudioProcessing dispatcher know which workers are
  idle without any central coordinator. When a worker finishes a job it rejoins
  :available and becomes eligible for the next dispatch.

  Jobs that arrive while the worker is busy are buffered in an internal FIFO queue
  and run strictly one at a time — the dispatcher's "overflow to a busy worker"
  strategy relies on this. The in-flight task is monitored so a crash is reported
  to the caller instead of leaving the worker busy forever.

  Scale-down: workers exit after @idle_timeout_ms of inactivity, allowing the pool
  to reach 0. New workers are spawned on demand when the next job arrives.

  Results are delivered to the calling LiveView process as:

      {:transcription_result, {:ok, text} | {:error, reason}}
  """
  use GenServer
  require Logger

  @all_group :all
  @available_group :available
  @idle_timeout_ms 30_000

  def start_link(opts), do: GenServer.start_link(__MODULE__, opts)

  @impl true
  def init(_opts) do
    :pg.join(ElixirAi.AudioProcessingPG, @all_group, self())
    :pg.join(ElixirAi.AudioProcessingPG, @available_group, self())
    schedule_idle_check()

    {:ok,
     %{
       busy: false,
       idle_since: monotonic_sec(),
       # Jobs received while busy; drained FIFO as each job completes.
       queue: :queue.new(),
       # Monitor ref of the in-flight transcription task (nil when idle).
       task_ref: nil,
       # Pid awaiting the in-flight result (nil when idle).
       caller: nil
     }}
  end

  @impl true
  # Busy: buffer the job instead of spawning a second concurrent task.
  # (Spawning immediately would both break the one-job-at-a-time contract the
  # dispatcher's overflow strategy assumes and defeat the @max_workers cap.)
  def handle_cast({:transcribe, caller_pid, audio_binary, mime_type}, %{busy: true} = state) do
    {:noreply, %{state | queue: :queue.in({caller_pid, audio_binary, mime_type}, state.queue)}}
  end

  def handle_cast({:transcribe, caller_pid, audio_binary, mime_type}, state) do
    :pg.leave(ElixirAi.AudioProcessingPG, @available_group, self())
    {:noreply, start_job({caller_pid, audio_binary, mime_type}, state)}
  end

  @impl true
  def handle_info({:transcription_done, caller_pid, result}, state) do
    send(caller_pid, {:transcription_result, result})
    # Flush so the task's subsequent :DOWN message never reaches handle_info.
    if state.task_ref, do: Process.demonitor(state.task_ref, [:flush])
    {:noreply, finish_job(state)}
  end

  # The transcription task died before reporting. Previously this wedged the
  # worker as permanently busy with the caller waiting forever; now we report
  # the failure and move on to any queued work.
  def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do
    Logger.error("AudioWorker: transcription task crashed: #{inspect(reason)}")
    if state.caller, do: send(state.caller, {:transcription_result, {:error, {:task_crashed, reason}}})
    {:noreply, finish_job(state)}
  end

  # Stale :DOWN from an already-completed task — ignore.
  def handle_info({:DOWN, _ref, :process, _pid, _reason}, state), do: {:noreply, state}

  def handle_info(:idle_check, %{busy: true} = state) do
    schedule_idle_check()
    {:noreply, state}
  end

  def handle_info(:idle_check, %{busy: false, idle_since: idle_since} = state) do
    idle_ms = (monotonic_sec() - idle_since) * 1000

    if idle_ms >= @idle_timeout_ms do
      Logger.debug("AudioWorker #{inspect(self())} exiting — idle for #{div(idle_ms, 1000)}s")
      {:stop, :normal, state}
    else
      schedule_idle_check()
      {:noreply, state}
    end
  end

  # Runs the HTTP call in a monitored task so a crash cannot wedge the worker.
  defp start_job({caller_pid, audio_binary, mime_type}, state) do
    worker = self()

    {:ok, task_pid} =
      Task.start(fn ->
        send(worker, {:transcription_done, caller_pid, do_transcribe(audio_binary, mime_type)})
      end)

    ref = Process.monitor(task_pid)
    %{state | busy: true, task_ref: ref, caller: caller_pid}
  end

  # Starts the next queued job if any; otherwise rejoins :available and idles.
  defp finish_job(state) do
    case :queue.out(state.queue) do
      {{:value, job}, rest} ->
        start_job(job, %{state | queue: rest})

      {:empty, _} ->
        :pg.join(ElixirAi.AudioProcessingPG, @available_group, self())
        %{state | busy: false, task_ref: nil, caller: nil, idle_since: monotonic_sec()}
    end
  end

  defp schedule_idle_check do
    Process.send_after(self(), :idle_check, @idle_timeout_ms)
  end

  defp monotonic_sec, do: System.monotonic_time(:second)

  # Map the browser-reported MIME type to a filename the Whisper server can
  # use to infer the container format.
  defp filename_for(mime_type) do
    cond do
      String.starts_with?(mime_type, "audio/webm") -> "audio.webm"
      String.starts_with?(mime_type, "audio/ogg") -> "audio.ogg"
      String.starts_with?(mime_type, "audio/mp4") -> "audio.mp4"
      true -> "audio.bin"
    end
  end

  # POSTs the audio as multipart form data to the configured Whisper endpoint.
  # Returns {:ok, text} | {:error, reason}; HTTP-level failures never raise.
  defp do_transcribe(audio_binary, mime_type) do
    endpoint = Application.get_env(:elixir_ai, :whisper_endpoint)
    filename = filename_for(mime_type)

    case Req.post(endpoint,
           form_multipart: [
             file: {audio_binary, filename: filename, content_type: mime_type},
             response_format: "json",
             language: "en"
           ],
           receive_timeout: 30_000
         ) do
      {:ok, %{status: 200, body: %{"text" => text}}} ->
        {:ok, String.trim(text)}

      {:ok, %{status: status, body: body}} ->
        Logger.warning("AudioWorker: Whisper returned HTTP #{status}: #{inspect(body)}")
        {:error, {:http_error, status}}

      {:error, reason} ->
        Logger.error("AudioWorker: request failed: #{inspect(reason)}")
        {:error, reason}
    end
  end
end

View File

@@ -130,9 +130,6 @@ defmodule ElixirAiWeb.ChatLive do
{:noreply, assign(socket, streaming_response: nil, ai_error: nil)}
end
# Fetches the authoritative streaming snapshot directly from the runner pid,
# bypassing the Horde registry. Sent to self immediately after subscribing on
# connect so it is the first message processed — before any PubSub chunks.
def handle_info(:sync_streaming, %{assigns: %{runner_pid: pid}} = socket)
when is_pid(pid) do
case GenServer.call(pid, :get_streaming_response) do

View File

@@ -18,6 +18,7 @@
</script>
</head>
<body class="bg-cyan-900 text-cyan-50">
{live_render(@conn, ElixirAiWeb.VoiceLive, id: "voice-control")}
{@inner_content}
</body>
</html>

View File

@@ -0,0 +1,117 @@
defmodule ElixirAiWeb.VoiceLive do
  @moduledoc """
  Floating voice-input panel rendered as a standalone LiveView (`layout: false`).

  State machine (`@state`): :idle → :recording → :processing → :transcribed → :idle.
  Recording itself happens client-side in the `VoiceControl` JS hook; this
  LiveView receives the captured audio as base64, submits it to
  `ElixirAi.AudioProcessing`, and displays the asynchronous
  `{:transcription_result, ...}` message when it arrives.
  """
  use ElixirAiWeb, :live_view
  require Logger
  # `layout: false` — this LiveView renders its own fixed-position panel and
  # is embedded via live_render rather than through the app layout.
  def mount(_params, _session, socket) do
    {:ok, assign(socket, state: :idle, transcription: nil), layout: false}
  end
  # Renders one of four panel variants keyed off @state. The canvas wrapper is
  # phx-update="ignore" so the JS visualizer owns its DOM while LiveView patches
  # around it. Buttons dispatch custom DOM events consumed by the hook.
  def render(assigns) do
    ~H"""
    <div id="voice-control-hook" phx-hook="VoiceControl">
      <div class="fixed top-4 right-4 w-72 bg-cyan-950/95 border border-cyan-800 rounded-2xl shadow-2xl z-50 p-4 flex flex-col gap-3 backdrop-blur">
        <div class="flex items-center gap-3">
          <%= if @state == :idle do %>
            <svg xmlns="http://www.w3.org/2000/svg" class="h-4 w-4 text-cyan-500 shrink-0" viewBox="0 0 24 24" fill="currentColor">
              <path d="M12 1a4 4 0 0 1 4 4v7a4 4 0 0 1-8 0V5a4 4 0 0 1 4-4zm0 2a2 2 0 0 0-2 2v7a2 2 0 1 0 4 0V5a2 2 0 0 0-2-2zm-7 9a7 7 0 0 0 14 0h2a9 9 0 0 1-8 8.94V23h-2v-2.06A9 9 0 0 1 3 12H5z"/>
            </svg>
            <span class="text-cyan-400 font-semibold text-sm">Voice Input</span>
          <% end %>
          <%= if @state == :recording do %>
            <span class="relative flex h-3 w-3 shrink-0">
              <span class="animate-ping absolute inline-flex h-full w-full rounded-full bg-red-500 opacity-75"></span>
              <span class="relative inline-flex rounded-full h-3 w-3 bg-red-500"></span>
            </span>
            <span class="text-cyan-50 font-semibold text-sm">Recording</span>
          <% end %>
          <%= if @state == :processing do %>
            <span class="relative flex h-3 w-3 shrink-0">
              <span class="animate-ping absolute inline-flex h-full w-full rounded-full bg-cyan-400 opacity-75"></span>
              <span class="relative inline-flex rounded-full h-3 w-3 bg-cyan-400"></span>
            </span>
            <span class="text-cyan-50 font-semibold text-sm">Processing…</span>
          <% end %>
          <%= if @state == :transcribed do %>
            <span class="text-cyan-300 font-semibold text-sm">Transcription</span>
          <% end %>
        </div>
        <%= if @state in [:recording, :processing] do %>
          <div id="voice-viz-wrapper" phx-update="ignore">
            <canvas id="voice-viz-canvas" height="72" class="w-full rounded-lg bg-cyan-950 block">
            </canvas>
          </div>
        <% end %>
        <%= if @state == :transcribed do %>
          <div class="rounded-xl bg-cyan-900/60 border border-cyan-700 px-3 py-2">
            <p class="text-sm text-cyan-50 leading-relaxed">{@transcription}</p>
          </div>
        <% end %>
        <%= if @state == :idle do %>
          <button
            phx-click={JS.dispatch("voice:start", to: "#voice-control-hook")}
            class="w-full flex items-center justify-between px-3 py-1.5 rounded-lg bg-cyan-700 hover:bg-cyan-600 text-cyan-50 text-xs font-medium transition-colors"
          >
            <span>Start Recording</span>
            <kbd class="text-cyan-300 bg-cyan-800 border border-cyan-600 px-1.5 py-0.5 rounded font-mono">Ctrl+Space</kbd>
          </button>
        <% end %>
        <%= if @state == :recording do %>
          <button
            phx-click={JS.dispatch("voice:stop", to: "#voice-control-hook")}
            class="w-full flex items-center justify-between px-3 py-1.5 rounded-lg bg-cyan-800 hover:bg-cyan-700 text-cyan-50 text-xs font-medium transition-colors border border-cyan-700"
          >
            <span>Stop Recording</span>
            <kbd class="text-cyan-300 bg-cyan-900 border border-cyan-700 px-1.5 py-0.5 rounded font-mono">Space</kbd>
          </button>
        <% end %>
        <%= if @state == :transcribed do %>
          <button
            phx-click="dismiss_transcription"
            class="text-xs text-cyan-500 hover:text-cyan-300 transition-colors text-center w-full"
          >
            Dismiss
          </button>
        <% end %>
      </div>
    </div>
    """
  end
  # Pushed by the JS hook once the MediaRecorder actually starts.
  def handle_event("recording_started", _params, socket) do
    {:noreply, assign(socket, state: :recording)}
  end
  # Pushed by the JS hook with the captured clip. On decode success the audio is
  # handed to the worker pool; the {:transcription_result, _} reply arrives later
  # in handle_info. A decode failure resets the panel to :idle.
  # NOTE(review): assumes the hook base64-encodes the whole clip in one event —
  # large recordings may exceed LiveView message limits; confirm hook behavior.
  def handle_event("audio_recorded", %{"data" => base64, "mime_type" => mime_type}, socket) do
    case Base.decode64(base64) do
      {:ok, audio_binary} ->
        Logger.info(
          "VoiceLive: received #{byte_size(audio_binary)} bytes of audio (#{mime_type})"
        )
        ElixirAi.AudioProcessing.submit(audio_binary, mime_type, self())
        {:noreply, assign(socket, state: :processing)}
      :error ->
        Logger.error("VoiceLive: failed to decode base64 audio data")
        {:noreply, assign(socket, state: :idle)}
    end
  end
  # Client-side capture failure (e.g. mic permission denied) — back to idle.
  def handle_event("recording_error", %{"reason" => reason}, socket) do
    Logger.warning("VoiceLive: recording error: #{reason}")
    {:noreply, assign(socket, state: :idle)}
  end
  def handle_event("dismiss_transcription", _params, socket) do
    {:noreply, assign(socket, state: :idle, transcription: nil)}
  end
  # Async result from the AudioWorker pool (success).
  def handle_info({:transcription_result, {:ok, text}}, socket) do
    {:noreply, assign(socket, state: :transcribed, transcription: text)}
  end
  # Async result from the AudioWorker pool (failure) — log and reset silently.
  def handle_info({:transcription_result, {:error, reason}}, socket) do
    Logger.error("VoiceLive: transcription failed: #{inspect(reason)}")
    {:noreply, assign(socket, state: :idle)}
  end
end