From 6fc4a686f89c02b150e202d394f9a8b90bfd9d21 Mon Sep 17 00:00:00 2001 From: Alex Mickelson Date: Fri, 20 Mar 2026 14:49:59 -0600 Subject: [PATCH] can transcribe in the ui --- assets/css/app.css | 5 + assets/js/app.js | 3 + assets/js/voice_control.js | 164 ++++++++++++++++++ config/runtime.exs | 3 +- docker-compose.yml | 3 + lib/elixir_ai/application.ex | 2 + lib/elixir_ai/audio/audio_processing.ex | 47 +++++ lib/elixir_ai/audio/audio_processing_pg.ex | 20 +++ lib/elixir_ai/audio/audio_worker.ex | 114 ++++++++++++ lib/elixir_ai_web/chat/chat_live.ex | 3 - .../components/layouts/root.html.heex | 1 + lib/elixir_ai_web/voice/voice_live.ex | 117 +++++++++++++ streaming-outline.md | 109 ++++++++++++ 13 files changed, 587 insertions(+), 4 deletions(-) create mode 100644 assets/js/voice_control.js create mode 100644 lib/elixir_ai/audio/audio_processing.ex create mode 100644 lib/elixir_ai/audio/audio_processing_pg.ex create mode 100644 lib/elixir_ai/audio/audio_worker.ex create mode 100644 lib/elixir_ai_web/voice/voice_live.ex create mode 100644 streaming-outline.md diff --git a/assets/css/app.css b/assets/css/app.css index ca5187e..98fdfe7 100644 --- a/assets/css/app.css +++ b/assets/css/app.css @@ -58,6 +58,10 @@ legend { @apply text-sm font-semibold text-cyan-400 px-1; } +button { + @apply cursor-pointer; +} + @import "./spinner.css"; @import "./markdown.css"; @@ -72,3 +76,4 @@ legend { + diff --git a/assets/js/app.js b/assets/js/app.js index 9d38d2f..fa369ea 100644 --- a/assets/js/app.js +++ b/assets/js/app.js @@ -19,9 +19,12 @@ import "phoenix_html"; import { Socket } from "phoenix"; import { LiveSocket } from "phoenix_live_view"; import topbar from "../vendor/topbar"; +import { VoiceControl } from "./voice_control"; let Hooks = {}; +Hooks.VoiceControl = VoiceControl; + // Renders a complete markdown string client-side on mount. // The raw markdown is passed as the data-md attribute. Hooks.MarkdownRender = { diff --git a/assets/js/voice_control.js b/assets/js/voice_control.js new file mode 100644 index 0000000..68db971 --- /dev/null +++ b/assets/js/voice_control.js @@ -0,0 +1,164 @@ +const VoiceControl = { + mounted() { + this._mediaRecorder = null; + this._chunks = []; + this._recording = false; + this._audioCtx = null; + this._analyser = null; + this._animFrame = null; + + this._onKeyDown = (e) => { + // Ctrl+Space → start + if (e.ctrlKey && e.code === "Space" && !this._recording) { + e.preventDefault(); + this.startRecording(); + // Space alone → stop (prevent page scroll while recording) + } else if ( + e.code === "Space" && + !e.ctrlKey && + !e.altKey && + !e.metaKey && + this._recording + ) { + e.preventDefault(); + this.stopRecording(); + } + }; + + window.addEventListener("keydown", this._onKeyDown); + + // Button clicks dispatch DOM events to avoid a server round-trip + this.el.addEventListener("voice:start", () => this.startRecording()); + this.el.addEventListener("voice:stop", () => this.stopRecording()); + }, + + destroyed() { + window.removeEventListener("keydown", this._onKeyDown); + this._stopVisualization(); + if (this._mediaRecorder && this._recording) { + this._mediaRecorder.stop(); + } + }, + + _startVisualization(stream) { + this._audioCtx = new AudioContext(); + this._analyser = this._audioCtx.createAnalyser(); + // 64 bins gives a clean bar chart without being too dense + this._analyser.fftSize = 64; + this._analyser.smoothingTimeConstant = 0.75; + + const source = this._audioCtx.createMediaStreamSource(stream); + source.connect(this._analyser); + + const bufferLength = this._analyser.frequencyBinCount; // 32 + const dataArray = new Uint8Array(bufferLength); + + const draw = () => { + this._animFrame = requestAnimationFrame(draw); + + const canvas = document.getElementById("voice-viz-canvas"); + if (!canvas) return; + const ctx = canvas.getContext("2d"); + + // Sync pixel buffer to CSS display size + const displayWidth = canvas.offsetWidth; + const displayHeight = canvas.offsetHeight; + if (canvas.width !== displayWidth) canvas.width = displayWidth; + if (canvas.height !== displayHeight) canvas.height = displayHeight; + + this._analyser.getByteFrequencyData(dataArray); + + ctx.clearRect(0, 0, canvas.width, canvas.height); + + const totalBars = bufferLength; + const barWidth = (canvas.width / totalBars) * 0.7; + const gap = canvas.width / totalBars - barWidth; + const radius = Math.max(2, barWidth / 4); + + for (let i = 0; i < totalBars; i++) { + const value = dataArray[i] / 255; + const barHeight = Math.max(4, value * canvas.height); + const x = i * (barWidth + gap) + gap / 2; + const y = canvas.height - barHeight; + + // Cyan at low amplitude → teal → green at high amplitude + const hue = 185 - value * 80; + const lightness = 40 + value * 25; + ctx.fillStyle = `hsl(${hue}, 90%, ${lightness}%)`; + + ctx.beginPath(); + ctx.roundRect(x, y, barWidth, barHeight, radius); + ctx.fill(); + } + }; + + draw(); + }, + + _stopVisualization() { + if (this._animFrame) { + cancelAnimationFrame(this._animFrame); + this._animFrame = null; + } + if (this._audioCtx) { + this._audioCtx.close(); + this._audioCtx = null; + this._analyser = null; + } + }, + + async startRecording() { + let stream; + try { + stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + } catch (err) { + console.error( + "VoiceControl: microphone access denied or unavailable", + err, + ); + this.pushEvent("recording_error", { reason: err.message }); + return; + } + + this._chunks = []; + this._mediaRecorder = new MediaRecorder(stream); + + this._mediaRecorder.ondataavailable = (e) => { + if (e.data.size > 0) this._chunks.push(e.data); + }; + + this._mediaRecorder.onstop = () => { + const mimeType = this._mediaRecorder.mimeType; + const blob = new Blob(this._chunks, { type: mimeType }); + + const reader = new FileReader(); + reader.onloadend = () => { + // reader.result is "data:;base64," — strip the prefix + const base64 = reader.result.split(",")[1]; + this.pushEvent("audio_recorded", { data: base64, mime_type: mimeType }); + }; + reader.readAsDataURL(blob); + + // Release the microphone indicator in the OS browser tab + stream.getTracks().forEach((t) => t.stop()); + this._stopVisualization(); + this._recording = false; + }; + + this._mediaRecorder.start(); + this._recording = true; + this.pushEvent("recording_started", {}); + + // Defer visualization start by one tick so LiveView has rendered the canvas + setTimeout(() => this._startVisualization(stream), 50); + }, + + stopRecording() { + if (this._mediaRecorder && this._mediaRecorder.state !== "inactive") { + this._mediaRecorder.stop(); + // _recording flipped to false inside onstop after blob is ready + } + }, +}; + +export { VoiceControl }; diff --git a/config/runtime.exs b/config/runtime.exs index 428ba83..9798219 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -6,7 +6,8 @@ source!([".env", System.get_env()]) config :elixir_ai, ai_endpoint: System.get_env("AI_RESPONSES_ENDPOINT"), ai_token: System.get_env("AI_TOKEN"), - ai_model: System.get_env("AI_MODEL") + ai_model: System.get_env("AI_MODEL"), + whisper_endpoint: System.get_env("WHISPER_ENDPOINT") # config/runtime.exs is executed for all environments, including # during releases. It is executed after compilation and before the diff --git a/docker-compose.yml b/docker-compose.yml index aa69d4b..7493389 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,6 +5,7 @@ services: POSTGRES_USER: elixir_ai POSTGRES_PASSWORD: elixir_ai POSTGRES_DB: elixir_ai_dev + command: postgres -c hba_file=/etc/postgresql/pg_hba.conf volumes: - ./postgres/schema/:/docker-entrypoint-initdb.d/ @@ -32,6 +33,7 @@ services: RELEASE_COOKIE: secret_cluster_cookie SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh PROVIDERS_CONFIG_PATH: /app/providers.yml + WHISPER_ENDPOINT: http://ai-office-server.beefalo-newton.ts.net:8082/inference user: root command: | sh -c ' @@ -71,6 +73,7 @@ services: RELEASE_COOKIE: secret_cluster_cookie SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh PROVIDERS_CONFIG_PATH: /app/providers.yml + WHISPER_ENDPOINT: http://ai-office-server.beefalo-newton.ts.net:8082/inference user: root command: | sh -c ' diff --git a/lib/elixir_ai/application.ex b/lib/elixir_ai/application.ex index b6ceae3..005720b 100644 --- a/lib/elixir_ai/application.ex +++ b/lib/elixir_ai/application.ex @@ -13,6 +13,8 @@ defmodule ElixirAi.Application do [Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]}, {Phoenix.PubSub, name: ElixirAi.PubSub}, {ElixirAi.LiveViewPG, []}, + {ElixirAi.AudioProcessingPG, []}, + {DynamicSupervisor, name: ElixirAi.AudioWorkerSupervisor, strategy: :one_for_one}, ElixirAi.ToolTesting, ElixirAiWeb.Endpoint, {Horde.Registry, diff --git a/lib/elixir_ai/audio/audio_processing.ex b/lib/elixir_ai/audio/audio_processing.ex new file mode 100644 index 0000000..08ec191 --- /dev/null +++ b/lib/elixir_ai/audio/audio_processing.ex @@ -0,0 +1,47 @@ +defmodule ElixirAi.AudioProcessing do + @moduledoc """ + Public API for the demand-driven audio transcription pool. + + Dispatch strategy: + 1. Pick a random idle worker from the :available pg group. + 2. If none are idle and the pool is below @max_workers, spawn a fresh worker + under AudioWorkerSupervisor and route the job directly to it. + 3. If already at @max_workers, queue the job to a random existing worker via + its Erlang mailbox — it will process it when its current job finishes. + + Scale-up is fully automatic (on demand). Scale-down is handled by each worker's + idle-timeout logic; workers exit after idling and the pool can reach 0. + """ + + @max_workers 10 + @all_group :all + @available_group :available + + @doc """ + Submit audio for transcription. The result is delivered asynchronously to + `caller_pid` as: + + {:transcription_result, {:ok, text} | {:error, reason}} + """ + def submit(audio_binary, mime_type, caller_pid) do + case :pg.get_members(ElixirAi.AudioProcessingPG, @available_group) do + [] -> + all = :pg.get_members(ElixirAi.AudioProcessingPG, @all_group) + + if length(all) < @max_workers do + {:ok, pid} = + DynamicSupervisor.start_child(ElixirAi.AudioWorkerSupervisor, ElixirAi.AudioWorker) + + GenServer.cast(pid, {:transcribe, caller_pid, audio_binary, mime_type}) + else + # At max capacity — overflow to a random worker's mailbox + GenServer.cast(Enum.random(all), {:transcribe, caller_pid, audio_binary, mime_type}) + end + + available -> + GenServer.cast(Enum.random(available), {:transcribe, caller_pid, audio_binary, mime_type}) + end + + :ok + end +end diff --git a/lib/elixir_ai/audio/audio_processing_pg.ex b/lib/elixir_ai/audio/audio_processing_pg.ex new file mode 100644 index 0000000..bd10025 --- /dev/null +++ b/lib/elixir_ai/audio/audio_processing_pg.ex @@ -0,0 +1,20 @@ +defmodule ElixirAi.AudioProcessingPG do + @moduledoc """ + Named :pg scope for tracking audio transcription workers across the cluster. + + Workers join two groups: + - :all — always a member while alive (used for pool-size accounting) + - :available — member only while idle (used for dispatch; left while processing) + + :pg automatically removes dead processes, so no manual cleanup is needed. + """ + + def child_spec(_opts) do + %{ + id: __MODULE__, + start: {:pg, :start_link, [__MODULE__]}, + type: :worker, + restart: :permanent + } + end +end diff --git a/lib/elixir_ai/audio/audio_worker.ex b/lib/elixir_ai/audio/audio_worker.ex new file mode 100644 index 0000000..8a601c0 --- /dev/null +++ b/lib/elixir_ai/audio/audio_worker.ex @@ -0,0 +1,114 @@ +defmodule ElixirAi.AudioWorker do + @moduledoc """ + GenServer that transcribes audio by posting to a Whisper-compatible HTTP endpoint. + + Pool membership in AudioProcessingPG: + - :all — joined on init; left only on exit + - :available — joined on init and after each job; left while processing + + This join/leave pattern lets the AudioProcessing dispatcher know which workers are + idle without any central coordinator. When a worker finishes a job it rejoins + :available and becomes eligible for the next dispatch. + + Scale-down: workers exit after @idle_timeout_ms of inactivity, allowing the pool + to reach 0. New workers are spawned on demand when the next job arrives. + + Results are delivered to the calling LiveView process as: + {:transcription_result, {:ok, text} | {:error, reason}} + """ + + use GenServer + require Logger + + @all_group :all + @available_group :available + @idle_timeout_ms 30_000 + + def start_link(opts), do: GenServer.start_link(__MODULE__, opts) + + @impl true + def init(_opts) do + :pg.join(ElixirAi.AudioProcessingPG, @all_group, self()) + :pg.join(ElixirAi.AudioProcessingPG, @available_group, self()) + schedule_idle_check() + {:ok, %{busy: false, idle_since: monotonic_sec()}} + end + + @impl true + def handle_cast({:transcribe, caller_pid, audio_binary, mime_type}, state) do + :pg.leave(ElixirAi.AudioProcessingPG, @available_group, self()) + worker = self() + + Task.start(fn -> + result = do_transcribe(audio_binary, mime_type) + send(worker, {:transcription_done, caller_pid, result}) + end) + + {:noreply, %{state | busy: true}} + end + + @impl true + def handle_info({:transcription_done, caller_pid, result}, state) do + send(caller_pid, {:transcription_result, result}) + :pg.join(ElixirAi.AudioProcessingPG, @available_group, self()) + {:noreply, %{state | busy: false, idle_since: monotonic_sec()}} + end + + def handle_info(:idle_check, %{busy: true} = state) do + schedule_idle_check() + {:noreply, state} + end + + def handle_info(:idle_check, %{busy: false, idle_since: idle_since} = state) do + idle_ms = (monotonic_sec() - idle_since) * 1000 + + if idle_ms >= @idle_timeout_ms do + Logger.debug("AudioWorker #{inspect(self())} exiting — idle for #{div(idle_ms, 1000)}s") + + {:stop, :normal, state} + else + schedule_idle_check() + {:noreply, state} + end + end + + defp schedule_idle_check do + Process.send_after(self(), :idle_check, @idle_timeout_ms) + end + + defp monotonic_sec, do: System.monotonic_time(:second) + + defp filename_for(mime_type) do + cond do + String.starts_with?(mime_type, "audio/webm") -> "audio.webm" + String.starts_with?(mime_type, "audio/ogg") -> "audio.ogg" + String.starts_with?(mime_type, "audio/mp4") -> "audio.mp4" + true -> "audio.bin" + end + end + + defp do_transcribe(audio_binary, mime_type) do + endpoint = Application.get_env(:elixir_ai, :whisper_endpoint) + filename = filename_for(mime_type) + + case Req.post(endpoint, + form_multipart: [ + file: {audio_binary, filename: filename, content_type: mime_type}, + response_format: "json", + language: "en" + ], + receive_timeout: 30_000 + ) do + {:ok, %{status: 200, body: %{"text" => text}}} -> + {:ok, String.trim(text)} + + {:ok, %{status: status, body: body}} -> + Logger.warning("AudioWorker: Whisper returned HTTP #{status}: #{inspect(body)}") + {:error, {:http_error, status}} + + {:error, reason} -> + Logger.error("AudioWorker: request failed: #{inspect(reason)}") + {:error, reason} + end + end +end diff --git a/lib/elixir_ai_web/chat/chat_live.ex b/lib/elixir_ai_web/chat/chat_live.ex index 4aafcee..418d5a1 100644 --- a/lib/elixir_ai_web/chat/chat_live.ex +++ b/lib/elixir_ai_web/chat/chat_live.ex @@ -130,9 +130,6 @@ defmodule ElixirAiWeb.ChatLive do {:noreply, assign(socket, streaming_response: nil, ai_error: nil)} end - # Fetches the authoritative streaming snapshot directly from the runner pid, - # bypassing the Horde registry. Sent to self immediately after subscribing on - # connect so it is the first message processed — before any PubSub chunks. def handle_info(:sync_streaming, %{assigns: %{runner_pid: pid}} = socket) when is_pid(pid) do case GenServer.call(pid, :get_streaming_response) do diff --git a/lib/elixir_ai_web/components/layouts/root.html.heex b/lib/elixir_ai_web/components/layouts/root.html.heex index b74e100..7c423d8 100644 --- a/lib/elixir_ai_web/components/layouts/root.html.heex +++ b/lib/elixir_ai_web/components/layouts/root.html.heex @@ -18,6 +18,7 @@ + {live_render(@conn, ElixirAiWeb.VoiceLive, id: "voice-control")} {@inner_content} diff --git a/lib/elixir_ai_web/voice/voice_live.ex b/lib/elixir_ai_web/voice/voice_live.ex new file mode 100644 index 0000000..3be2adb --- /dev/null +++ b/lib/elixir_ai_web/voice/voice_live.ex @@ -0,0 +1,117 @@ +defmodule ElixirAiWeb.VoiceLive do + use ElixirAiWeb, :live_view + require Logger + + def mount(_params, _session, socket) do + {:ok, assign(socket, state: :idle, transcription: nil), layout: false} + end + + def render(assigns) do + ~H""" +
+
+
+ <%= if @state == :idle do %> + + + + Voice Input + <% end %> + <%= if @state == :recording do %> + + + + + Recording + <% end %> + <%= if @state == :processing do %> + + + + + Processing… + <% end %> + <%= if @state == :transcribed do %> + Transcription + <% end %> +
+ <%= if @state in [:recording, :processing] do %> +
+ + +
+ <% end %> + <%= if @state == :transcribed do %> +
+

{@transcription}

+
+ <% end %> + <%= if @state == :idle do %> + + <% end %> + <%= if @state == :recording do %> + + <% end %> + <%= if @state == :transcribed do %> + + <% end %> +
+
+ """ + end + + def handle_event("recording_started", _params, socket) do + {:noreply, assign(socket, state: :recording)} + end + + def handle_event("audio_recorded", %{"data" => base64, "mime_type" => mime_type}, socket) do + case Base.decode64(base64) do + {:ok, audio_binary} -> + Logger.info( + "VoiceLive: received #{byte_size(audio_binary)} bytes of audio (#{mime_type})" + ) + + ElixirAi.AudioProcessing.submit(audio_binary, mime_type, self()) + {:noreply, assign(socket, state: :processing)} + + :error -> + Logger.error("VoiceLive: failed to decode base64 audio data") + {:noreply, assign(socket, state: :idle)} + end + end + + def handle_event("recording_error", %{"reason" => reason}, socket) do + Logger.warning("VoiceLive: recording error: #{reason}") + {:noreply, assign(socket, state: :idle)} + end + + def handle_event("dismiss_transcription", _params, socket) do + {:noreply, assign(socket, state: :idle, transcription: nil)} + end + + def handle_info({:transcription_result, {:ok, text}}, socket) do + {:noreply, assign(socket, state: :transcribed, transcription: text)} + end + + def handle_info({:transcription_result, {:error, reason}}, socket) do + Logger.error("VoiceLive: transcription failed: #{inspect(reason)}") + {:noreply, assign(socket, state: :idle)} + end +end diff --git a/streaming-outline.md b/streaming-outline.md new file mode 100644 index 0000000..a4977fa --- /dev/null +++ b/streaming-outline.md @@ -0,0 +1,109 @@ +# Voice Recording + Whisper + +Reference: [`alexmickelson/office-infrastructure`](https://github.com/alexmickelson/office-infrastructure/tree/master/ai-office-server/nginx/html) + +--- + +## Recording + +```js +let mediaStream = null; +let recorder = null; +let chunks = []; + +async function startRecording() { + if (!mediaStream) { + mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false }); + } + + const mimeType = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/ogg"] + .find((t) => MediaRecorder.isTypeSupported(t)) || ""; + + chunks = []; + recorder = new MediaRecorder(mediaStream, mimeType ? { mimeType } : undefined); + recorder.ondataavailable = (e) => { if (e.data?.size > 0) chunks.push(e.data); }; + recorder.onstop = async () => { + const blob = new Blob(chunks, { type: recorder.mimeType || "audio/webm" }); + const text = await sendToWhisper(blob, "your prompt context here"); + console.log(text); + }; + recorder.start(100); // fires ondataavailable every 100ms +} + +function stopRecording() { + if (recorder?.state === "recording") recorder.stop(); +} +``` + +--- + +## Sending to Whisper + +`POST {serverUrl}/inference` as `multipart/form-data`. Returns `{ "text": "..." }`. + +```js +async function sendToWhisper(blob, prompt) { + const formData = new FormData(); + formData.append("file", blob, "audio.webm"); + formData.append("response_format", "json"); + formData.append("language", "en"); // or "" for auto-detect + if (prompt) formData.append("prompt", prompt); + + const res = await fetch("https://your-whisper-server/inference", { + method: "POST", + body: formData, + }); + const data = await res.json(); + return (data.text || "").trim(); +} +``` + +The `prompt` field accepts the last ~20 words of prior transcript — Whisper uses it as context to improve continuity across chunks. + +--- + +## Visualization + +Requires a `` in the HTML. + +```js +const canvas = document.getElementById("volumeCanvas"); +const ctx = canvas.getContext("2d"); +const MAX_BARS = 180; // 6s × 30fps +const volHistory = []; +let vizRaf = null; + +function startViz(stream) { + const audioCtx = new AudioContext(); + const analyser = audioCtx.createAnalyser(); + audioCtx.createMediaStreamSource(stream).connect(analyser); + analyser.fftSize = 1024; + const buf = new Uint8Array(analyser.frequencyBinCount); + + function tick() { + vizRaf = requestAnimationFrame(tick); + analyser.getByteFrequencyData(buf); + const rms = Math.sqrt(buf.reduce((s, v) => s + v * v, 0) / buf.length) / 255; + volHistory.push(rms); + if (volHistory.length > MAX_BARS) volHistory.shift(); + + const W = canvas.offsetWidth * devicePixelRatio; + const H = canvas.offsetHeight * devicePixelRatio; + if (canvas.width !== W || canvas.height !== H) { canvas.width = W; canvas.height = H; } + ctx.clearRect(0, 0, W, H); + const barW = W / MAX_BARS; + volHistory.forEach((v, i) => { + ctx.fillStyle = `hsl(${120 - v * 120}, 80%, 45%)`; // green → red + ctx.fillRect(i * barW, H - v * H, Math.max(1, barW - 1), v * H); + }); + } + tick(); +} + +function stopViz() { + cancelAnimationFrame(vizRaf); + vizRaf = null; +} +``` + +Call `startViz(mediaStream)` right after `getUserMedia`, and `stopViz()` after `recorder.stop()`.