This commit is contained in:
@@ -58,6 +58,10 @@ legend {
|
|||||||
@apply text-sm font-semibold text-cyan-400 px-1;
|
@apply text-sm font-semibold text-cyan-400 px-1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
button {
|
||||||
|
@apply cursor-pointer;
|
||||||
|
}
|
||||||
|
|
||||||
@import "./spinner.css";
|
@import "./spinner.css";
|
||||||
@import "./markdown.css";
|
@import "./markdown.css";
|
||||||
|
|
||||||
@@ -72,3 +76,4 @@ legend {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -19,9 +19,12 @@ import "phoenix_html";
|
|||||||
import { Socket } from "phoenix";
|
import { Socket } from "phoenix";
|
||||||
import { LiveSocket } from "phoenix_live_view";
|
import { LiveSocket } from "phoenix_live_view";
|
||||||
import topbar from "../vendor/topbar";
|
import topbar from "../vendor/topbar";
|
||||||
|
import { VoiceControl } from "./voice_control";
|
||||||
|
|
||||||
let Hooks = {};
|
let Hooks = {};
|
||||||
|
|
||||||
|
Hooks.VoiceControl = VoiceControl;
|
||||||
|
|
||||||
// Renders a complete markdown string client-side on mount.
|
// Renders a complete markdown string client-side on mount.
|
||||||
// The raw markdown is passed as the data-md attribute.
|
// The raw markdown is passed as the data-md attribute.
|
||||||
Hooks.MarkdownRender = {
|
Hooks.MarkdownRender = {
|
||||||
|
|||||||
164
assets/js/voice_control.js
Normal file
164
assets/js/voice_control.js
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
const VoiceControl = {
|
||||||
|
mounted() {
|
||||||
|
this._mediaRecorder = null;
|
||||||
|
this._chunks = [];
|
||||||
|
this._recording = false;
|
||||||
|
this._audioCtx = null;
|
||||||
|
this._analyser = null;
|
||||||
|
this._animFrame = null;
|
||||||
|
|
||||||
|
this._onKeyDown = (e) => {
|
||||||
|
// Ctrl+Space → start
|
||||||
|
if (e.ctrlKey && e.code === "Space" && !this._recording) {
|
||||||
|
e.preventDefault();
|
||||||
|
this.startRecording();
|
||||||
|
// Space alone → stop (prevent page scroll while recording)
|
||||||
|
} else if (
|
||||||
|
e.code === "Space" &&
|
||||||
|
!e.ctrlKey &&
|
||||||
|
!e.altKey &&
|
||||||
|
!e.metaKey &&
|
||||||
|
this._recording
|
||||||
|
) {
|
||||||
|
e.preventDefault();
|
||||||
|
this.stopRecording();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
window.addEventListener("keydown", this._onKeyDown);
|
||||||
|
|
||||||
|
// Button clicks dispatch DOM events to avoid a server round-trip
|
||||||
|
this.el.addEventListener("voice:start", () => this.startRecording());
|
||||||
|
this.el.addEventListener("voice:stop", () => this.stopRecording());
|
||||||
|
},
|
||||||
|
|
||||||
|
destroyed() {
|
||||||
|
window.removeEventListener("keydown", this._onKeyDown);
|
||||||
|
this._stopVisualization();
|
||||||
|
if (this._mediaRecorder && this._recording) {
|
||||||
|
this._mediaRecorder.stop();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
_startVisualization(stream) {
|
||||||
|
this._audioCtx = new AudioContext();
|
||||||
|
this._analyser = this._audioCtx.createAnalyser();
|
||||||
|
// 64 bins gives a clean bar chart without being too dense
|
||||||
|
this._analyser.fftSize = 64;
|
||||||
|
this._analyser.smoothingTimeConstant = 0.75;
|
||||||
|
|
||||||
|
const source = this._audioCtx.createMediaStreamSource(stream);
|
||||||
|
source.connect(this._analyser);
|
||||||
|
|
||||||
|
const bufferLength = this._analyser.frequencyBinCount; // 32
|
||||||
|
const dataArray = new Uint8Array(bufferLength);
|
||||||
|
|
||||||
|
const draw = () => {
|
||||||
|
this._animFrame = requestAnimationFrame(draw);
|
||||||
|
|
||||||
|
const canvas = document.getElementById("voice-viz-canvas");
|
||||||
|
if (!canvas) return;
|
||||||
|
const ctx = canvas.getContext("2d");
|
||||||
|
|
||||||
|
// Sync pixel buffer to CSS display size
|
||||||
|
const displayWidth = canvas.offsetWidth;
|
||||||
|
const displayHeight = canvas.offsetHeight;
|
||||||
|
if (canvas.width !== displayWidth) canvas.width = displayWidth;
|
||||||
|
if (canvas.height !== displayHeight) canvas.height = displayHeight;
|
||||||
|
|
||||||
|
this._analyser.getByteFrequencyData(dataArray);
|
||||||
|
|
||||||
|
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
||||||
|
|
||||||
|
const totalBars = bufferLength;
|
||||||
|
const barWidth = (canvas.width / totalBars) * 0.7;
|
||||||
|
const gap = canvas.width / totalBars - barWidth;
|
||||||
|
const radius = Math.max(2, barWidth / 4);
|
||||||
|
|
||||||
|
for (let i = 0; i < totalBars; i++) {
|
||||||
|
const value = dataArray[i] / 255;
|
||||||
|
const barHeight = Math.max(4, value * canvas.height);
|
||||||
|
const x = i * (barWidth + gap) + gap / 2;
|
||||||
|
const y = canvas.height - barHeight;
|
||||||
|
|
||||||
|
// Cyan at low amplitude → teal → green at high amplitude
|
||||||
|
const hue = 185 - value * 80;
|
||||||
|
const lightness = 40 + value * 25;
|
||||||
|
ctx.fillStyle = `hsl(${hue}, 90%, ${lightness}%)`;
|
||||||
|
|
||||||
|
ctx.beginPath();
|
||||||
|
ctx.roundRect(x, y, barWidth, barHeight, radius);
|
||||||
|
ctx.fill();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
draw();
|
||||||
|
},
|
||||||
|
|
||||||
|
_stopVisualization() {
|
||||||
|
if (this._animFrame) {
|
||||||
|
cancelAnimationFrame(this._animFrame);
|
||||||
|
this._animFrame = null;
|
||||||
|
}
|
||||||
|
if (this._audioCtx) {
|
||||||
|
this._audioCtx.close();
|
||||||
|
this._audioCtx = null;
|
||||||
|
this._analyser = null;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
async startRecording() {
|
||||||
|
let stream;
|
||||||
|
try {
|
||||||
|
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
} catch (err) {
|
||||||
|
console.error(
|
||||||
|
"VoiceControl: microphone access denied or unavailable",
|
||||||
|
err,
|
||||||
|
);
|
||||||
|
this.pushEvent("recording_error", { reason: err.message });
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
this._chunks = [];
|
||||||
|
this._mediaRecorder = new MediaRecorder(stream);
|
||||||
|
|
||||||
|
this._mediaRecorder.ondataavailable = (e) => {
|
||||||
|
if (e.data.size > 0) this._chunks.push(e.data);
|
||||||
|
};
|
||||||
|
|
||||||
|
this._mediaRecorder.onstop = () => {
|
||||||
|
const mimeType = this._mediaRecorder.mimeType;
|
||||||
|
const blob = new Blob(this._chunks, { type: mimeType });
|
||||||
|
|
||||||
|
const reader = new FileReader();
|
||||||
|
reader.onloadend = () => {
|
||||||
|
// reader.result is "data:<mime>;base64,<data>" — strip the prefix
|
||||||
|
const base64 = reader.result.split(",")[1];
|
||||||
|
this.pushEvent("audio_recorded", { data: base64, mime_type: mimeType });
|
||||||
|
};
|
||||||
|
reader.readAsDataURL(blob);
|
||||||
|
|
||||||
|
// Release the microphone indicator in the OS browser tab
|
||||||
|
stream.getTracks().forEach((t) => t.stop());
|
||||||
|
this._stopVisualization();
|
||||||
|
this._recording = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
this._mediaRecorder.start();
|
||||||
|
this._recording = true;
|
||||||
|
this.pushEvent("recording_started", {});
|
||||||
|
|
||||||
|
// Defer visualization start by one tick so LiveView has rendered the canvas
|
||||||
|
setTimeout(() => this._startVisualization(stream), 50);
|
||||||
|
},
|
||||||
|
|
||||||
|
stopRecording() {
|
||||||
|
if (this._mediaRecorder && this._mediaRecorder.state !== "inactive") {
|
||||||
|
this._mediaRecorder.stop();
|
||||||
|
// _recording flipped to false inside onstop after blob is ready
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
export { VoiceControl };
|
||||||
@@ -6,7 +6,8 @@ source!([".env", System.get_env()])
|
|||||||
config :elixir_ai,
|
config :elixir_ai,
|
||||||
ai_endpoint: System.get_env("AI_RESPONSES_ENDPOINT"),
|
ai_endpoint: System.get_env("AI_RESPONSES_ENDPOINT"),
|
||||||
ai_token: System.get_env("AI_TOKEN"),
|
ai_token: System.get_env("AI_TOKEN"),
|
||||||
ai_model: System.get_env("AI_MODEL")
|
ai_model: System.get_env("AI_MODEL"),
|
||||||
|
whisper_endpoint: System.get_env("WHISPER_ENDPOINT")
|
||||||
|
|
||||||
# config/runtime.exs is executed for all environments, including
|
# config/runtime.exs is executed for all environments, including
|
||||||
# during releases. It is executed after compilation and before the
|
# during releases. It is executed after compilation and before the
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ services:
|
|||||||
POSTGRES_USER: elixir_ai
|
POSTGRES_USER: elixir_ai
|
||||||
POSTGRES_PASSWORD: elixir_ai
|
POSTGRES_PASSWORD: elixir_ai
|
||||||
POSTGRES_DB: elixir_ai_dev
|
POSTGRES_DB: elixir_ai_dev
|
||||||
|
|
||||||
command: postgres -c hba_file=/etc/postgresql/pg_hba.conf
|
command: postgres -c hba_file=/etc/postgresql/pg_hba.conf
|
||||||
volumes:
|
volumes:
|
||||||
- ./postgres/schema/:/docker-entrypoint-initdb.d/
|
- ./postgres/schema/:/docker-entrypoint-initdb.d/
|
||||||
@@ -32,6 +33,7 @@ services:
|
|||||||
RELEASE_COOKIE: secret_cluster_cookie
|
RELEASE_COOKIE: secret_cluster_cookie
|
||||||
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
|
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
|
||||||
PROVIDERS_CONFIG_PATH: /app/providers.yml
|
PROVIDERS_CONFIG_PATH: /app/providers.yml
|
||||||
|
WHISPER_ENDPOINT: http://ai-office-server.beefalo-newton.ts.net:8082/inference
|
||||||
user: root
|
user: root
|
||||||
command: |
|
command: |
|
||||||
sh -c '
|
sh -c '
|
||||||
@@ -71,6 +73,7 @@ services:
|
|||||||
RELEASE_COOKIE: secret_cluster_cookie
|
RELEASE_COOKIE: secret_cluster_cookie
|
||||||
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
|
SECRET_KEY_BASE: F1nY5uSyD0HfoWejcuuQiaQoMQrjrlFigb3bJ7p4hTXwpTza6sPLpmd+jLS7p0Sh
|
||||||
PROVIDERS_CONFIG_PATH: /app/providers.yml
|
PROVIDERS_CONFIG_PATH: /app/providers.yml
|
||||||
|
WHISPER_ENDPOINT: http://ai-office-server.beefalo-newton.ts.net:8082/inference
|
||||||
user: root
|
user: root
|
||||||
command: |
|
command: |
|
||||||
sh -c '
|
sh -c '
|
||||||
|
|||||||
@@ -13,6 +13,8 @@ defmodule ElixirAi.Application do
|
|||||||
[Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
|
[Application.get_env(:libcluster, :topologies, []), [name: ElixirAi.ClusterSupervisor]]},
|
||||||
{Phoenix.PubSub, name: ElixirAi.PubSub},
|
{Phoenix.PubSub, name: ElixirAi.PubSub},
|
||||||
{ElixirAi.LiveViewPG, []},
|
{ElixirAi.LiveViewPG, []},
|
||||||
|
{ElixirAi.AudioProcessingPG, []},
|
||||||
|
{DynamicSupervisor, name: ElixirAi.AudioWorkerSupervisor, strategy: :one_for_one},
|
||||||
ElixirAi.ToolTesting,
|
ElixirAi.ToolTesting,
|
||||||
ElixirAiWeb.Endpoint,
|
ElixirAiWeb.Endpoint,
|
||||||
{Horde.Registry,
|
{Horde.Registry,
|
||||||
|
|||||||
47
lib/elixir_ai/audio/audio_processing.ex
Normal file
47
lib/elixir_ai/audio/audio_processing.ex
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
defmodule ElixirAi.AudioProcessing do
|
||||||
|
@moduledoc """
|
||||||
|
Public API for the demand-driven audio transcription pool.
|
||||||
|
|
||||||
|
Dispatch strategy:
|
||||||
|
1. Pick a random idle worker from the :available pg group.
|
||||||
|
2. If none are idle and the pool is below @max_workers, spawn a fresh worker
|
||||||
|
under AudioWorkerSupervisor and route the job directly to it.
|
||||||
|
3. If already at @max_workers, queue the job to a random existing worker via
|
||||||
|
its Erlang mailbox — it will process it when its current job finishes.
|
||||||
|
|
||||||
|
Scale-up is fully automatic (on demand). Scale-down is handled by each worker's
|
||||||
|
idle-timeout logic; workers exit after idling and the pool can reach 0.
|
||||||
|
"""
|
||||||
|
|
||||||
|
@max_workers 10
|
||||||
|
@all_group :all
|
||||||
|
@available_group :available
|
||||||
|
|
||||||
|
@doc """
|
||||||
|
Submit audio for transcription. The result is delivered asynchronously to
|
||||||
|
`caller_pid` as:
|
||||||
|
|
||||||
|
{:transcription_result, {:ok, text} | {:error, reason}}
|
||||||
|
"""
|
||||||
|
def submit(audio_binary, mime_type, caller_pid) do
|
||||||
|
case :pg.get_members(ElixirAi.AudioProcessingPG, @available_group) do
|
||||||
|
[] ->
|
||||||
|
all = :pg.get_members(ElixirAi.AudioProcessingPG, @all_group)
|
||||||
|
|
||||||
|
if length(all) < @max_workers do
|
||||||
|
{:ok, pid} =
|
||||||
|
DynamicSupervisor.start_child(ElixirAi.AudioWorkerSupervisor, ElixirAi.AudioWorker)
|
||||||
|
|
||||||
|
GenServer.cast(pid, {:transcribe, caller_pid, audio_binary, mime_type})
|
||||||
|
else
|
||||||
|
# At max capacity — overflow to a random worker's mailbox
|
||||||
|
GenServer.cast(Enum.random(all), {:transcribe, caller_pid, audio_binary, mime_type})
|
||||||
|
end
|
||||||
|
|
||||||
|
available ->
|
||||||
|
GenServer.cast(Enum.random(available), {:transcribe, caller_pid, audio_binary, mime_type})
|
||||||
|
end
|
||||||
|
|
||||||
|
:ok
|
||||||
|
end
|
||||||
|
end
|
||||||
20
lib/elixir_ai/audio/audio_processing_pg.ex
Normal file
20
lib/elixir_ai/audio/audio_processing_pg.ex
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
defmodule ElixirAi.AudioProcessingPG do
|
||||||
|
@moduledoc """
|
||||||
|
Named :pg scope for tracking audio transcription workers across the cluster.
|
||||||
|
|
||||||
|
Workers join two groups:
|
||||||
|
- :all — always a member while alive (used for pool-size accounting)
|
||||||
|
- :available — member only while idle (used for dispatch; left while processing)
|
||||||
|
|
||||||
|
:pg automatically removes dead processes, so no manual cleanup is needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def child_spec(_opts) do
|
||||||
|
%{
|
||||||
|
id: __MODULE__,
|
||||||
|
start: {:pg, :start_link, [__MODULE__]},
|
||||||
|
type: :worker,
|
||||||
|
restart: :permanent
|
||||||
|
}
|
||||||
|
end
|
||||||
|
end
|
||||||
114
lib/elixir_ai/audio/audio_worker.ex
Normal file
114
lib/elixir_ai/audio/audio_worker.ex
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
defmodule ElixirAi.AudioWorker do
|
||||||
|
@moduledoc """
|
||||||
|
GenServer that transcribes audio by posting to a Whisper-compatible HTTP endpoint.
|
||||||
|
|
||||||
|
Pool membership in AudioProcessingPG:
|
||||||
|
- :all — joined on init; left only on exit
|
||||||
|
- :available — joined on init and after each job; left while processing
|
||||||
|
|
||||||
|
This join/leave pattern lets the AudioProcessing dispatcher know which workers are
|
||||||
|
idle without any central coordinator. When a worker finishes a job it rejoins
|
||||||
|
:available and becomes eligible for the next dispatch.
|
||||||
|
|
||||||
|
Scale-down: workers exit after @idle_timeout_ms of inactivity, allowing the pool
|
||||||
|
to reach 0. New workers are spawned on demand when the next job arrives.
|
||||||
|
|
||||||
|
Results are delivered to the calling LiveView process as:
|
||||||
|
{:transcription_result, {:ok, text} | {:error, reason}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
use GenServer
|
||||||
|
require Logger
|
||||||
|
|
||||||
|
@all_group :all
|
||||||
|
@available_group :available
|
||||||
|
@idle_timeout_ms 30_000
|
||||||
|
|
||||||
|
def start_link(opts), do: GenServer.start_link(__MODULE__, opts)
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def init(_opts) do
|
||||||
|
:pg.join(ElixirAi.AudioProcessingPG, @all_group, self())
|
||||||
|
:pg.join(ElixirAi.AudioProcessingPG, @available_group, self())
|
||||||
|
schedule_idle_check()
|
||||||
|
{:ok, %{busy: false, idle_since: monotonic_sec()}}
|
||||||
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def handle_cast({:transcribe, caller_pid, audio_binary, mime_type}, state) do
|
||||||
|
:pg.leave(ElixirAi.AudioProcessingPG, @available_group, self())
|
||||||
|
worker = self()
|
||||||
|
|
||||||
|
Task.start(fn ->
|
||||||
|
result = do_transcribe(audio_binary, mime_type)
|
||||||
|
send(worker, {:transcription_done, caller_pid, result})
|
||||||
|
end)
|
||||||
|
|
||||||
|
{:noreply, %{state | busy: true}}
|
||||||
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def handle_info({:transcription_done, caller_pid, result}, state) do
|
||||||
|
send(caller_pid, {:transcription_result, result})
|
||||||
|
:pg.join(ElixirAi.AudioProcessingPG, @available_group, self())
|
||||||
|
{:noreply, %{state | busy: false, idle_since: monotonic_sec()}}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_info(:idle_check, %{busy: true} = state) do
|
||||||
|
schedule_idle_check()
|
||||||
|
{:noreply, state}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_info(:idle_check, %{busy: false, idle_since: idle_since} = state) do
|
||||||
|
idle_ms = (monotonic_sec() - idle_since) * 1000
|
||||||
|
|
||||||
|
if idle_ms >= @idle_timeout_ms do
|
||||||
|
Logger.debug("AudioWorker #{inspect(self())} exiting — idle for #{div(idle_ms, 1000)}s")
|
||||||
|
|
||||||
|
{:stop, :normal, state}
|
||||||
|
else
|
||||||
|
schedule_idle_check()
|
||||||
|
{:noreply, state}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
defp schedule_idle_check do
|
||||||
|
Process.send_after(self(), :idle_check, @idle_timeout_ms)
|
||||||
|
end
|
||||||
|
|
||||||
|
defp monotonic_sec, do: System.monotonic_time(:second)
|
||||||
|
|
||||||
|
defp filename_for(mime_type) do
|
||||||
|
cond do
|
||||||
|
String.starts_with?(mime_type, "audio/webm") -> "audio.webm"
|
||||||
|
String.starts_with?(mime_type, "audio/ogg") -> "audio.ogg"
|
||||||
|
String.starts_with?(mime_type, "audio/mp4") -> "audio.mp4"
|
||||||
|
true -> "audio.bin"
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
defp do_transcribe(audio_binary, mime_type) do
|
||||||
|
endpoint = Application.get_env(:elixir_ai, :whisper_endpoint)
|
||||||
|
filename = filename_for(mime_type)
|
||||||
|
|
||||||
|
case Req.post(endpoint,
|
||||||
|
form_multipart: [
|
||||||
|
file: {audio_binary, filename: filename, content_type: mime_type},
|
||||||
|
response_format: "json",
|
||||||
|
language: "en"
|
||||||
|
],
|
||||||
|
receive_timeout: 30_000
|
||||||
|
) do
|
||||||
|
{:ok, %{status: 200, body: %{"text" => text}}} ->
|
||||||
|
{:ok, String.trim(text)}
|
||||||
|
|
||||||
|
{:ok, %{status: status, body: body}} ->
|
||||||
|
Logger.warning("AudioWorker: Whisper returned HTTP #{status}: #{inspect(body)}")
|
||||||
|
{:error, {:http_error, status}}
|
||||||
|
|
||||||
|
{:error, reason} ->
|
||||||
|
Logger.error("AudioWorker: request failed: #{inspect(reason)}")
|
||||||
|
{:error, reason}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
@@ -130,9 +130,6 @@ defmodule ElixirAiWeb.ChatLive do
|
|||||||
{:noreply, assign(socket, streaming_response: nil, ai_error: nil)}
|
{:noreply, assign(socket, streaming_response: nil, ai_error: nil)}
|
||||||
end
|
end
|
||||||
|
|
||||||
# Fetches the authoritative streaming snapshot directly from the runner pid,
|
|
||||||
# bypassing the Horde registry. Sent to self immediately after subscribing on
|
|
||||||
# connect so it is the first message processed — before any PubSub chunks.
|
|
||||||
def handle_info(:sync_streaming, %{assigns: %{runner_pid: pid}} = socket)
|
def handle_info(:sync_streaming, %{assigns: %{runner_pid: pid}} = socket)
|
||||||
when is_pid(pid) do
|
when is_pid(pid) do
|
||||||
case GenServer.call(pid, :get_streaming_response) do
|
case GenServer.call(pid, :get_streaming_response) do
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
</script>
|
</script>
|
||||||
</head>
|
</head>
|
||||||
<body class="bg-cyan-900 text-cyan-50">
|
<body class="bg-cyan-900 text-cyan-50">
|
||||||
|
{live_render(@conn, ElixirAiWeb.VoiceLive, id: "voice-control")}
|
||||||
{@inner_content}
|
{@inner_content}
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|||||||
117
lib/elixir_ai_web/voice/voice_live.ex
Normal file
117
lib/elixir_ai_web/voice/voice_live.ex
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
defmodule ElixirAiWeb.VoiceLive do
|
||||||
|
use ElixirAiWeb, :live_view
|
||||||
|
require Logger
|
||||||
|
|
||||||
|
def mount(_params, _session, socket) do
|
||||||
|
{:ok, assign(socket, state: :idle, transcription: nil), layout: false}
|
||||||
|
end
|
||||||
|
|
||||||
|
def render(assigns) do
|
||||||
|
~H"""
|
||||||
|
<div id="voice-control-hook" phx-hook="VoiceControl">
|
||||||
|
<div class="fixed top-4 right-4 w-72 bg-cyan-950/95 border border-cyan-800 rounded-2xl shadow-2xl z-50 p-4 flex flex-col gap-3 backdrop-blur">
|
||||||
|
<div class="flex items-center gap-3">
|
||||||
|
<%= if @state == :idle do %>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" class="h-4 w-4 text-cyan-500 shrink-0" viewBox="0 0 24 24" fill="currentColor">
|
||||||
|
<path d="M12 1a4 4 0 0 1 4 4v7a4 4 0 0 1-8 0V5a4 4 0 0 1 4-4zm0 2a2 2 0 0 0-2 2v7a2 2 0 1 0 4 0V5a2 2 0 0 0-2-2zm-7 9a7 7 0 0 0 14 0h2a9 9 0 0 1-8 8.94V23h-2v-2.06A9 9 0 0 1 3 12H5z"/>
|
||||||
|
</svg>
|
||||||
|
<span class="text-cyan-400 font-semibold text-sm">Voice Input</span>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :recording do %>
|
||||||
|
<span class="relative flex h-3 w-3 shrink-0">
|
||||||
|
<span class="animate-ping absolute inline-flex h-full w-full rounded-full bg-red-500 opacity-75"></span>
|
||||||
|
<span class="relative inline-flex rounded-full h-3 w-3 bg-red-500"></span>
|
||||||
|
</span>
|
||||||
|
<span class="text-cyan-50 font-semibold text-sm">Recording</span>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :processing do %>
|
||||||
|
<span class="relative flex h-3 w-3 shrink-0">
|
||||||
|
<span class="animate-ping absolute inline-flex h-full w-full rounded-full bg-cyan-400 opacity-75"></span>
|
||||||
|
<span class="relative inline-flex rounded-full h-3 w-3 bg-cyan-400"></span>
|
||||||
|
</span>
|
||||||
|
<span class="text-cyan-50 font-semibold text-sm">Processing…</span>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :transcribed do %>
|
||||||
|
<span class="text-cyan-300 font-semibold text-sm">Transcription</span>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
|
<%= if @state in [:recording, :processing] do %>
|
||||||
|
<div id="voice-viz-wrapper" phx-update="ignore">
|
||||||
|
<canvas id="voice-viz-canvas" height="72" class="w-full rounded-lg bg-cyan-950 block">
|
||||||
|
</canvas>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :transcribed do %>
|
||||||
|
<div class="rounded-xl bg-cyan-900/60 border border-cyan-700 px-3 py-2">
|
||||||
|
<p class="text-sm text-cyan-50 leading-relaxed">{@transcription}</p>
|
||||||
|
</div>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :idle do %>
|
||||||
|
<button
|
||||||
|
phx-click={JS.dispatch("voice:start", to: "#voice-control-hook")}
|
||||||
|
class="w-full flex items-center justify-between px-3 py-1.5 rounded-lg bg-cyan-700 hover:bg-cyan-600 text-cyan-50 text-xs font-medium transition-colors"
|
||||||
|
>
|
||||||
|
<span>Start Recording</span>
|
||||||
|
<kbd class="text-cyan-300 bg-cyan-800 border border-cyan-600 px-1.5 py-0.5 rounded font-mono">Ctrl+Space</kbd>
|
||||||
|
</button>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :recording do %>
|
||||||
|
<button
|
||||||
|
phx-click={JS.dispatch("voice:stop", to: "#voice-control-hook")}
|
||||||
|
class="w-full flex items-center justify-between px-3 py-1.5 rounded-lg bg-cyan-800 hover:bg-cyan-700 text-cyan-50 text-xs font-medium transition-colors border border-cyan-700"
|
||||||
|
>
|
||||||
|
<span>Stop Recording</span>
|
||||||
|
<kbd class="text-cyan-300 bg-cyan-900 border border-cyan-700 px-1.5 py-0.5 rounded font-mono">Space</kbd>
|
||||||
|
</button>
|
||||||
|
<% end %>
|
||||||
|
<%= if @state == :transcribed do %>
|
||||||
|
<button
|
||||||
|
phx-click="dismiss_transcription"
|
||||||
|
class="text-xs text-cyan-500 hover:text-cyan-300 transition-colors text-center w-full"
|
||||||
|
>
|
||||||
|
Dismiss
|
||||||
|
</button>
|
||||||
|
<% end %>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_event("recording_started", _params, socket) do
|
||||||
|
{:noreply, assign(socket, state: :recording)}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_event("audio_recorded", %{"data" => base64, "mime_type" => mime_type}, socket) do
|
||||||
|
case Base.decode64(base64) do
|
||||||
|
{:ok, audio_binary} ->
|
||||||
|
Logger.info(
|
||||||
|
"VoiceLive: received #{byte_size(audio_binary)} bytes of audio (#{mime_type})"
|
||||||
|
)
|
||||||
|
|
||||||
|
ElixirAi.AudioProcessing.submit(audio_binary, mime_type, self())
|
||||||
|
{:noreply, assign(socket, state: :processing)}
|
||||||
|
|
||||||
|
:error ->
|
||||||
|
Logger.error("VoiceLive: failed to decode base64 audio data")
|
||||||
|
{:noreply, assign(socket, state: :idle)}
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_event("recording_error", %{"reason" => reason}, socket) do
|
||||||
|
Logger.warning("VoiceLive: recording error: #{reason}")
|
||||||
|
{:noreply, assign(socket, state: :idle)}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_event("dismiss_transcription", _params, socket) do
|
||||||
|
{:noreply, assign(socket, state: :idle, transcription: nil)}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_info({:transcription_result, {:ok, text}}, socket) do
|
||||||
|
{:noreply, assign(socket, state: :transcribed, transcription: text)}
|
||||||
|
end
|
||||||
|
|
||||||
|
def handle_info({:transcription_result, {:error, reason}}, socket) do
|
||||||
|
Logger.error("VoiceLive: transcription failed: #{inspect(reason)}")
|
||||||
|
{:noreply, assign(socket, state: :idle)}
|
||||||
|
end
|
||||||
|
end
|
||||||
109
streaming-outline.md
Normal file
109
streaming-outline.md
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
# Voice Recording + Whisper
|
||||||
|
|
||||||
|
Reference: [`alexmickelson/office-infrastructure`](https://github.com/alexmickelson/office-infrastructure/tree/master/ai-office-server/nginx/html)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recording
|
||||||
|
|
||||||
|
```js
|
||||||
|
let mediaStream = null;
|
||||||
|
let recorder = null;
|
||||||
|
let chunks = [];
|
||||||
|
|
||||||
|
async function startRecording() {
|
||||||
|
if (!mediaStream) {
|
||||||
|
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
|
||||||
|
}
|
||||||
|
|
||||||
|
const mimeType = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/ogg"]
|
||||||
|
.find((t) => MediaRecorder.isTypeSupported(t)) || "";
|
||||||
|
|
||||||
|
chunks = [];
|
||||||
|
recorder = new MediaRecorder(mediaStream, mimeType ? { mimeType } : undefined);
|
||||||
|
recorder.ondataavailable = (e) => { if (e.data?.size > 0) chunks.push(e.data); };
|
||||||
|
recorder.onstop = async () => {
|
||||||
|
const blob = new Blob(chunks, { type: recorder.mimeType || "audio/webm" });
|
||||||
|
const text = await sendToWhisper(blob, "your prompt context here");
|
||||||
|
console.log(text);
|
||||||
|
};
|
||||||
|
recorder.start(100); // fires ondataavailable every 100ms
|
||||||
|
}
|
||||||
|
|
||||||
|
function stopRecording() {
|
||||||
|
if (recorder?.state === "recording") recorder.stop();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Sending to Whisper
|
||||||
|
|
||||||
|
`POST {serverUrl}/inference` as `multipart/form-data`. Returns `{ "text": "..." }`.
|
||||||
|
|
||||||
|
```js
|
||||||
|
async function sendToWhisper(blob, prompt) {
|
||||||
|
const formData = new FormData();
|
||||||
|
formData.append("file", blob, "audio.webm");
|
||||||
|
formData.append("response_format", "json");
|
||||||
|
formData.append("language", "en"); // or "" for auto-detect
|
||||||
|
if (prompt) formData.append("prompt", prompt);
|
||||||
|
|
||||||
|
const res = await fetch("https://your-whisper-server/inference", {
|
||||||
|
method: "POST",
|
||||||
|
body: formData,
|
||||||
|
});
|
||||||
|
const data = await res.json();
|
||||||
|
return (data.text || "").trim();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The `prompt` field accepts the last ~20 words of prior transcript — Whisper uses it as context to improve continuity across chunks.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Visualization
|
||||||
|
|
||||||
|
Requires a `<canvas id="volumeCanvas">` in the HTML.
|
||||||
|
|
||||||
|
```js
|
||||||
|
const canvas = document.getElementById("volumeCanvas");
|
||||||
|
const ctx = canvas.getContext("2d");
|
||||||
|
const MAX_BARS = 180; // 6s × 30fps
|
||||||
|
const volHistory = [];
|
||||||
|
let vizRaf = null;
|
||||||
|
|
||||||
|
function startViz(stream) {
|
||||||
|
const audioCtx = new AudioContext();
|
||||||
|
const analyser = audioCtx.createAnalyser();
|
||||||
|
audioCtx.createMediaStreamSource(stream).connect(analyser);
|
||||||
|
analyser.fftSize = 1024;
|
||||||
|
const buf = new Uint8Array(analyser.frequencyBinCount);
|
||||||
|
|
||||||
|
function tick() {
|
||||||
|
vizRaf = requestAnimationFrame(tick);
|
||||||
|
analyser.getByteFrequencyData(buf);
|
||||||
|
const rms = Math.sqrt(buf.reduce((s, v) => s + v * v, 0) / buf.length) / 255;
|
||||||
|
volHistory.push(rms);
|
||||||
|
if (volHistory.length > MAX_BARS) volHistory.shift();
|
||||||
|
|
||||||
|
const W = canvas.offsetWidth * devicePixelRatio;
|
||||||
|
const H = canvas.offsetHeight * devicePixelRatio;
|
||||||
|
if (canvas.width !== W || canvas.height !== H) { canvas.width = W; canvas.height = H; }
|
||||||
|
ctx.clearRect(0, 0, W, H);
|
||||||
|
const barW = W / MAX_BARS;
|
||||||
|
volHistory.forEach((v, i) => {
|
||||||
|
ctx.fillStyle = `hsl(${120 - v * 120}, 80%, 45%)`; // green → red
|
||||||
|
ctx.fillRect(i * barW, H - v * H, Math.max(1, barW - 1), v * H);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
tick();
|
||||||
|
}
|
||||||
|
|
||||||
|
function stopViz() {
|
||||||
|
cancelAnimationFrame(vizRaf);
|
||||||
|
vizRaf = null;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Call `startViz(mediaStream)` right after `getUserMedia`, and `stopViz()` after `recorder.stop()`.
|
||||||
Reference in New Issue
Block a user