can transcribe in the ui

2026-03-20 14:49:59 -06:00
parent 85eb8bcefa
commit 6fc4a686f8
13 changed files with 587 additions and 4 deletions
--- a/streaming-outline.md
+++ b/streaming-outline.md
@@ -0,0 +1,109 @@
+# Voice Recording + Whisper
+
+Reference: [`alexmickelson/office-infrastructure`](https://github.com/alexmickelson/office-infrastructure/tree/master/ai-office-server/nginx/html)
+
+---
+
+## Recording
+
+```js
+let mediaStream = null;
+let recorder = null;
+let chunks = [];
+
+async function startRecording() {
+  if (!mediaStream) {
+    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
+  }
+
+  const mimeType = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/ogg"]
+    .find((t) => MediaRecorder.isTypeSupported(t)) || "";
+
+  chunks = [];
+  recorder = new MediaRecorder(mediaStream, mimeType ? { mimeType } : undefined);
+  recorder.ondataavailable = (e) => { if (e.data?.size > 0) chunks.push(e.data); };
+  recorder.onstop = async () => {
+    const blob = new Blob(chunks, { type: recorder.mimeType || "audio/webm" });
+    const text = await sendToWhisper(blob, "your prompt context here");
+    console.log(text);
+  };
+  recorder.start(100); // fires ondataavailable every 100ms
+}
+
+function stopRecording() {
+  if (recorder?.state === "recording") recorder.stop();
+}
+```
+
+---
+
+## Sending to Whisper
+
+`POST {serverUrl}/inference` as `multipart/form-data`. Returns `{ "text": "..." }`.
+
+```js
+async function sendToWhisper(blob, prompt) {
+  const formData = new FormData();
+  formData.append("file", blob, "audio.webm");
+  formData.append("response_format", "json");
+  formData.append("language", "en"); // or "" for auto-detect
+  if (prompt) formData.append("prompt", prompt);
+
+  const res = await fetch("https://your-whisper-server/inference", {
+    method: "POST",
+    body: formData,
+  });
+  const data = await res.json();
+  return (data.text || "").trim();
+}
+```
+
+The `prompt` field accepts the last ~20 words of prior transcript — Whisper uses it as context to improve continuity across chunks.
+
+---
+
+## Visualization
+
+Requires a `<canvas id="volumeCanvas">` in the HTML.
+
+```js
+const canvas = document.getElementById("volumeCanvas");
+const ctx = canvas.getContext("2d");
+const MAX_BARS = 180; // 6s × 30fps
+const volHistory = [];
+let vizRaf = null;
+
+function startViz(stream) {
+  const audioCtx = new AudioContext();
+  const analyser = audioCtx.createAnalyser();
+  audioCtx.createMediaStreamSource(stream).connect(analyser);
+  analyser.fftSize = 1024;
+  const buf = new Uint8Array(analyser.frequencyBinCount);
+
+  function tick() {
+    vizRaf = requestAnimationFrame(tick);
+    analyser.getByteFrequencyData(buf);
+    const rms = Math.sqrt(buf.reduce((s, v) => s + v * v, 0) / buf.length) / 255;
+    volHistory.push(rms);
+    if (volHistory.length > MAX_BARS) volHistory.shift();
+
+    const W = canvas.offsetWidth * devicePixelRatio;
+    const H = canvas.offsetHeight * devicePixelRatio;
+    if (canvas.width !== W || canvas.height !== H) { canvas.width = W; canvas.height = H; }
+    ctx.clearRect(0, 0, W, H);
+    const barW = W / MAX_BARS;
+    volHistory.forEach((v, i) => {
+      ctx.fillStyle = `hsl(${120 - v * 120}, 80%, 45%)`; // green → red
+      ctx.fillRect(i * barW, H - v * H, Math.max(1, barW - 1), v * H);
+    });
+  }
+  tick();
+}
+
+function stopViz() {
+  cancelAnimationFrame(vizRaf);
+  vizRaf = null;
+}
+```
+
+Call `startViz(mediaStream)` right after `getUserMedia`, and `stopViz()` after `recorder.stop()`.