Audio transcription support (#18398)

* install new packages for transcription support
* add config options
* audio maintainer modifications to support transcription
* pass main config to audio process
* embeddings support
* api and transcription post processor
* embeddings maintainer support for post processor
* live audio transcription with sherpa and faster-whisper
* update dispatcher with live transcription topic
* frontend websocket
* frontend live transcription
* frontend changes for speech events
* i18n changes
* docs
* mqtt docs
* fix linter
* use float16 and small model on gpu for real-time
* fix return value and use requestor to embed description instead of passing embeddings
* run real-time transcription in its own thread
* tweaks
* publish live transcriptions on their own topic instead of tracked_object_update
* config validator and docs
* clarify docs
2026-04-28 23:06:13 +02:00 · 2025-05-27 10:26:00 -05:00
parent 2385c403ee
commit 6dc36fcbb4
29 changed files with 2322 additions and 51 deletions
--- a/frigate/api/classification.py
+++ b/frigate/api/classification.py
@@ -14,7 +14,10 @@ from peewee import DoesNotExist
 from playhouse.shortcuts import model_to_dict

 from frigate.api.auth import require_role
-from frigate.api.defs.request.classification_body import RenameFaceBody
+from frigate.api.defs.request.classification_body import (
+    AudioTranscriptionBody,
+    RenameFaceBody,
+)
 from frigate.api.defs.tags import Tags
 from frigate.config.camera import DetectConfig
 from frigate.const import FACE_DIR
@@ -384,3 +387,58 @@ def reindex_embeddings(request: Request):
            },
            status_code=500,
        )
+
+
+@router.put("/audio/transcribe")  # PUT endpoint: request transcription of one event's audio
+def transcribe_audio(request: Request, body: AudioTranscriptionBody):
+    event_id = body.event_id
+    # Look up the event named in the request body; respond 404 if no such row exists.
+    try:
+        event = Event.get(Event.id == event_id)
+    except DoesNotExist:
+        message = f"Event {event_id} not found"
+        logger.error(message)
+        return JSONResponse(
+            content=({"success": False, "message": message}), status_code=404
+        )
+    # Respond 400 unless transcription is enabled for the event's camera. NOTE(review): assumes event.camera is still in the config - confirm.
+    if not request.app.frigate_config.cameras[event.camera].audio_transcription.enabled:
+        message = f"Audio transcription is not enabled for {event.camera}."
+        logger.error(message)
+        return JSONResponse(
+            content=(
+                {
+                    "success": False,
+                    "message": message,
+                }
+            ),
+            status_code=400,
+        )
+    # Hand the event, converted to a plain dict, to the embeddings context stored on the app.
+    context: EmbeddingsContext = request.app.embeddings
+    response = context.transcribe_audio(model_to_dict(event))
+    # Map the returned status string to an HTTP response: "started" -> 202, "in_progress" -> 409, anything else -> 500.
+    if response == "started":
+        return JSONResponse(
+            content={
+                "success": True,
+                "message": "Audio transcription has started.",
+            },
+            status_code=202,  # 202 Accepted
+        )
+    elif response == "in_progress":
+        return JSONResponse(
+            content={
+                "success": False,
+                "message": "Audio transcription for a speech event is currently in progress. Try again later.",
+            },
+            status_code=409,  # 409 Conflict
+        )
+    else:
+        return JSONResponse(
+            content={
+                "success": False,
+                "message": "Failed to transcribe audio.",
+            },
+            status_code=500,  # 500 Internal Server Error
+        )