Audio transcription support (#18398)

* install new packages for transcription support

* add config options

* audio maintainer modifications to support transcription

* pass main config to audio process

* embeddings support

* api and transcription post processor

* embeddings maintainer support for post processor

* live audio transcription with sherpa and faster-whisper

* update dispatcher with live transcription topic

* frontend websocket

* frontend live transcription

* frontend changes for speech events

* i18n changes

* docs

* MQTT docs

* fix linter

* use float16 and small model on GPU for real-time

* fix return value and use requestor to embed description instead of passing embeddings

* run real-time transcription in its own thread

* tweaks

* publish live transcriptions on their own topic instead of tracked_object_update

* config validator and docs

* clarify docs
This commit is contained in:
Josh Hawkins
2025-05-27 10:26:00 -05:00
committed by Blake Blackshear
parent 2385c403ee
commit 6dc36fcbb4
29 changed files with 2322 additions and 51 deletions

View File

@@ -75,13 +75,13 @@ export default function ExploreView({
}, {});
}, [events]);
const trackedObjectUpdate = useTrackedObjectUpdate();
const { payload: wsUpdate } = useTrackedObjectUpdate();
useEffect(() => {
mutate();
// mutate / revalidate when event description updates come in
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [trackedObjectUpdate]);
if (wsUpdate && wsUpdate.type == "description") {
mutate();
}
}, [wsUpdate, mutate]);
// update search detail when results change

View File

@@ -1,5 +1,7 @@
import {
useAudioLiveTranscription,
useAudioState,
useAudioTranscriptionState,
useAutotrackingState,
useDetectState,
useEnabledState,
@@ -90,6 +92,8 @@ import {
LuX,
} from "react-icons/lu";
import {
MdClosedCaption,
MdClosedCaptionDisabled,
MdNoPhotography,
MdOutlineRestartAlt,
MdPersonOff,
@@ -197,6 +201,29 @@ export default function LiveCameraView({
const { payload: enabledState } = useEnabledState(camera.name);
const cameraEnabled = enabledState === "ON";
// for audio transcriptions
const { payload: audioTranscriptionState, send: sendTranscription } =
useAudioTranscriptionState(camera.name);
const { payload: transcription } = useAudioLiveTranscription(camera.name);
const transcriptionRef = useRef<HTMLDivElement>(null);
useEffect(() => {
if (transcription) {
if (transcriptionRef.current) {
transcriptionRef.current.scrollTop =
transcriptionRef.current.scrollHeight;
}
}
}, [transcription]);
useEffect(() => {
return () => {
// disable transcriptions when unmounting
if (audioTranscriptionState == "ON") sendTranscription("OFF");
};
}, [audioTranscriptionState, sendTranscription]);
// click overlay for ptzs
const [clickOverlay, setClickOverlay] = useState(false);
@@ -567,6 +594,9 @@ export default function LiveCameraView({
autotrackingEnabled={
camera.onvif.autotracking.enabled_in_config
}
transcriptionEnabled={
camera.audio_transcription.enabled_in_config
}
fullscreen={fullscreen}
streamName={streamName ?? ""}
setStreamName={setStreamName}
@@ -626,6 +656,16 @@ export default function LiveCameraView({
/>
</div>
</TransformComponent>
{camera?.audio?.enabled_in_config &&
audioTranscriptionState == "ON" &&
transcription != null && (
<div
ref={transcriptionRef}
className="text-md scrollbar-container absolute bottom-4 left-1/2 max-h-[15vh] w-[75%] -translate-x-1/2 overflow-y-auto rounded-lg bg-black/70 p-2 text-white md:w-[50%]"
>
{transcription}
</div>
)}
</div>
</div>
{camera.onvif.host != "" && (
@@ -984,6 +1024,7 @@ type FrigateCameraFeaturesProps = {
recordingEnabled: boolean;
audioDetectEnabled: boolean;
autotrackingEnabled: boolean;
transcriptionEnabled: boolean;
fullscreen: boolean;
streamName: string;
setStreamName?: (value: string | undefined) => void;
@@ -1003,6 +1044,7 @@ function FrigateCameraFeatures({
recordingEnabled,
audioDetectEnabled,
autotrackingEnabled,
transcriptionEnabled,
fullscreen,
streamName,
setStreamName,
@@ -1035,6 +1077,8 @@ function FrigateCameraFeatures({
const { payload: audioState, send: sendAudio } = useAudioState(camera.name);
const { payload: autotrackingState, send: sendAutotracking } =
useAutotrackingState(camera.name);
const { payload: transcriptionState, send: sendTranscription } =
useAudioTranscriptionState(camera.name);
// roles
@@ -1198,6 +1242,27 @@ function FrigateCameraFeatures({
disabled={!cameraEnabled}
/>
)}
{audioDetectEnabled && transcriptionEnabled && (
<CameraFeatureToggle
className="p-2 md:p-0"
variant={fullscreen ? "overlay" : "primary"}
Icon={
transcriptionState == "ON"
? MdClosedCaption
: MdClosedCaptionDisabled
}
isActive={transcriptionState == "ON"}
title={
transcriptionState == "ON"
? t("transcription.disable")
: t("transcription.enable")
}
onClick={() =>
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
}
disabled={!cameraEnabled || audioState == "OFF"}
/>
)}
{autotrackingEnabled && (
<CameraFeatureToggle
className="p-2 md:p-0"
@@ -1562,6 +1627,16 @@ function FrigateCameraFeatures({
}
/>
)}
{audioDetectEnabled && transcriptionEnabled && (
<FilterSwitch
label={t("cameraSettings.transcription")}
disabled={audioState == "OFF"}
isChecked={transcriptionState == "ON"}
onCheckedChange={() =>
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
}
/>
)}
{autotrackingEnabled && (
<FilterSwitch
label={t("cameraSettings.autotracking")}