Audio transcription support (#18398)

* install new packages for transcription support

* add config options

* audio maintainer modifications to support transcription

* pass main config to audio process

* embeddings support

* api and transcription post processor

* embeddings maintainer support for post processor

* live audio transcription with sherpa and faster-whisper

* update dispatcher with live transcription topic

* frontend websocket

* frontend live transcription

* frontend changes for speech events

* i18n changes

* docs

* mqtt docs

* fix linter

* use float16 and small model on gpu for real-time

* fix return value and use requestor to embed description instead of passing embeddings

* run real-time transcription in its own thread

* tweaks

* publish live transcriptions on their own topic instead of tracked_object_update

* config validator and docs

* clarify docs
This commit is contained in:
Josh Hawkins
2025-05-27 10:26:00 -05:00
committed by Blake Blackshear
parent 2385c403ee
commit 6dc36fcbb4
29 changed files with 2322 additions and 51 deletions

View File

@@ -8,6 +8,7 @@ import {
FrigateReview,
ModelState,
ToggleableSetting,
TrackedObjectUpdateReturnType,
} from "@/types/ws";
import { FrigateStats } from "@/types/stats";
import { createContainer } from "react-tracked";
@@ -60,6 +61,7 @@ function useValue(): useValueReturn {
enabled,
snapshots,
audio,
audio_transcription,
notifications,
notifications_suspended,
autotracking,
@@ -71,6 +73,9 @@ function useValue(): useValueReturn {
cameraStates[`${name}/detect/state`] = detect ? "ON" : "OFF";
cameraStates[`${name}/snapshots/state`] = snapshots ? "ON" : "OFF";
cameraStates[`${name}/audio/state`] = audio ? "ON" : "OFF";
cameraStates[`${name}/audio_transcription/state`] = audio_transcription
? "ON"
: "OFF";
cameraStates[`${name}/notifications/state`] = notifications
? "ON"
: "OFF";
@@ -220,6 +225,20 @@ export function useAudioState(camera: string): {
return { payload: payload as ToggleableSetting, send };
}
/**
 * Websocket-backed toggle state for a camera's audio transcription feature.
 * Subscribes to `{camera}/audio_transcription/state` and publishes changes
 * on `{camera}/audio_transcription/set`.
 */
export function useAudioTranscriptionState(camera: string): {
  payload: ToggleableSetting;
  send: (payload: ToggleableSetting, retain?: boolean) => void;
} {
  const ws = useWs(
    `${camera}/audio_transcription/state`,
    `${camera}/audio_transcription/set`,
  );
  // payload arrives untyped from the socket; callers expect "ON" | "OFF"
  return { payload: ws.value.payload as ToggleableSetting, send: ws.send };
}
export function useAutotrackingState(camera: string): {
payload: ToggleableSetting;
send: (payload: ToggleableSetting, retain?: boolean) => void;
@@ -421,6 +440,15 @@ export function useAudioActivity(camera: string): { payload: number } {
return { payload: payload as number };
}
/**
 * Read-only websocket subscription to a camera's live audio transcription
 * text on `{camera}/audio/transcription`. No publish topic (send is unused).
 */
export function useAudioLiveTranscription(camera: string): {
  payload: string;
} {
  const ws = useWs(`${camera}/audio/transcription`, "");
  return { payload: ws.value.payload as string };
}
export function useMotionThreshold(camera: string): {
payload: string;
send: (payload: number, retain?: boolean) => void;
@@ -463,11 +491,16 @@ export function useImproveContrast(camera: string): {
return { payload: payload as ToggleableSetting, send };
}
export function useTrackedObjectUpdate(): { payload: string } {
export function useTrackedObjectUpdate(): {
payload: TrackedObjectUpdateReturnType;
} {
const {
value: { payload },
} = useWs("tracked_object_update", "");
return useDeepMemo(JSON.parse(payload as string));
const parsed = payload
? JSON.parse(payload as string)
: { type: "", id: "", camera: "" };
return { payload: useDeepMemo(parsed) };
}
export function useNotifications(camera: string): {

View File

@@ -78,6 +78,7 @@ import { TbFaceId } from "react-icons/tb";
import { useIsAdmin } from "@/hooks/use-is-admin";
import FaceSelectionDialog from "../FaceSelectionDialog";
import { getTranslatedLabel } from "@/utils/i18n";
import { CgTranscript } from "react-icons/cg";
const SEARCH_TABS = [
"details",
@@ -710,6 +711,34 @@ function ObjectDetailsTab({
[search, t],
);
// speech transcription
const onTranscribe = useCallback(() => {
axios
.put(`/audio/transcribe`, { event_id: search.id })
.then((resp) => {
if (resp.status == 202) {
toast.success(t("details.item.toast.success.audioTranscription"), {
position: "top-center",
});
}
})
.catch((error) => {
const errorMessage =
error.response?.data?.message ||
error.response?.data?.detail ||
"Unknown error";
toast.error(
t("details.item.toast.error.audioTranscription", {
errorMessage,
}),
{
position: "top-center",
},
);
});
}, [search, t]);
return (
<div className="flex flex-col gap-5">
<div className="flex w-full flex-row">
@@ -893,6 +922,16 @@ function ObjectDetailsTab({
</Button>
</FaceSelectionDialog>
)}
{config?.cameras[search?.camera].audio_transcription.enabled &&
search?.label == "speech" &&
search?.end_time && (
<Button className="w-full" onClick={onTranscribe}>
<div className="flex gap-1">
<CgTranscript />
{t("itemMenu.audioTranscription.label")}
</div>
</Button>
)}
</div>
</div>
</div>

View File

@@ -257,15 +257,13 @@ export default function Explore() {
// mutation and revalidation
const trackedObjectUpdate = useTrackedObjectUpdate();
const { payload: wsUpdate } = useTrackedObjectUpdate();
useEffect(() => {
if (trackedObjectUpdate) {
if (wsUpdate && wsUpdate.type == "description") {
mutate();
}
// mutate / revalidate when event description updates come in
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [trackedObjectUpdate]);
}, [wsUpdate, mutate]);
// embeddings reindex progress

View File

@@ -41,6 +41,11 @@ export interface CameraConfig {
min_volume: number;
num_threads: number;
};
audio_transcription: {
enabled: boolean;
enabled_in_config: boolean;
live_enabled: boolean;
};
best_image_timeout: number;
birdseye: {
enabled: boolean;
@@ -296,6 +301,10 @@ export interface FrigateConfig {
num_threads: number;
};
audio_transcription: {
enabled: boolean;
};
birdseye: BirdseyeConfig;
cameras: {

View File

@@ -58,6 +58,7 @@ export interface FrigateCameraState {
snapshots: boolean;
record: boolean;
audio: boolean;
audio_transcription: boolean;
notifications: boolean;
notifications_suspended: number;
autotracking: boolean;
@@ -84,3 +85,21 @@ export type EmbeddingsReindexProgressType = {
};
export type ToggleableSetting = "ON" | "OFF";
/**
 * Discriminator for `tracked_object_update` websocket messages, identifying
 * which enrichment produced the update: event description, license plate
 * recognition ("lpr"), audio transcription, or face recognition.
 */
export type TrackedObjectUpdateType =
  | "description"
  | "lpr"
  | "transcription"
  | "face";
/**
 * Parsed payload of a `tracked_object_update` message, or `null` when no
 * message has been received. `type`, `id`, and `camera` are always present
 * (a placeholder with empty strings is used before the first message).
 * NOTE(review): the optional fields appear to vary by `type` (e.g. `plate`
 * for "lpr", `text` for "transcription") — confirm against the publisher.
 */
export type TrackedObjectUpdateReturnType = {
  type: TrackedObjectUpdateType;
  id: string;
  camera: string;
  description?: string;
  name?: string;
  plate?: string;
  score?: number;
  timestamp?: number;
  text?: string;
} | null;

View File

@@ -75,13 +75,13 @@ export default function ExploreView({
}, {});
}, [events]);
const trackedObjectUpdate = useTrackedObjectUpdate();
const { payload: wsUpdate } = useTrackedObjectUpdate();
useEffect(() => {
mutate();
// mutate / revalidate when event description updates come in
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [trackedObjectUpdate]);
if (wsUpdate && wsUpdate.type == "description") {
mutate();
}
}, [wsUpdate, mutate]);
// update search detail when results change

View File

@@ -1,5 +1,7 @@
import {
useAudioLiveTranscription,
useAudioState,
useAudioTranscriptionState,
useAutotrackingState,
useDetectState,
useEnabledState,
@@ -90,6 +92,8 @@ import {
LuX,
} from "react-icons/lu";
import {
MdClosedCaption,
MdClosedCaptionDisabled,
MdNoPhotography,
MdOutlineRestartAlt,
MdPersonOff,
@@ -197,6 +201,29 @@ export default function LiveCameraView({
const { payload: enabledState } = useEnabledState(camera.name);
const cameraEnabled = enabledState === "ON";
// for audio transcriptions
const { payload: audioTranscriptionState, send: sendTranscription } =
useAudioTranscriptionState(camera.name);
const { payload: transcription } = useAudioLiveTranscription(camera.name);
const transcriptionRef = useRef<HTMLDivElement>(null);
useEffect(() => {
if (transcription) {
if (transcriptionRef.current) {
transcriptionRef.current.scrollTop =
transcriptionRef.current.scrollHeight;
}
}
}, [transcription]);
useEffect(() => {
return () => {
// disable transcriptions when unmounting
if (audioTranscriptionState == "ON") sendTranscription("OFF");
};
}, [audioTranscriptionState, sendTranscription]);
// click overlay for ptzs
const [clickOverlay, setClickOverlay] = useState(false);
@@ -567,6 +594,9 @@ export default function LiveCameraView({
autotrackingEnabled={
camera.onvif.autotracking.enabled_in_config
}
transcriptionEnabled={
camera.audio_transcription.enabled_in_config
}
fullscreen={fullscreen}
streamName={streamName ?? ""}
setStreamName={setStreamName}
@@ -626,6 +656,16 @@ export default function LiveCameraView({
/>
</div>
</TransformComponent>
{camera?.audio?.enabled_in_config &&
audioTranscriptionState == "ON" &&
transcription != null && (
<div
ref={transcriptionRef}
className="text-md scrollbar-container absolute bottom-4 left-1/2 max-h-[15vh] w-[75%] -translate-x-1/2 overflow-y-auto rounded-lg bg-black/70 p-2 text-white md:w-[50%]"
>
{transcription}
</div>
)}
</div>
</div>
{camera.onvif.host != "" && (
@@ -984,6 +1024,7 @@ type FrigateCameraFeaturesProps = {
recordingEnabled: boolean;
audioDetectEnabled: boolean;
autotrackingEnabled: boolean;
transcriptionEnabled: boolean;
fullscreen: boolean;
streamName: string;
setStreamName?: (value: string | undefined) => void;
@@ -1003,6 +1044,7 @@ function FrigateCameraFeatures({
recordingEnabled,
audioDetectEnabled,
autotrackingEnabled,
transcriptionEnabled,
fullscreen,
streamName,
setStreamName,
@@ -1035,6 +1077,8 @@ function FrigateCameraFeatures({
const { payload: audioState, send: sendAudio } = useAudioState(camera.name);
const { payload: autotrackingState, send: sendAutotracking } =
useAutotrackingState(camera.name);
const { payload: transcriptionState, send: sendTranscription } =
useAudioTranscriptionState(camera.name);
// roles
@@ -1198,6 +1242,27 @@ function FrigateCameraFeatures({
disabled={!cameraEnabled}
/>
)}
{audioDetectEnabled && transcriptionEnabled && (
<CameraFeatureToggle
className="p-2 md:p-0"
variant={fullscreen ? "overlay" : "primary"}
Icon={
transcriptionState == "ON"
? MdClosedCaption
: MdClosedCaptionDisabled
}
isActive={transcriptionState == "ON"}
title={
transcriptionState == "ON"
? t("transcription.disable")
: t("transcription.enable")
}
onClick={() =>
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
}
disabled={!cameraEnabled || audioState == "OFF"}
/>
)}
{autotrackingEnabled && (
<CameraFeatureToggle
className="p-2 md:p-0"
@@ -1562,6 +1627,16 @@ function FrigateCameraFeatures({
}
/>
)}
{audioDetectEnabled && transcriptionEnabled && (
<FilterSwitch
label={t("cameraSettings.transcription")}
disabled={audioState == "OFF"}
isChecked={transcriptionState == "ON"}
onCheckedChange={() =>
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
}
/>
)}
{autotrackingEnabled && (
<FilterSwitch
label={t("cameraSettings.autotracking")}