mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-03-07 02:18:07 +01:00
Audio transcription support (#18398)
* install new packages for transcription support
* add config options
* audio maintainer modifications to support transcription
* pass main config to audio process
* embeddings support
* api and transcription post processor
* embeddings maintainer support for post processor
* live audio transcription with sherpa and faster-whisper
* update dispatcher with live transcription topic
* frontend websocket
* frontend live transcription
* frontend changes for speech events
* i18n changes
* docs
* mqtt docs
* fix linter
* use float16 and small model on gpu for real-time
* fix return value and use requestor to embed description instead of passing embeddings
* run real-time transcription in its own thread
* tweaks
* publish live transcriptions on their own topic instead of tracked_object_update
* config validator and docs
* clarify docs
This commit is contained in:
committed by
Blake Blackshear
parent
2385c403ee
commit
6dc36fcbb4
@@ -8,6 +8,7 @@ import {
|
||||
FrigateReview,
|
||||
ModelState,
|
||||
ToggleableSetting,
|
||||
TrackedObjectUpdateReturnType,
|
||||
} from "@/types/ws";
|
||||
import { FrigateStats } from "@/types/stats";
|
||||
import { createContainer } from "react-tracked";
|
||||
@@ -60,6 +61,7 @@ function useValue(): useValueReturn {
|
||||
enabled,
|
||||
snapshots,
|
||||
audio,
|
||||
audio_transcription,
|
||||
notifications,
|
||||
notifications_suspended,
|
||||
autotracking,
|
||||
@@ -71,6 +73,9 @@ function useValue(): useValueReturn {
|
||||
cameraStates[`${name}/detect/state`] = detect ? "ON" : "OFF";
|
||||
cameraStates[`${name}/snapshots/state`] = snapshots ? "ON" : "OFF";
|
||||
cameraStates[`${name}/audio/state`] = audio ? "ON" : "OFF";
|
||||
cameraStates[`${name}/audio_transcription/state`] = audio_transcription
|
||||
? "ON"
|
||||
: "OFF";
|
||||
cameraStates[`${name}/notifications/state`] = notifications
|
||||
? "ON"
|
||||
: "OFF";
|
||||
@@ -220,6 +225,20 @@ export function useAudioState(camera: string): {
|
||||
return { payload: payload as ToggleableSetting, send };
|
||||
}
|
||||
|
||||
/**
 * Hook exposing a camera's audio transcription toggle over the websocket.
 *
 * Hands `<camera>/audio_transcription/state` and
 * `<camera>/audio_transcription/set` to `useWs` (presumably the topic to
 * watch and the topic to publish to, in that order — confirm against `useWs`).
 *
 * @param camera Camera name used as the topic prefix.
 * @returns The payload reported by `useWs`, cast to `ToggleableSetting`
 *   without runtime validation, together with the `send` function that
 *   `useWs` returns.
 */
export function useAudioTranscriptionState(camera: string): {
  payload: ToggleableSetting;
  send: (payload: ToggleableSetting, retain?: boolean) => void;
} {
  const stateTopic = `${camera}/audio_transcription/state`;
  const commandTopic = `${camera}/audio_transcription/set`;

  const ws = useWs(stateTopic, commandTopic);

  return { payload: ws.value.payload as ToggleableSetting, send: ws.send };
}
|
||||
|
||||
export function useAutotrackingState(camera: string): {
|
||||
payload: ToggleableSetting;
|
||||
send: (payload: ToggleableSetting, retain?: boolean) => void;
|
||||
@@ -421,6 +440,15 @@ export function useAudioActivity(camera: string): { payload: number } {
|
||||
return { payload: payload as number };
|
||||
}
|
||||
|
||||
/**
 * Hook returning the latest value received on a camera's
 * `<camera>/audio/transcription` websocket topic.
 *
 * The second argument given to `useWs` is an empty string and no `send`
 * function is exposed, so callers can only read from this hook.
 *
 * @param camera Camera name used as the topic prefix.
 * @returns The payload reported by `useWs`, cast to `string` without runtime
 *   validation.
 */
export function useAudioLiveTranscription(camera: string): {
  payload: string;
} {
  const transcriptionTopic = `${camera}/audio/transcription`;

  const ws = useWs(transcriptionTopic, "");

  return { payload: ws.value.payload as string };
}
|
||||
|
||||
export function useMotionThreshold(camera: string): {
|
||||
payload: string;
|
||||
send: (payload: number, retain?: boolean) => void;
|
||||
@@ -463,11 +491,16 @@ export function useImproveContrast(camera: string): {
|
||||
return { payload: payload as ToggleableSetting, send };
|
||||
}
|
||||
|
||||
/**
 * Decodes one raw `tracked_object_update` payload.
 *
 * Returns a placeholder object with empty `type`, `id` and `camera` when the
 * payload is falsy (nothing received yet) or is not valid JSON, so the caller
 * never has to deal with an exception thrown while rendering.
 */
function parseTrackedObjectUpdate(raw: unknown) {
  const placeholder = { type: "", id: "", camera: "" };

  if (!raw) {
    return placeholder;
  }

  try {
    return JSON.parse(raw as string);
  } catch {
    // JSON.parse throws SyntaxError on malformed input; treat such a frame
    // the same as "no update yet" instead of crashing the component tree.
    return placeholder;
  }
}

/**
 * Hook returning the most recent message on the `tracked_object_update`
 * websocket topic, parsed from JSON.
 *
 * The parsed value is passed through `useDeepMemo` (presumably so that
 * deep-equal updates keep the same reference between renders — confirm
 * against `useDeepMemo`).
 *
 * @returns `{ payload }` where `payload` is the parsed update, or a
 *   placeholder with empty `type`, `id` and `camera` when no valid update is
 *   available.
 */
export function useTrackedObjectUpdate(): {
  payload: TrackedObjectUpdateReturnType;
} {
  const {
    value: { payload },
  } = useWs("tracked_object_update", "");

  const parsed = parseTrackedObjectUpdate(payload);

  return { payload: useDeepMemo(parsed) };
}
|
||||
|
||||
export function useNotifications(camera: string): {
|
||||
|
||||
@@ -78,6 +78,7 @@ import { TbFaceId } from "react-icons/tb";
|
||||
import { useIsAdmin } from "@/hooks/use-is-admin";
|
||||
import FaceSelectionDialog from "../FaceSelectionDialog";
|
||||
import { getTranslatedLabel } from "@/utils/i18n";
|
||||
import { CgTranscript } from "react-icons/cg";
|
||||
|
||||
const SEARCH_TABS = [
|
||||
"details",
|
||||
@@ -710,6 +711,34 @@ function ObjectDetailsTab({
|
||||
[search, t],
|
||||
);
|
||||
|
||||
// speech transcription
|
||||
|
||||
// Sends a PUT to /audio/transcribe for the currently selected event.
// A 202 response shows a success toast; any other resolved status is ignored.
// A rejected request shows an error toast carrying the server-provided
// `message` or `detail`, falling back to "Unknown error".
const onTranscribe = useCallback(() => {
  const request = axios.put(`/audio/transcribe`, { event_id: search.id });

  request
    .then(({ status }) => {
      if (status !== 202) {
        return;
      }

      toast.success(t("details.item.toast.success.audioTranscription"), {
        position: "top-center",
      });
    })
    .catch((error) => {
      const data = error.response?.data;
      const errorMessage = data?.message || data?.detail || "Unknown error";

      toast.error(
        t("details.item.toast.error.audioTranscription", { errorMessage }),
        { position: "top-center" },
      );
    });
}, [search, t]);
|
||||
|
||||
return (
|
||||
<div className="flex flex-col gap-5">
|
||||
<div className="flex w-full flex-row">
|
||||
@@ -893,6 +922,16 @@ function ObjectDetailsTab({
|
||||
</Button>
|
||||
</FaceSelectionDialog>
|
||||
)}
|
||||
{config?.cameras[search?.camera].audio_transcription.enabled &&
|
||||
search?.label == "speech" &&
|
||||
search?.end_time && (
|
||||
<Button className="w-full" onClick={onTranscribe}>
|
||||
<div className="flex gap-1">
|
||||
<CgTranscript />
|
||||
{t("itemMenu.audioTranscription.label")}
|
||||
</div>
|
||||
</Button>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -257,15 +257,13 @@ export default function Explore() {
|
||||
|
||||
// mutation and revalidation
|
||||
|
||||
const trackedObjectUpdate = useTrackedObjectUpdate();
|
||||
const { payload: wsUpdate } = useTrackedObjectUpdate();
|
||||
|
||||
useEffect(() => {
|
||||
if (trackedObjectUpdate) {
|
||||
if (wsUpdate && wsUpdate.type == "description") {
|
||||
mutate();
|
||||
}
|
||||
// mutate / revalidate when event description updates come in
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [trackedObjectUpdate]);
|
||||
}, [wsUpdate, mutate]);
|
||||
|
||||
// embeddings reindex progress
|
||||
|
||||
|
||||
@@ -41,6 +41,11 @@ export interface CameraConfig {
|
||||
min_volume: number;
|
||||
num_threads: number;
|
||||
};
|
||||
audio_transcription: {
|
||||
enabled: boolean;
|
||||
enabled_in_config: boolean;
|
||||
live_enabled: boolean;
|
||||
};
|
||||
best_image_timeout: number;
|
||||
birdseye: {
|
||||
enabled: boolean;
|
||||
@@ -296,6 +301,10 @@ export interface FrigateConfig {
|
||||
num_threads: number;
|
||||
};
|
||||
|
||||
audio_transcription: {
|
||||
enabled: boolean;
|
||||
};
|
||||
|
||||
birdseye: BirdseyeConfig;
|
||||
|
||||
cameras: {
|
||||
|
||||
@@ -58,6 +58,7 @@ export interface FrigateCameraState {
|
||||
snapshots: boolean;
|
||||
record: boolean;
|
||||
audio: boolean;
|
||||
audio_transcription: boolean;
|
||||
notifications: boolean;
|
||||
notifications_suspended: number;
|
||||
autotracking: boolean;
|
||||
@@ -84,3 +85,21 @@ export type EmbeddingsReindexProgressType = {
|
||||
};
|
||||
|
||||
export type ToggleableSetting = "ON" | "OFF";
|
||||
|
||||
/**
 * Allowed values for the `type` field of a tracked object update
 * (see `TrackedObjectUpdateReturnType`).
 */
export type TrackedObjectUpdateType =
  | "description"
  | "face"
  | "lpr"
  | "transcription";
|
||||
|
||||
/**
 * Shape of a tracked object update, or `null`.
 *
 * `type`, `id` and `camera` are always present; every other field is
 * optional (presumably only the ones relevant to the given `type` are
 * populated — confirm against the backend publisher).
 */
export type TrackedObjectUpdateReturnType = null | {
  type: TrackedObjectUpdateType;
  id: string;
  camera: string;
  description?: string;
  name?: string;
  plate?: string;
  score?: number;
  timestamp?: number;
  text?: string;
};
|
||||
|
||||
@@ -75,13 +75,13 @@ export default function ExploreView({
|
||||
}, {});
|
||||
}, [events]);
|
||||
|
||||
const trackedObjectUpdate = useTrackedObjectUpdate();
|
||||
const { payload: wsUpdate } = useTrackedObjectUpdate();
|
||||
|
||||
useEffect(() => {
|
||||
mutate();
|
||||
// mutate / revalidate when event description updates come in
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [trackedObjectUpdate]);
|
||||
if (wsUpdate && wsUpdate.type == "description") {
|
||||
mutate();
|
||||
}
|
||||
}, [wsUpdate, mutate]);
|
||||
|
||||
// update search detail when results change
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import {
|
||||
useAudioLiveTranscription,
|
||||
useAudioState,
|
||||
useAudioTranscriptionState,
|
||||
useAutotrackingState,
|
||||
useDetectState,
|
||||
useEnabledState,
|
||||
@@ -90,6 +92,8 @@ import {
|
||||
LuX,
|
||||
} from "react-icons/lu";
|
||||
import {
|
||||
MdClosedCaption,
|
||||
MdClosedCaptionDisabled,
|
||||
MdNoPhotography,
|
||||
MdOutlineRestartAlt,
|
||||
MdPersonOff,
|
||||
@@ -197,6 +201,29 @@ export default function LiveCameraView({
|
||||
const { payload: enabledState } = useEnabledState(camera.name);
|
||||
const cameraEnabled = enabledState === "ON";
|
||||
|
||||
// for audio transcriptions
|
||||
|
||||
const { payload: audioTranscriptionState, send: sendTranscription } =
|
||||
useAudioTranscriptionState(camera.name);
|
||||
const { payload: transcription } = useAudioLiveTranscription(camera.name);
|
||||
const transcriptionRef = useRef<HTMLDivElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
if (transcription) {
|
||||
if (transcriptionRef.current) {
|
||||
transcriptionRef.current.scrollTop =
|
||||
transcriptionRef.current.scrollHeight;
|
||||
}
|
||||
}
|
||||
}, [transcription]);
|
||||
|
||||
useEffect(() => {
|
||||
return () => {
|
||||
// disable transcriptions when unmounting
|
||||
if (audioTranscriptionState == "ON") sendTranscription("OFF");
|
||||
};
|
||||
}, [audioTranscriptionState, sendTranscription]);
|
||||
|
||||
// click overlay for ptzs
|
||||
|
||||
const [clickOverlay, setClickOverlay] = useState(false);
|
||||
@@ -567,6 +594,9 @@ export default function LiveCameraView({
|
||||
autotrackingEnabled={
|
||||
camera.onvif.autotracking.enabled_in_config
|
||||
}
|
||||
transcriptionEnabled={
|
||||
camera.audio_transcription.enabled_in_config
|
||||
}
|
||||
fullscreen={fullscreen}
|
||||
streamName={streamName ?? ""}
|
||||
setStreamName={setStreamName}
|
||||
@@ -626,6 +656,16 @@ export default function LiveCameraView({
|
||||
/>
|
||||
</div>
|
||||
</TransformComponent>
|
||||
{camera?.audio?.enabled_in_config &&
|
||||
audioTranscriptionState == "ON" &&
|
||||
transcription != null && (
|
||||
<div
|
||||
ref={transcriptionRef}
|
||||
className="text-md scrollbar-container absolute bottom-4 left-1/2 max-h-[15vh] w-[75%] -translate-x-1/2 overflow-y-auto rounded-lg bg-black/70 p-2 text-white md:w-[50%]"
|
||||
>
|
||||
{transcription}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
{camera.onvif.host != "" && (
|
||||
@@ -984,6 +1024,7 @@ type FrigateCameraFeaturesProps = {
|
||||
recordingEnabled: boolean;
|
||||
audioDetectEnabled: boolean;
|
||||
autotrackingEnabled: boolean;
|
||||
transcriptionEnabled: boolean;
|
||||
fullscreen: boolean;
|
||||
streamName: string;
|
||||
setStreamName?: (value: string | undefined) => void;
|
||||
@@ -1003,6 +1044,7 @@ function FrigateCameraFeatures({
|
||||
recordingEnabled,
|
||||
audioDetectEnabled,
|
||||
autotrackingEnabled,
|
||||
transcriptionEnabled,
|
||||
fullscreen,
|
||||
streamName,
|
||||
setStreamName,
|
||||
@@ -1035,6 +1077,8 @@ function FrigateCameraFeatures({
|
||||
const { payload: audioState, send: sendAudio } = useAudioState(camera.name);
|
||||
const { payload: autotrackingState, send: sendAutotracking } =
|
||||
useAutotrackingState(camera.name);
|
||||
const { payload: transcriptionState, send: sendTranscription } =
|
||||
useAudioTranscriptionState(camera.name);
|
||||
|
||||
// roles
|
||||
|
||||
@@ -1198,6 +1242,27 @@ function FrigateCameraFeatures({
|
||||
disabled={!cameraEnabled}
|
||||
/>
|
||||
)}
|
||||
{audioDetectEnabled && transcriptionEnabled && (
|
||||
<CameraFeatureToggle
|
||||
className="p-2 md:p-0"
|
||||
variant={fullscreen ? "overlay" : "primary"}
|
||||
Icon={
|
||||
transcriptionState == "ON"
|
||||
? MdClosedCaption
|
||||
: MdClosedCaptionDisabled
|
||||
}
|
||||
isActive={transcriptionState == "ON"}
|
||||
title={
|
||||
transcriptionState == "ON"
|
||||
? t("transcription.disable")
|
||||
: t("transcription.enable")
|
||||
}
|
||||
onClick={() =>
|
||||
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
|
||||
}
|
||||
disabled={!cameraEnabled || audioState == "OFF"}
|
||||
/>
|
||||
)}
|
||||
{autotrackingEnabled && (
|
||||
<CameraFeatureToggle
|
||||
className="p-2 md:p-0"
|
||||
@@ -1562,6 +1627,16 @@ function FrigateCameraFeatures({
|
||||
}
|
||||
/>
|
||||
)}
|
||||
{audioDetectEnabled && transcriptionEnabled && (
|
||||
<FilterSwitch
|
||||
label={t("cameraSettings.transcription")}
|
||||
disabled={audioState == "OFF"}
|
||||
isChecked={transcriptionState == "ON"}
|
||||
onCheckedChange={() =>
|
||||
sendTranscription(transcriptionState == "ON" ? "OFF" : "ON")
|
||||
}
|
||||
/>
|
||||
)}
|
||||
{autotrackingEnabled && (
|
||||
<FilterSwitch
|
||||
label={t("cameraSettings.autotracking")}
|
||||
|
||||
Reference in New Issue
Block a user