From 2541a345d0b4bbadde1d8f77f84036109cdd8ef9 Mon Sep 17 00:00:00 2001 From: Nicolas Mowen Date: Mon, 7 Oct 2024 20:15:31 -0600 Subject: [PATCH] Improve Nvidia GPU stats (#14206) * Add support for nvidia driver info * Don't show temperature if detector isn't called coral * Add encoder and decoder info for Nvidia GPUs * Fix device info * Implement GPU info for nvidia GPU * Update web/src/views/system/GeneralMetrics.tsx Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> * Update web/src/views/system/GeneralMetrics.tsx Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> --------- Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com> --- frigate/api/app.py | 12 +- frigate/util/services.py | 33 +++- web/src/components/overlay/GPUInfoDialog.tsx | 100 +++++++++++ web/src/components/overlay/VainfoDialog.tsx | 55 ------ web/src/types/stats.ts | 14 ++ web/src/views/system/GeneralMetrics.tsx | 178 +++++++++++++++++-- 6 files changed, 316 insertions(+), 76 deletions(-) create mode 100644 web/src/components/overlay/GPUInfoDialog.tsx delete mode 100644 web/src/components/overlay/VainfoDialog.tsx diff --git a/frigate/api/app.py b/frigate/api/app.py index 2e16f4fcb..d4c5cdba3 100644 --- a/frigate/api/app.py +++ b/frigate/api/app.py @@ -28,7 +28,12 @@ from frigate.util.builtin import ( get_tz_modifiers, update_yaml_from_url, ) -from frigate.util.services import ffprobe_stream, restart_frigate, vainfo_hwaccel +from frigate.util.services import ( + ffprobe_stream, + get_nvidia_driver_info, + restart_frigate, + vainfo_hwaccel, +) from frigate.version import VERSION logger = logging.getLogger(__name__) @@ -382,6 +387,11 @@ def vainfo(): ) +@router.get("/nvinfo") +def nvinfo(): + return JSONResponse(content=get_nvidia_driver_info()) + + @router.get("/logs/{service}", tags=[Tags.logs]) def logs( service: str = Path(enum=["frigate", "nginx", "go2rtc"]), diff --git a/frigate/util/services.py b/frigate/util/services.py 
index 4c902ea8d..24db5c628 100644 --- a/frigate/util/services.py +++ b/frigate/util/services.py @@ -339,7 +339,10 @@ def get_intel_gpu_stats() -> dict[str, str]: def try_get_info(f, h, default="N/A"): try: - v = f(h) + if h: + v = f(h) + else: + v = f() except nvml.NVMLError_NotSupported: v = default return v @@ -356,6 +359,8 @@ def get_nvidia_gpu_stats() -> dict[int, dict]: util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle) enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle) dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle) + pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None) + if util != "N/A": gpu_util = util.gpu else: @@ -382,6 +387,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]: "mem": gpu_mem_util, "enc": enc_util, "dec": dec_util, + "pstate": pstate or "unknown", } except Exception: pass @@ -432,6 +438,31 @@ def vainfo_hwaccel(device_name: Optional[str] = None) -> sp.CompletedProcess: return sp.run(ffprobe_cmd, capture_output=True) +def get_nvidia_driver_info() -> dict[str, any]: + """Get general hardware info for nvidia GPU.""" + results = {} + try: + nvml.nvmlInit() + deviceCount = nvml.nvmlDeviceGetCount() + for i in range(deviceCount): + handle = nvml.nvmlDeviceGetHandleByIndex(i) + driver = try_get_info(nvml.nvmlSystemGetDriverVersion, None, default=None) + cuda_compute = try_get_info( + nvml.nvmlDeviceGetCudaComputeCapability, handle, default=None + ) + vbios = try_get_info(nvml.nvmlDeviceGetVbiosVersion, handle, default=None) + results[i] = { + "name": nvml.nvmlDeviceGetName(handle), + "driver": driver or "unknown", + "cuda_compute": cuda_compute or "unknown", + "vbios": vbios or "unknown", + } + except Exception: + pass + finally: + return results + + def auto_detect_hwaccel() -> str: """Detect hwaccel args by default.""" try: diff --git a/web/src/components/overlay/GPUInfoDialog.tsx b/web/src/components/overlay/GPUInfoDialog.tsx new file mode 100644 index 000000000..90564ed4e --- 
/dev/null +++ b/web/src/components/overlay/GPUInfoDialog.tsx @@ -0,0 +1,100 @@ +import useSWR from "swr"; +import { + Dialog, + DialogContent, + DialogFooter, + DialogHeader, + DialogTitle, +} from "../ui/dialog"; +import ActivityIndicator from "../indicators/activity-indicator"; +import { GpuInfo, Nvinfo, Vainfo } from "@/types/stats"; +import { Button } from "../ui/button"; +import copy from "copy-to-clipboard"; + +type GPUInfoDialogProps = { + showGpuInfo: boolean; + gpuType: GpuInfo; + setShowGpuInfo: (show: boolean) => void; +}; +export default function GPUInfoDialog({ + showGpuInfo, + gpuType, + setShowGpuInfo, +}: GPUInfoDialogProps) { + const { data: vainfo } = useSWR( + showGpuInfo && gpuType == "vainfo" ? "vainfo" : null, + ); + const { data: nvinfo } = useSWR( + showGpuInfo && gpuType == "nvinfo" ? "nvinfo" : null, + ); + + const onCopyInfo = async () => { + copy( + JSON.stringify(gpuType == "vainfo" ? vainfo : nvinfo).replace( + /[\\\s]+/gi, + "", + ), + ); + setShowGpuInfo(false); + }; + + if (gpuType == "vainfo") { + return ( + + + + Vainfo Output + + {vainfo ? ( +
+
Return Code: {vainfo.return_code}
+
+
Process {vainfo.return_code == 0 ? "Output" : "Error"}:
+
+
+ {vainfo.return_code == 0 ? vainfo.stdout : vainfo.stderr} +
+
+ ) : ( + + )} + + + + +
+
+ ); + } else { + return ( + + + + Nvidia SMI Output + + {nvinfo ? ( +
+
Name: {nvinfo["0"].name}
+
+
Driver: {nvinfo["0"].driver}
+
+
Cuda Compute Capability: {nvinfo["0"].cuda_compute}
+
+
VBios Info: {nvinfo["0"].vbios}
+
+ ) : ( + + )} + + + + +
+
+ ); + } +} diff --git a/web/src/components/overlay/VainfoDialog.tsx b/web/src/components/overlay/VainfoDialog.tsx deleted file mode 100644 index 74dec5ae3..000000000 --- a/web/src/components/overlay/VainfoDialog.tsx +++ /dev/null @@ -1,55 +0,0 @@ -import useSWR from "swr"; -import { - Dialog, - DialogContent, - DialogFooter, - DialogHeader, - DialogTitle, -} from "../ui/dialog"; -import ActivityIndicator from "../indicators/activity-indicator"; -import { Vainfo } from "@/types/stats"; -import { Button } from "../ui/button"; -import copy from "copy-to-clipboard"; - -type VainfoDialogProps = { - showVainfo: boolean; - setShowVainfo: (show: boolean) => void; -}; -export default function VainfoDialog({ - showVainfo, - setShowVainfo, -}: VainfoDialogProps) { - const { data: vainfo } = useSWR(showVainfo ? "vainfo" : null); - - const onCopyVainfo = async () => { - copy(JSON.stringify(vainfo).replace(/[\\\s]+/gi, "")); - setShowVainfo(false); - }; - - return ( - - - - Vainfo Output - - {vainfo ? ( -
-
Return Code: {vainfo.return_code}
-
-
Process {vainfo.return_code == 0 ? "Output" : "Error"}:
-
-
{vainfo.return_code == 0 ? vainfo.stdout : vainfo.stderr}
-
- ) : ( - - )} - - - - -
-
- ); -} diff --git a/web/src/types/stats.ts b/web/src/types/stats.ts index a4f5605e1..026cf0536 100644 --- a/web/src/types/stats.ts +++ b/web/src/types/stats.ts @@ -41,8 +41,13 @@ export type ExtraProcessStats = { export type GpuStats = { gpu: string; mem: string; + enc?: string; + dec?: string; + pstate?: string; }; +export type GpuInfo = "vainfo" | "nvinfo"; + export type ServiceStats = { last_updated: number; storage: { [path: string]: StorageStats }; @@ -71,6 +76,15 @@ export type Vainfo = { stderr: string; }; +export type Nvinfo = { + [key: string]: { + name: string; + driver: string; + cuda_compute: string; + vbios: string; + }; +}; + export type Ffprobe = { return_code: number; stderr: string; diff --git a/web/src/views/system/GeneralMetrics.tsx b/web/src/views/system/GeneralMetrics.tsx index f617e9654..e6b04f5b7 100644 --- a/web/src/views/system/GeneralMetrics.tsx +++ b/web/src/views/system/GeneralMetrics.tsx @@ -1,5 +1,5 @@ import useSWR from "swr"; -import { FrigateStats } from "@/types/stats"; +import { FrigateStats, GpuInfo } from "@/types/stats"; import { useEffect, useMemo, useState } from "react"; import { useFrigateStats } from "@/api/ws"; import { @@ -11,9 +11,10 @@ import { InferenceThreshold, } from "@/types/graph"; import { Button } from "@/components/ui/button"; -import VainfoDialog from "@/components/overlay/VainfoDialog"; +import GPUInfoDialog from "@/components/overlay/GPUInfoDialog"; import { Skeleton } from "@/components/ui/skeleton"; import { ThresholdBarGraph } from "@/components/graph/SystemGraph"; +import { cn } from "@/lib/utils"; type GeneralMetricsProps = { lastUpdated: number; @@ -62,15 +63,23 @@ export default function GeneralMetrics({ } }, [initialStats, updatedStats, statsHistory, lastUpdated, setLastUpdated]); - const canGetGpuInfo = useMemo( - () => - statsHistory.length > 0 && - Object.keys(statsHistory[0]?.gpu_usages ?? 
{}).filter( - (key) => - key == "amd-vaapi" || key == "intel-vaapi" || key == "intel-qsv", - ).length > 0, - [statsHistory], - ); + const [canGetGpuInfo, gpuType] = useMemo<[boolean, GpuInfo]>(() => { + let vaCount = 0; + let nvCount = 0; + + statsHistory.length > 0 && + Object.keys(statsHistory[0]?.gpu_usages ?? {}).forEach((key) => { + if (key == "amd-vaapi" || key == "intel-vaapi" || key == "intel-qsv") { + vaCount += 1; + } + + if (key.includes("NVIDIA")) { + nvCount += 1; + } + }); + + return [vaCount > 0 || nvCount > 0, nvCount > 0 ? "nvinfo" : "vainfo"]; + }, [statsHistory]); // timestamps @@ -108,7 +117,7 @@ export default function GeneralMetrics({ const detTempSeries = useMemo(() => { if (!statsHistory) { - return []; + return undefined; } if ( @@ -128,6 +137,10 @@ export default function GeneralMetrics({ } Object.entries(stats.detectors).forEach(([key], cIdx) => { + if (!key.includes("coral")) { + return; + } + if (cIdx <= Object.keys(stats.service.temperatures).length) { if (!(key in series)) { series[key] = { @@ -141,7 +154,12 @@ export default function GeneralMetrics({ } }); }); - return Object.values(series); + + if (Object.keys(series).length > 0) { + return Object.values(series); + } + + return undefined; }, [statsHistory]); const detCpuSeries = useMemo(() => { @@ -282,6 +300,74 @@ export default function GeneralMetrics({ return Object.values(series); }, [statsHistory]); + const gpuEncSeries = useMemo(() => { + if (!statsHistory) { + return []; + } + + const series: { + [key: string]: { name: string; data: { x: number; y: string }[] }; + } = {}; + let hasValidGpu = false; + + statsHistory.forEach((stats, statsIdx) => { + if (!stats) { + return; + } + + Object.entries(stats.gpu_usages || []).forEach(([key, stats]) => { + if (!(key in series)) { + series[key] = { name: key, data: [] }; + } + + if (stats.enc) { + hasValidGpu = true; + series[key].data.push({ x: statsIdx + 1, y: stats.enc.slice(0, -1) }); + } + }); + }); + + if (!hasValidGpu) { + return 
[]; + } + + return Object.keys(series).length > 0 ? Object.values(series) : undefined; + }, [statsHistory]); + + const gpuDecSeries = useMemo(() => { + if (!statsHistory) { + return []; + } + + const series: { + [key: string]: { name: string; data: { x: number; y: string }[] }; + } = {}; + let hasValidGpu = false; + + statsHistory.forEach((stats, statsIdx) => { + if (!stats) { + return; + } + + Object.entries(stats.gpu_usages || []).forEach(([key, stats]) => { + if (!(key in series)) { + series[key] = { name: key, data: [] }; + } + + if (stats.dec) { + hasValidGpu = true; + series[key].data.push({ x: statsIdx + 1, y: stats.dec.slice(0, -1) }); + } + }); + }); + + if (!hasValidGpu) { + return []; + } + + return Object.keys(series).length > 0 ? Object.values(series) : undefined; + }, [statsHistory]); + // other processes stats const otherProcessCpuSeries = useMemo(() => { @@ -354,14 +440,21 @@ export default function GeneralMetrics({ return ( <> - +
Detectors
{statsHistory.length != 0 ? (
@@ -381,7 +474,7 @@ export default function GeneralMetrics({ ) : ( )} - {statsHistory.length != 0 ? ( + {statsHistory.length != 0 && ( <> {detTempSeries && (
@@ -400,8 +493,6 @@ export default function GeneralMetrics({
)} - ) : ( - )} {statsHistory.length != 0 ? (
@@ -457,7 +548,12 @@ export default function GeneralMetrics({ )}
-
+
{statsHistory.length != 0 ? (
GPU Usage
@@ -498,6 +594,50 @@ export default function GeneralMetrics({ ) : ( )} + {statsHistory.length != 0 ? ( + <> + {gpuEncSeries && gpuEncSeries?.length != 0 && ( +
+
GPU Encoder
+ {gpuEncSeries.map((series) => ( + + ))} +
+ )} + + ) : ( + + )} + {statsHistory.length != 0 ? ( + <> + {gpuDecSeries && gpuDecSeries?.length != 0 && ( +
+
GPU Decoder
+ {gpuDecSeries.map((series) => ( + + ))} +
+ )} + + ) : ( + + )}
)}