Improve Nvidia GPU stats (#14206)

* Add support for nvidia driver info

* Don't show temperature if detector isn't called coral

* Add encoder and decoder info for Nvidia GPUs

* Fix device info

* Implement GPU info for nvidia GPU

* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

* Update web/src/views/system/GeneralMetrics.tsx

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>

---------

Co-authored-by: Josh Hawkins <32435876+hawkeye217@users.noreply.github.com>
This commit is contained in:
Nicolas Mowen 2024-10-07 20:15:31 -06:00 committed by GitHub
parent 23ce1e930d
commit 2541a345d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 316 additions and 76 deletions

View File

@ -28,7 +28,12 @@ from frigate.util.builtin import (
get_tz_modifiers,
update_yaml_from_url,
)
from frigate.util.services import ffprobe_stream, restart_frigate, vainfo_hwaccel
from frigate.util.services import (
ffprobe_stream,
get_nvidia_driver_info,
restart_frigate,
vainfo_hwaccel,
)
from frigate.version import VERSION
logger = logging.getLogger(__name__)
@ -382,6 +387,11 @@ def vainfo():
)
@router.get("/nvinfo")
def nvinfo():
    # Serve whatever get_nvidia_driver_info() collected as a JSON document.
    driver_info = get_nvidia_driver_info()
    return JSONResponse(content=driver_info)
@router.get("/logs/{service}", tags=[Tags.logs])
def logs(
service: str = Path(enum=["frigate", "nginx", "go2rtc"]),

View File

@ -339,7 +339,10 @@ def get_intel_gpu_stats() -> dict[str, str]:
def try_get_info(f, h, default="N/A"):
    """Call an NVML getter, falling back to ``default`` when it is unsupported.

    Args:
        f: NVML query function to invoke.
        h: Device handle forwarded to ``f``. Pass ``None`` for system-wide
            queries that take no arguments.
        default: Value returned when NVML reports the query as not supported.

    Returns:
        The value returned by ``f``, or ``default`` when the call raises
        ``nvml.NVMLError_NotSupported``. Any other exception propagates.
    """
    try:
        # Invoke the getter exactly once. Handle-less (system-wide) queries
        # must be called without arguments, so the handle is only forwarded
        # when one was supplied.
        v = f(h) if h is not None else f()
    except nvml.NVMLError_NotSupported:
        v = default

    return v
@ -356,6 +359,8 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
if util != "N/A":
gpu_util = util.gpu
else:
@ -382,6 +387,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
"mem": gpu_mem_util,
"enc": enc_util,
"dec": dec_util,
"pstate": pstate or "unknown",
}
except Exception:
pass
@ -432,6 +438,31 @@ def vainfo_hwaccel(device_name: Optional[str] = None) -> sp.CompletedProcess:
return sp.run(ffprobe_cmd, capture_output=True)
def get_nvidia_driver_info() -> dict[int, dict]:
    """Get general hardware info for nvidia GPU.

    Returns:
        Mapping of NVML device index to a dict with the keys ``name``,
        ``driver``, ``cuda_compute`` and ``vbios``. Values that NVML cannot
        supply are reported as ``"unknown"``. The mapping is empty (or holds
        only the devices read so far) when NVML cannot be initialised or a
        query fails.
    """
    results = {}

    try:
        nvml.nvmlInit()

        # The driver version is system-wide, so it is read once rather than
        # once per device.
        driver = try_get_info(nvml.nvmlSystemGetDriverVersion, None, default=None)

        for i in range(nvml.nvmlDeviceGetCount()):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            cuda_compute = try_get_info(
                nvml.nvmlDeviceGetCudaComputeCapability, handle, default=None
            )
            vbios = try_get_info(nvml.nvmlDeviceGetVbiosVersion, handle, default=None)

            # The NVML bindings report the compute capability as a
            # (major, minor) pair; render it as "major.minor" so the JSON
            # payload carries a string like the other fields.
            if isinstance(cuda_compute, (tuple, list)):
                cuda_compute = ".".join(str(part) for part in cuda_compute)

            results[i] = {
                "name": nvml.nvmlDeviceGetName(handle),
                "driver": driver or "unknown",
                "cuda_compute": cuda_compute or "unknown",
                "vbios": vbios or "unknown",
            }
    except Exception:
        # Best effort: hosts without an Nvidia GPU/driver simply report
        # whatever was collected before the failure.
        pass

    # Returned after the try block rather than from a ``finally`` clause, so
    # KeyboardInterrupt/SystemExit are no longer silently discarded.
    return results
def auto_detect_hwaccel() -> str:
"""Detect hwaccel args by default."""
try:

View File

@ -0,0 +1,100 @@
import useSWR from "swr";
import {
Dialog,
DialogContent,
DialogFooter,
DialogHeader,
DialogTitle,
} from "../ui/dialog";
import ActivityIndicator from "../indicators/activity-indicator";
import { GpuInfo, Nvinfo, Vainfo } from "@/types/stats";
import { Button } from "../ui/button";
import copy from "copy-to-clipboard";
/** Props accepted by GPUInfoDialog. */
type GPUInfoDialogProps = {
  /** Whether the dialog is open; the info request is only made while true. */
  showGpuInfo: boolean;
  /** Selects which endpoint is queried and which layout is rendered. */
  gpuType: GpuInfo;
  /** Opens or closes the dialog. */
  setShowGpuInfo: (show: boolean) => void;
};

/**
 * Renders the CUDA compute capability as text.
 *
 * The value is typed as a string, but the backend may serialize it as a
 * `[major, minor]` array; joining with "." avoids rendering it as "86".
 */
function formatCudaCompute(value: unknown): string {
  return Array.isArray(value) ? value.join(".") : String(value);
}

/**
 * Dialog showing GPU details: the `vainfo` process output, or the per-device
 * Nvidia information returned by the `nvinfo` endpoint.
 *
 * The Copy button places the loaded payload on the clipboard (as JSON with
 * backslashes and whitespace stripped) and closes the dialog.
 */
export default function GPUInfoDialog({
  showGpuInfo,
  gpuType,
  setShowGpuInfo,
}: GPUInfoDialogProps) {
  const isVainfo = gpuType == "vainfo";

  // A null key keeps SWR idle, so only the endpoint matching gpuType is
  // requested, and only while the dialog is open.
  const { data: vainfo } = useSWR<Vainfo>(
    showGpuInfo && gpuType == "vainfo" ? "vainfo" : null,
  );
  const { data: nvinfo } = useSWR<Nvinfo>(
    showGpuInfo && gpuType == "nvinfo" ? "nvinfo" : null,
  );

  const onCopyInfo = () => {
    const info = isVainfo ? vainfo : nvinfo;

    // JSON.stringify(undefined) returns undefined, so calling .replace on it
    // would throw while the data is still loading. Nothing to copy yet.
    if (info === undefined) {
      return;
    }

    copy(JSON.stringify(info).replace(/[\\\s]+/gi, ""));
    setShowGpuInfo(false);
  };

  const outputClassName =
    "scrollbar-container mb-2 max-h-96 overflow-y-scroll whitespace-pre-line";

  let body = <ActivityIndicator />;

  if (isVainfo && vainfo) {
    const succeeded = vainfo.return_code == 0;

    body = (
      <div className={outputClassName}>
        <div>Return Code: {vainfo.return_code}</div>
        <br />
        <div>Process {succeeded ? "Output" : "Error"}:</div>
        <br />
        <div>{succeeded ? vainfo.stdout : vainfo.stderr}</div>
      </div>
    );
  } else if (!isVainfo && nvinfo) {
    // The backend answers with an empty object when no device could be
    // queried, so never assume entry "0" exists; list every reported GPU.
    const gpus = Object.entries(nvinfo);

    body = (
      <div className={outputClassName}>
        {gpus.length == 0 ? (
          <div>No Nvidia GPU information available.</div>
        ) : (
          gpus.map(([index, gpu], position) => (
            <div key={index}>
              {position > 0 && <br />}
              <div>Name: {gpu.name}</div>
              <br />
              <div>Driver: {gpu.driver}</div>
              <br />
              <div>
                Cuda Compute Capability: {formatCudaCompute(gpu.cuda_compute)}
              </div>
              <br />
              <div>VBios Info: {gpu.vbios}</div>
            </div>
          ))
        )}
      </div>
    );
  }

  return (
    <Dialog open={showGpuInfo} onOpenChange={setShowGpuInfo}>
      <DialogContent>
        <DialogHeader>
          <DialogTitle>
            {isVainfo ? "Vainfo Output" : "Nvidia SMI Output"}
          </DialogTitle>
        </DialogHeader>
        {body}
        <DialogFooter>
          <Button onClick={() => setShowGpuInfo(false)}>Close</Button>
          <Button variant="select" onClick={() => onCopyInfo()}>
            Copy
          </Button>
        </DialogFooter>
      </DialogContent>
    </Dialog>
  );
}

View File

@ -1,55 +0,0 @@
import useSWR from "swr";
import {
Dialog,
DialogContent,
DialogFooter,
DialogHeader,
DialogTitle,
} from "../ui/dialog";
import ActivityIndicator from "../indicators/activity-indicator";
import { Vainfo } from "@/types/stats";
import { Button } from "../ui/button";
import copy from "copy-to-clipboard";
/** Props accepted by VainfoDialog. */
type VainfoDialogProps = {
  /** Whether the dialog is open; the vainfo request is only made while true. */
  showVainfo: boolean;
  /** Opens or closes the dialog. */
  setShowVainfo: (show: boolean) => void;
};

/**
 * Dialog that displays the output of the `vainfo` endpoint and lets the user
 * copy it to the clipboard.
 */
export default function VainfoDialog({
  showVainfo,
  setShowVainfo,
}: VainfoDialogProps) {
  // A null key keeps SWR idle until the dialog is opened.
  const requestKey = showVainfo ? "vainfo" : null;
  const { data: vainfo } = useSWR<Vainfo>(requestKey);

  const closeDialog = () => setShowVainfo(false);

  const onCopyVainfo = async () => {
    const serialized = JSON.stringify(vainfo);
    copy(serialized.replace(/[\\\s]+/gi, ""));
    closeDialog();
  };

  let content;
  if (vainfo) {
    const succeeded = vainfo.return_code == 0;
    content = (
      <div className="scrollbar-container mb-2 max-h-96 overflow-y-scroll whitespace-pre-line">
        <div>Return Code: {vainfo.return_code}</div>
        <br />
        <div>Process {succeeded ? "Output" : "Error"}:</div>
        <br />
        <div>{succeeded ? vainfo.stdout : vainfo.stderr}</div>
      </div>
    );
  } else {
    content = <ActivityIndicator />;
  }

  return (
    <Dialog open={showVainfo} onOpenChange={setShowVainfo}>
      <DialogContent>
        <DialogHeader>
          <DialogTitle>Vainfo Output</DialogTitle>
        </DialogHeader>
        {content}
        <DialogFooter>
          <Button onClick={closeDialog}>Close</Button>
          <Button variant="select" onClick={() => onCopyVainfo()}>
            Copy
          </Button>
        </DialogFooter>
      </DialogContent>
    </Dialog>
  );
}

View File

@ -41,8 +41,13 @@ export type ExtraProcessStats = {
/**
 * Usage figures reported for a single GPU. All values arrive as strings.
 */
export type GpuStats = {
// Value stored under the backend's "gpu" key (presumably overall utilization — confirm).
gpu: string;
// Value stored under the backend's "mem" key (presumably memory utilization — confirm).
mem: string;
// Encoder utilization, only present when the backend reports it. GeneralMetrics
// drops the final character before graphing (presumably a trailing "%" — confirm).
enc?: string;
// Decoder utilization; handled the same way as `enc`.
dec?: string;
// Power state; the backend substitutes "unknown" when it has no value.
// NOTE(review): the backend appears to forward the raw NVML value otherwise,
// which may be numeric rather than a string — confirm.
pstate?: string;
};
/** Which GPU info endpoint the info dialog queries and renders. */
export type GpuInfo = "vainfo" | "nvinfo";
export type ServiceStats = {
last_updated: number;
storage: { [path: string]: StorageStats };
@ -71,6 +76,15 @@ export type Vainfo = {
stderr: string;
};
/**
 * Payload of the `nvinfo` endpoint: one entry per GPU, keyed by a string
 * (the info dialog reads the entry stored under "0").
 */
export type Nvinfo = Record<
  string,
  {
    name: string;
    driver: string;
    cuda_compute: string;
    vbios: string;
  }
>;
export type Ffprobe = {
return_code: number;
stderr: string;

View File

@ -1,5 +1,5 @@
import useSWR from "swr";
import { FrigateStats } from "@/types/stats";
import { FrigateStats, GpuInfo } from "@/types/stats";
import { useEffect, useMemo, useState } from "react";
import { useFrigateStats } from "@/api/ws";
import {
@ -11,9 +11,10 @@ import {
InferenceThreshold,
} from "@/types/graph";
import { Button } from "@/components/ui/button";
import VainfoDialog from "@/components/overlay/VainfoDialog";
import GPUInfoDialog from "@/components/overlay/GPUInfoDialog";
import { Skeleton } from "@/components/ui/skeleton";
import { ThresholdBarGraph } from "@/components/graph/SystemGraph";
import { cn } from "@/lib/utils";
type GeneralMetricsProps = {
lastUpdated: number;
@ -62,15 +63,23 @@ export default function GeneralMetrics({
}
}, [initialStats, updatedStats, statsHistory, lastUpdated, setLastUpdated]);
// True when the first stats snapshot lists at least one VAAPI/QSV GPU, i.e.
// a GPU for which vainfo output can be requested.
const canGetGpuInfo = useMemo(() => {
  if (!(statsHistory.length > 0)) {
    return false;
  }

  const gpuNames = Object.keys(statsHistory[0]?.gpu_usages ?? {});

  return gpuNames.some(
    (name) =>
      name == "amd-vaapi" || name == "intel-vaapi" || name == "intel-qsv",
  );
}, [statsHistory]);
// Derives, from the first stats snapshot, whether any GPU supports an info
// dialog and which endpoint to use. Nvidia takes precedence when both kinds
// of GPU are present.
const [canGetGpuInfo, gpuType] = useMemo<[boolean, GpuInfo]>(() => {
  const gpuNames =
    statsHistory.length > 0
      ? Object.keys(statsHistory[0]?.gpu_usages ?? {})
      : [];

  const hasVaapiGpu = gpuNames.some(
    (name) =>
      name == "amd-vaapi" || name == "intel-vaapi" || name == "intel-qsv",
  );
  const hasNvidiaGpu = gpuNames.some((name) => name.includes("NVIDIA"));

  return [hasVaapiGpu || hasNvidiaGpu, hasNvidiaGpu ? "nvinfo" : "vainfo"];
}, [statsHistory]);
// timestamps
@ -108,7 +117,7 @@ export default function GeneralMetrics({
const detTempSeries = useMemo(() => {
if (!statsHistory) {
return [];
return undefined;
}
if (
@ -128,6 +137,10 @@ export default function GeneralMetrics({
}
Object.entries(stats.detectors).forEach(([key], cIdx) => {
if (!key.includes("coral")) {
return;
}
if (cIdx <= Object.keys(stats.service.temperatures).length) {
if (!(key in series)) {
series[key] = {
@ -141,7 +154,12 @@ export default function GeneralMetrics({
}
});
});
return Object.values(series);
if (Object.keys(series).length > 0) {
return Object.values(series);
}
return undefined;
}, [statsHistory]);
const detCpuSeries = useMemo(() => {
@ -282,6 +300,74 @@ export default function GeneralMetrics({
return Object.values(series);
}, [statsHistory]);
// One graph series per GPU that reports encoder utilization. Always returns
// an array: empty when there is no history or no GPU exposes an `enc` value,
// so callers only need a length check.
const gpuEncSeries = useMemo(() => {
  const series: {
    [key: string]: { name: string; data: { x: number; y: string }[] };
  } = {};

  if (!statsHistory) {
    return [];
  }

  statsHistory.forEach((stats, statsIdx) => {
    if (!stats) {
      return;
    }

    Object.entries(stats.gpu_usages ?? {}).forEach(([gpuName, gpuStats]) => {
      // GPUs without encoder stats get no series at all, instead of an
      // empty graph next to the GPUs that do report them.
      if (!gpuStats.enc) {
        return;
      }

      if (!(gpuName in series)) {
        series[gpuName] = { name: gpuName, data: [] };
      }

      // Drop the final character of the reported value before graphing
      // (presumably a trailing "%"; the unit is supplied by the graph).
      series[gpuName].data.push({
        x: statsIdx + 1,
        y: gpuStats.enc.slice(0, -1),
      });
    });
  });

  return Object.values(series);
}, [statsHistory]);
// One graph series per GPU that reports decoder utilization. Always returns
// an array: empty when there is no history or no GPU exposes a `dec` value,
// so callers only need a length check.
const gpuDecSeries = useMemo(() => {
  const series: {
    [key: string]: { name: string; data: { x: number; y: string }[] };
  } = {};

  if (!statsHistory) {
    return [];
  }

  statsHistory.forEach((stats, statsIdx) => {
    if (!stats) {
      return;
    }

    Object.entries(stats.gpu_usages ?? {}).forEach(([gpuName, gpuStats]) => {
      // GPUs without decoder stats get no series at all, instead of an
      // empty graph next to the GPUs that do report them.
      if (!gpuStats.dec) {
        return;
      }

      if (!(gpuName in series)) {
        series[gpuName] = { name: gpuName, data: [] };
      }

      // Drop the final character of the reported value before graphing
      // (presumably a trailing "%"; the unit is supplied by the graph).
      series[gpuName].data.push({
        x: statsIdx + 1,
        y: gpuStats.dec.slice(0, -1),
      });
    });
  });

  return Object.values(series);
}, [statsHistory]);
// other processes stats
const otherProcessCpuSeries = useMemo(() => {
@ -354,14 +440,21 @@ export default function GeneralMetrics({
return (
<>
<VainfoDialog showVainfo={showVainfo} setShowVainfo={setShowVainfo} />
<GPUInfoDialog
showGpuInfo={showVainfo}
gpuType={gpuType}
setShowGpuInfo={setShowVainfo}
/>
<div className="scrollbar-container mt-4 flex size-full flex-col overflow-y-auto">
<div className="text-sm font-medium text-muted-foreground">
Detectors
</div>
<div
className={`mt-4 grid w-full grid-cols-1 gap-2 ${detTempSeries == undefined ? "sm:grid-cols-3" : "sm:grid-cols-4"}`}
className={cn(
"mt-4 grid w-full grid-cols-1 gap-2 sm:grid-cols-3",
detTempSeries && "sm:grid-cols-4",
)}
>
{statsHistory.length != 0 ? (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
@ -381,7 +474,7 @@ export default function GeneralMetrics({
) : (
<Skeleton className="aspect-video w-full rounded-lg md:rounded-2xl" />
)}
{statsHistory.length != 0 ? (
{statsHistory.length != 0 && (
<>
{detTempSeries && (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
@ -400,8 +493,6 @@ export default function GeneralMetrics({
</div>
)}
</>
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory.length != 0 ? (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
@ -457,7 +548,12 @@ export default function GeneralMetrics({
</Button>
)}
</div>
<div className="mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2">
<div
className={cn(
"mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2",
gpuEncSeries?.length && "md:grid-cols-4",
)}
>
{statsHistory.length != 0 ? (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
<div className="mb-5">GPU Usage</div>
@ -498,6 +594,50 @@ export default function GeneralMetrics({
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory.length != 0 ? (
<>
{gpuEncSeries && gpuEncSeries?.length != 0 && (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
<div className="mb-5">GPU Encoder</div>
{gpuEncSeries.map((series) => (
<ThresholdBarGraph
key={series.name}
graphId={`${series.name}-enc`}
unit="%"
name={series.name}
threshold={GPUMemThreshold}
updateTimes={updateTimes}
data={[series]}
/>
))}
</div>
)}
</>
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory.length != 0 ? (
<>
{gpuDecSeries && gpuDecSeries?.length != 0 && (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
<div className="mb-5">GPU Decoder</div>
{gpuDecSeries.map((series) => (
<ThresholdBarGraph
key={series.name}
graphId={`${series.name}-dec`}
unit="%"
name={series.name}
threshold={GPUMemThreshold}
updateTimes={updateTimes}
data={[series]}
/>
))}
</div>
)}
</>
) : (
<Skeleton className="aspect-video w-full" />
)}
</div>
</>
)}