mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-01-05 20:04:51 +01:00
Add support for GPU and NPU temperatures (#21495)
* Add rockchip temps
* Add support for GPU and NPU temperatures in the frontend
* Add support for Nvidia temperature
* Improve separation
* Adjust graph scaling
This commit is contained in:
parent
7fb8d9b050
commit
aa0b082184
@ -123,6 +123,10 @@ def get_detector_temperature(
|
||||
if index < len(hailo_device_names):
|
||||
device_name = hailo_device_names[index]
|
||||
return hailo_temps[device_name]
|
||||
elif detector_type == "rknn":
|
||||
# Rockchip temperatures are handled by the GPU / NPU stats
|
||||
# as there are no detector-specific temperatures
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
@ -242,6 +246,7 @@ async def set_gpu_stats(
|
||||
"mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
|
||||
"enc": str(round(float(nvidia_usage[i]["enc"]), 2)) + "%",
|
||||
"dec": str(round(float(nvidia_usage[i]["dec"]), 2)) + "%",
|
||||
"temp": str(nvidia_usage[i]["temp"]),
|
||||
}
|
||||
|
||||
else:
|
||||
|
||||
@ -417,12 +417,12 @@ def get_openvino_npu_stats() -> Optional[dict[str, str]]:
|
||||
else:
|
||||
usage = 0.0
|
||||
|
||||
return {"npu": f"{round(usage, 2)}", "mem": "-"}
|
||||
return {"npu": f"{round(usage, 2)}", "mem": "-%"}
|
||||
except (FileNotFoundError, PermissionError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
|
||||
def get_rockchip_gpu_stats() -> Optional[dict[str, str | float]]:
|
||||
"""Get GPU stats using rk."""
|
||||
try:
|
||||
with open("/sys/kernel/debug/rkrga/load", "r") as f:
|
||||
@ -440,7 +440,16 @@ def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
|
||||
return None
|
||||
|
||||
average_load = f"{round(sum(load_values) / len(load_values), 2)}%"
|
||||
return {"gpu": average_load, "mem": "-"}
|
||||
stats: dict[str, str | float] = {"gpu": average_load, "mem": "-%"}
|
||||
|
||||
try:
|
||||
with open("/sys/class/thermal/thermal_zone5/temp", "r") as f:
|
||||
line = f.readline().strip()
|
||||
stats["temp"] = round(int(line) / 1000, 1)
|
||||
except (FileNotFoundError, OSError, ValueError):
|
||||
pass
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
|
||||
@ -463,13 +472,25 @@ def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
|
||||
|
||||
percentages = [int(load) for load in core_loads]
|
||||
mean = round(sum(percentages) / len(percentages), 2)
|
||||
return {"npu": mean, "mem": "-"}
|
||||
stats: dict[str, float | str] = {"npu": mean, "mem": "-%"}
|
||||
|
||||
try:
|
||||
with open("/sys/class/thermal/thermal_zone6/temp", "r") as f:
|
||||
line = f.readline().strip()
|
||||
stats["temp"] = round(int(line) / 1000, 1)
|
||||
except (FileNotFoundError, OSError, ValueError):
|
||||
pass
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def try_get_info(f, h, default="N/A"):
|
||||
def try_get_info(f, h, default="N/A", sensor=None):
|
||||
try:
|
||||
if h:
|
||||
v = f(h)
|
||||
if sensor is not None:
|
||||
v = f(h, sensor)
|
||||
else:
|
||||
v = f(h)
|
||||
else:
|
||||
v = f()
|
||||
except nvml.NVMLError_NotSupported:
|
||||
@ -498,6 +519,9 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
||||
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
|
||||
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
|
||||
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
|
||||
temp = try_get_info(
|
||||
nvml.nvmlDeviceGetTemperature, handle, default=None, sensor=0
|
||||
)
|
||||
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
|
||||
|
||||
if util != "N/A":
|
||||
@ -510,6 +534,11 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
||||
else:
|
||||
gpu_mem_util = -1
|
||||
|
||||
if temp != "N/A" and temp is not None:
|
||||
temp = float(temp)
|
||||
else:
|
||||
temp = None
|
||||
|
||||
if enc != "N/A":
|
||||
enc_util = enc[0]
|
||||
else:
|
||||
@ -527,6 +556,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
|
||||
"enc": enc_util,
|
||||
"dec": dec_util,
|
||||
"pstate": pstate or "unknown",
|
||||
"temp": temp,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@ -51,6 +51,7 @@
|
||||
"gpuMemory": "GPU Memory",
|
||||
"gpuEncoder": "GPU Encoder",
|
||||
"gpuDecoder": "GPU Decoder",
|
||||
"gpuTemperature": "GPU Temperature",
|
||||
"gpuInfo": {
|
||||
"vainfoOutput": {
|
||||
"title": "Vainfo Output",
|
||||
@ -77,6 +78,7 @@
|
||||
},
|
||||
"npuUsage": "NPU Usage",
|
||||
"npuMemory": "NPU Memory",
|
||||
"npuTemperature": "NPU Temperature",
|
||||
"intelGpuWarning": {
|
||||
"title": "Intel GPU Stats Warning",
|
||||
"message": "GPU stats unavailable",
|
||||
|
||||
@ -61,11 +61,13 @@ export type GpuStats = {
|
||||
enc?: string;
|
||||
dec?: string;
|
||||
pstate?: string;
|
||||
temp?: number;
|
||||
};
|
||||
|
||||
export type NpuStats = {
|
||||
npu: number;
|
||||
mem: string;
|
||||
temp?: number;
|
||||
};
|
||||
|
||||
export type GpuInfo = "vainfo" | "nvinfo";
|
||||
|
||||
@ -368,6 +368,40 @@ export default function GeneralMetrics({
|
||||
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||
}, [statsHistory]);
|
||||
|
||||
const gpuTempSeries = useMemo(() => {
|
||||
if (!statsHistory) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const series: {
|
||||
[key: string]: { name: string; data: { x: number; y: number }[] };
|
||||
} = {};
|
||||
let hasValidGpu = false;
|
||||
|
||||
statsHistory.forEach((stats, statsIdx) => {
|
||||
if (!stats) {
|
||||
return;
|
||||
}
|
||||
|
||||
Object.entries(stats.gpu_usages || {}).forEach(([key, stats]) => {
|
||||
if (!(key in series)) {
|
||||
series[key] = { name: key, data: [] };
|
||||
}
|
||||
|
||||
if (stats.temp !== undefined) {
|
||||
hasValidGpu = true;
|
||||
series[key].data.push({ x: statsIdx + 1, y: stats.temp });
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
if (!hasValidGpu) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||
}, [statsHistory]);
|
||||
|
||||
// Check if Intel GPU has all 0% usage values (known bug)
|
||||
const showIntelGpuWarning = useMemo(() => {
|
||||
if (!statsHistory || statsHistory.length < 3) {
|
||||
@ -448,6 +482,40 @@ export default function GeneralMetrics({
|
||||
return Object.keys(series).length > 0 ? Object.values(series) : [];
|
||||
}, [statsHistory]);
|
||||
|
||||
const npuTempSeries = useMemo(() => {
|
||||
if (!statsHistory) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const series: {
|
||||
[key: string]: { name: string; data: { x: number; y: number }[] };
|
||||
} = {};
|
||||
let hasValidNpu = false;
|
||||
|
||||
statsHistory.forEach((stats, statsIdx) => {
|
||||
if (!stats) {
|
||||
return;
|
||||
}
|
||||
|
||||
Object.entries(stats.npu_usages || {}).forEach(([key, stats]) => {
|
||||
if (!(key in series)) {
|
||||
series[key] = { name: key, data: [] };
|
||||
}
|
||||
|
||||
if (stats.temp !== undefined) {
|
||||
hasValidNpu = true;
|
||||
series[key].data.push({ x: statsIdx + 1, y: stats.temp });
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
if (!hasValidNpu) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
|
||||
}, [statsHistory]);
|
||||
|
||||
// other processes stats
|
||||
|
||||
const hardwareType = useMemo(() => {
|
||||
@ -669,7 +737,11 @@ export default function GeneralMetrics({
|
||||
<div
|
||||
className={cn(
|
||||
"mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2",
|
||||
gpuEncSeries?.length && "md:grid-cols-4",
|
||||
gpuTempSeries?.length && "md:grid-cols-3",
|
||||
gpuEncSeries?.length && "xl:grid-cols-4",
|
||||
gpuEncSeries?.length &&
|
||||
gpuTempSeries?.length &&
|
||||
"3xl:grid-cols-5",
|
||||
)}
|
||||
>
|
||||
{statsHistory[0]?.gpu_usages && (
|
||||
@ -804,6 +876,30 @@ export default function GeneralMetrics({
|
||||
) : (
|
||||
<Skeleton className="aspect-video w-full" />
|
||||
)}
|
||||
{statsHistory.length != 0 ? (
|
||||
<>
|
||||
{gpuTempSeries && gpuTempSeries?.length != 0 && (
|
||||
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
|
||||
<div className="mb-5">
|
||||
{t("general.hardwareInfo.gpuTemperature")}
|
||||
</div>
|
||||
{gpuTempSeries.map((series) => (
|
||||
<ThresholdBarGraph
|
||||
key={series.name}
|
||||
graphId={`${series.name}-temp`}
|
||||
name={series.name}
|
||||
unit="°C"
|
||||
threshold={DetectorTempThreshold}
|
||||
updateTimes={updateTimes}
|
||||
data={[series]}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<Skeleton className="aspect-video w-full" />
|
||||
)}
|
||||
|
||||
{statsHistory[0]?.npu_usages && (
|
||||
<>
|
||||
@ -827,6 +923,30 @@ export default function GeneralMetrics({
|
||||
) : (
|
||||
<Skeleton className="aspect-video w-full" />
|
||||
)}
|
||||
{statsHistory.length != 0 ? (
|
||||
<>
|
||||
{npuTempSeries && npuTempSeries?.length != 0 && (
|
||||
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
|
||||
<div className="mb-5">
|
||||
{t("general.hardwareInfo.npuTemperature")}
|
||||
</div>
|
||||
{npuTempSeries.map((series) => (
|
||||
<ThresholdBarGraph
|
||||
key={series.name}
|
||||
graphId={`${series.name}-temp`}
|
||||
name={series.name}
|
||||
unit="°C"
|
||||
threshold={DetectorTempThreshold}
|
||||
updateTimes={updateTimes}
|
||||
data={[series]}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<Skeleton className="aspect-video w-full" />
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user