Add support for GPU and NPU temperatures (#21495)

* Add rockchip temps

* Add support for GPU and NPU temperatures in the frontend

* Add support for Nvidia temperature

* Improve separation

* Adjust graph scaling
This commit is contained in:
Nicolas Mowen 2025-12-31 13:32:07 -07:00 committed by GitHub
parent 7fb8d9b050
commit aa0b082184
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 166 additions and 7 deletions

View File

@ -123,6 +123,10 @@ def get_detector_temperature(
if index < len(hailo_device_names):
device_name = hailo_device_names[index]
return hailo_temps[device_name]
elif detector_type == "rknn":
# Rockchip temperatures are reported via the GPU / NPU stats
# because there are no detector-specific temperatures
pass
return None
@ -242,6 +246,7 @@ async def set_gpu_stats(
"mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
"enc": str(round(float(nvidia_usage[i]["enc"]), 2)) + "%",
"dec": str(round(float(nvidia_usage[i]["dec"]), 2)) + "%",
"temp": str(nvidia_usage[i]["temp"]),
}
else:

View File

@ -417,12 +417,12 @@ def get_openvino_npu_stats() -> Optional[dict[str, str]]:
else:
usage = 0.0
return {"npu": f"{round(usage, 2)}", "mem": "-"}
return {"npu": f"{round(usage, 2)}", "mem": "-%"}
except (FileNotFoundError, PermissionError, ValueError):
return None
def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
def get_rockchip_gpu_stats() -> Optional[dict[str, str | float]]:
"""Get GPU stats using rk."""
try:
with open("/sys/kernel/debug/rkrga/load", "r") as f:
@ -440,7 +440,16 @@ def get_rockchip_gpu_stats() -> Optional[dict[str, str]]:
return None
average_load = f"{round(sum(load_values) / len(load_values), 2)}%"
return {"gpu": average_load, "mem": "-"}
stats: dict[str, str | float] = {"gpu": average_load, "mem": "-%"}
try:
with open("/sys/class/thermal/thermal_zone5/temp", "r") as f:
line = f.readline().strip()
stats["temp"] = round(int(line) / 1000, 1)
except (FileNotFoundError, OSError, ValueError):
pass
return stats
def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
@ -463,13 +472,25 @@ def get_rockchip_npu_stats() -> Optional[dict[str, float | str]]:
percentages = [int(load) for load in core_loads]
mean = round(sum(percentages) / len(percentages), 2)
return {"npu": mean, "mem": "-"}
stats: dict[str, float | str] = {"npu": mean, "mem": "-%"}
try:
with open("/sys/class/thermal/thermal_zone6/temp", "r") as f:
line = f.readline().strip()
stats["temp"] = round(int(line) / 1000, 1)
except (FileNotFoundError, OSError, ValueError):
pass
return stats
def try_get_info(f, h, default="N/A"):
def try_get_info(f, h, default="N/A", sensor=None):
try:
if h:
v = f(h)
if sensor is not None:
v = f(h, sensor)
else:
v = f(h)
else:
v = f()
except nvml.NVMLError_NotSupported:
@ -498,6 +519,9 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
enc = try_get_info(nvml.nvmlDeviceGetEncoderUtilization, handle)
dec = try_get_info(nvml.nvmlDeviceGetDecoderUtilization, handle)
temp = try_get_info(
nvml.nvmlDeviceGetTemperature, handle, default=None, sensor=0
)
pstate = try_get_info(nvml.nvmlDeviceGetPowerState, handle, default=None)
if util != "N/A":
@ -510,6 +534,11 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
else:
gpu_mem_util = -1
if temp != "N/A" and temp is not None:
temp = float(temp)
else:
temp = None
if enc != "N/A":
enc_util = enc[0]
else:
@ -527,6 +556,7 @@ def get_nvidia_gpu_stats() -> dict[int, dict]:
"enc": enc_util,
"dec": dec_util,
"pstate": pstate or "unknown",
"temp": temp,
}
except Exception:
pass

View File

@ -51,6 +51,7 @@
"gpuMemory": "GPU Memory",
"gpuEncoder": "GPU Encoder",
"gpuDecoder": "GPU Decoder",
"gpuTemperature": "GPU Temperature",
"gpuInfo": {
"vainfoOutput": {
"title": "Vainfo Output",
@ -77,6 +78,7 @@
},
"npuUsage": "NPU Usage",
"npuMemory": "NPU Memory",
"npuTemperature": "NPU Temperature",
"intelGpuWarning": {
"title": "Intel GPU Stats Warning",
"message": "GPU stats unavailable",

View File

@ -61,11 +61,13 @@ export type GpuStats = {
enc?: string;
dec?: string;
pstate?: string;
temp?: number;
};
export type NpuStats = {
npu: number;
mem: string;
temp?: number;
};
export type GpuInfo = "vainfo" | "nvinfo";

View File

@ -368,6 +368,40 @@ export default function GeneralMetrics({
return Object.keys(series).length > 0 ? Object.values(series) : undefined;
}, [statsHistory]);
// Builds one temperature series per GPU from the stats history.
//
// Each data point uses the 1-based position of the snapshot in `statsHistory`
// as `x` and the reported temperature as `y`. A series is only created for a
// GPU that has at least one usable reading, so GPUs without a temperature
// sensor never produce an empty graph. The result is always an array; it is
// empty when there is no history or no GPU reported a temperature.
const gpuTempSeries = useMemo(() => {
  if (!statsHistory) {
    return [];
  }

  const series: {
    [key: string]: { name: string; data: { x: number; y: number }[] };
  } = {};

  statsHistory.forEach((stats, statsIdx) => {
    if (!stats) {
      return;
    }

    Object.entries(stats.gpu_usages || {}).forEach(([key, gpuStats]) => {
      // A missing reading can arrive as undefined, null, or a non-numeric
      // placeholder string; none of those may reach the graph.
      if (gpuStats.temp == null) {
        return;
      }

      // Coerce so numeric strings (e.g. "45.0") are plotted as numbers.
      const temp = Number(gpuStats.temp);

      if (!Number.isFinite(temp)) {
        return;
      }

      if (!(key in series)) {
        series[key] = { name: key, data: [] };
      }

      series[key].data.push({ x: statsIdx + 1, y: temp });
    });
  });

  return Object.values(series);
}, [statsHistory]);
// Check if Intel GPU has all 0% usage values (known bug)
const showIntelGpuWarning = useMemo(() => {
if (!statsHistory || statsHistory.length < 3) {
@ -448,6 +482,40 @@ export default function GeneralMetrics({
return Object.keys(series).length > 0 ? Object.values(series) : [];
}, [statsHistory]);
// Builds one temperature series per NPU from the stats history.
//
// Each data point uses the 1-based position of the snapshot in `statsHistory`
// as `x` and the reported temperature as `y`. A series is only created for an
// NPU that has at least one usable reading, so NPUs without a temperature
// sensor never produce an empty graph. The result is always an array; it is
// empty when there is no history or no NPU reported a temperature.
const npuTempSeries = useMemo(() => {
  if (!statsHistory) {
    return [];
  }

  const series: {
    [key: string]: { name: string; data: { x: number; y: number }[] };
  } = {};

  statsHistory.forEach((stats, statsIdx) => {
    if (!stats) {
      return;
    }

    Object.entries(stats.npu_usages || {}).forEach(([key, npuStats]) => {
      // A missing reading can arrive as undefined, null, or a non-numeric
      // placeholder; none of those may reach the graph.
      if (npuStats.temp == null) {
        return;
      }

      // Coerce so numeric strings are plotted as numbers.
      const temp = Number(npuStats.temp);

      if (!Number.isFinite(temp)) {
        return;
      }

      if (!(key in series)) {
        series[key] = { name: key, data: [] };
      }

      series[key].data.push({ x: statsIdx + 1, y: temp });
    });
  });

  return Object.values(series);
}, [statsHistory]);
// other processes stats
const hardwareType = useMemo(() => {
@ -669,7 +737,11 @@ export default function GeneralMetrics({
<div
className={cn(
"mt-4 grid grid-cols-1 gap-2 sm:grid-cols-2",
gpuEncSeries?.length && "md:grid-cols-4",
gpuTempSeries?.length && "md:grid-cols-3",
gpuEncSeries?.length && "xl:grid-cols-4",
gpuEncSeries?.length &&
gpuTempSeries?.length &&
"3xl:grid-cols-5",
)}
>
{statsHistory[0]?.gpu_usages && (
@ -804,6 +876,30 @@ export default function GeneralMetrics({
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory.length != 0 ? (
<>
{gpuTempSeries && gpuTempSeries?.length != 0 && (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
<div className="mb-5">
{t("general.hardwareInfo.gpuTemperature")}
</div>
{gpuTempSeries.map((series) => (
<ThresholdBarGraph
key={series.name}
graphId={`${series.name}-temp`}
name={series.name}
unit="°C"
threshold={DetectorTempThreshold}
updateTimes={updateTimes}
data={[series]}
/>
))}
</div>
)}
</>
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory[0]?.npu_usages && (
<>
@ -827,6 +923,30 @@ export default function GeneralMetrics({
) : (
<Skeleton className="aspect-video w-full" />
)}
{statsHistory.length != 0 ? (
<>
{npuTempSeries && npuTempSeries?.length != 0 && (
<div className="rounded-lg bg-background_alt p-2.5 md:rounded-2xl">
<div className="mb-5">
{t("general.hardwareInfo.npuTemperature")}
</div>
{npuTempSeries.map((series) => (
<ThresholdBarGraph
key={series.name}
graphId={`${series.name}-temp`}
name={series.name}
unit="°C"
threshold={DetectorTempThreshold}
updateTimes={updateTimes}
data={[series]}
/>
))}
</div>
)}
</>
) : (
<Skeleton className="aspect-video w-full" />
)}
</>
)}
</>