Add support for SR-IOV GPU stats (#15796)

* Add option to treat GPU as SR-IOV in order for stats to work correctly

* Add to intel docs

* fix tests
This commit is contained in:
Nicolas Mowen 2025-01-03 09:43:44 -06:00
parent f2cc16bf3c
commit eb85079f74
6 changed files with 25 additions and 7 deletions

View File

@ -175,6 +175,16 @@ For more information on the various values across different distributions, see h
Depending on your OS and kernel configuration, you may need to change the `/proc/sys/kernel/perf_event_paranoid` kernel tunable. You can test the change by running `sudo sh -c 'echo 2 >/proc/sys/kernel/perf_event_paranoid'` which will persist until a reboot. Make it permanent by running `sudo sh -c 'echo kernel.perf_event_paranoid=2 >> /etc/sysctl.d/local.conf'` Depending on your OS and kernel configuration, you may need to change the `/proc/sys/kernel/perf_event_paranoid` kernel tunable. You can test the change by running `sudo sh -c 'echo 2 >/proc/sys/kernel/perf_event_paranoid'` which will persist until a reboot. Make it permanent by running `sudo sh -c 'echo kernel.perf_event_paranoid=2 >> /etc/sysctl.d/local.conf'`
#### Stats for SR-IOV devices
When using virtualized GPUs via SR-IOV, `intel_gpu_top` must be given additional arguments for GPU stats to function. This can be enabled with the following config:
```yaml
telemetry:
  stats:
    sriov: True
```
## AMD/ATI GPUs (Radeon HD 2000 and newer GPUs) via libva-mesa-driver ## AMD/ATI GPUs (Radeon HD 2000 and newer GPUs) via libva-mesa-driver
VAAPI supports automatic profile selection so it will work automatically with both H.264 and H.265 streams. VAAPI supports automatic profile selection so it will work automatically with both H.264 and H.265 streams.

View File

@ -817,11 +817,13 @@ telemetry:
- lo - lo
# Optional: Configure system stats # Optional: Configure system stats
stats: stats:
# Enable AMD GPU stats (default: shown below) # Optional: Enable AMD GPU stats (default: shown below)
amd_gpu_stats: True amd_gpu_stats: True
# Enable Intel GPU stats (default: shown below) # Optional: Enable Intel GPU stats (default: shown below)
intel_gpu_stats: True intel_gpu_stats: True
# Enable network bandwidth stats monitoring for camera ffmpeg processes, go2rtc, and object detectors. (default: shown below) # Optional: Treat GPU as SR-IOV to fix GPU stats (default: shown below)
sriov: False
# Optional: Enable network bandwidth stats monitoring for camera ffmpeg processes, go2rtc, and object detectors. (default: shown below)
# NOTE: The container must either be privileged or have cap_net_admin, cap_net_raw capabilities enabled. # NOTE: The container must either be privileged or have cap_net_admin, cap_net_raw capabilities enabled.
network_bandwidth: False network_bandwidth: False
# Optional: Enable the latest version outbound check (default: shown below) # Optional: Enable the latest version outbound check (default: shown below)

View File

@ -11,6 +11,9 @@ class StatsConfig(FrigateBaseModel):
network_bandwidth: bool = Field( network_bandwidth: bool = Field(
default=False, title="Enable network bandwidth for ffmpeg processes." default=False, title="Enable network bandwidth for ffmpeg processes."
) )
sriov: bool = Field(
default=False, title="Treat device as SR-IOV to support GPU stats."
)
class TelemetryConfig(FrigateBaseModel): class TelemetryConfig(FrigateBaseModel):

View File

@ -195,7 +195,7 @@ async def set_gpu_stats(
continue continue
# intel QSV GPU # intel QSV GPU
intel_usage = get_intel_gpu_stats() intel_usage = get_intel_gpu_stats(config.telemetry.stats.sriov)
if intel_usage is not None: if intel_usage is not None:
stats["intel-qsv"] = intel_usage or {"gpu": "", "mem": ""} stats["intel-qsv"] = intel_usage or {"gpu": "", "mem": ""}
@ -220,7 +220,7 @@ async def set_gpu_stats(
continue continue
# intel VAAPI GPU # intel VAAPI GPU
intel_usage = get_intel_gpu_stats() intel_usage = get_intel_gpu_stats(config.telemetry.stats.sriov)
if intel_usage is not None: if intel_usage is not None:
stats["intel-vaapi"] = intel_usage or {"gpu": "", "mem": ""} stats["intel-vaapi"] = intel_usage or {"gpu": "", "mem": ""}

View File

@ -38,7 +38,7 @@ class TestGpuStats(unittest.TestCase):
process.returncode = 124 process.returncode = 124
process.stdout = self.intel_results process.stdout = self.intel_results
sp.return_value = process sp.return_value = process
intel_stats = get_intel_gpu_stats() intel_stats = get_intel_gpu_stats(False)
print(f"the intel stats are {intel_stats}") print(f"the intel stats are {intel_stats}")
assert intel_stats == { assert intel_stats == {
"gpu": "1.13%", "gpu": "1.13%",

View File

@ -255,7 +255,7 @@ def get_amd_gpu_stats() -> dict[str, str]:
return results return results
def get_intel_gpu_stats() -> dict[str, str]: def get_intel_gpu_stats(sriov: bool) -> dict[str, str]:
"""Get stats using intel_gpu_top.""" """Get stats using intel_gpu_top."""
def get_stats_manually(output: str) -> dict[str, str]: def get_stats_manually(output: str) -> dict[str, str]:
@ -302,6 +302,9 @@ def get_intel_gpu_stats() -> dict[str, str]:
"1", "1",
] ]
if sriov:
intel_gpu_top_command += ["-d", "drm:/dev/dri/card0"]
p = sp.run( p = sp.run(
intel_gpu_top_command, intel_gpu_top_command,
encoding="ascii", encoding="ascii",