Add support for SR-IOV GPU stats (#15796)

* Add option to treat GPU as SRIOV in order for stats to work correctly

* Add to intel docs

* fix tests
This commit is contained in:
Nicolas Mowen 2025-01-03 09:43:44 -06:00
parent f2cc16bf3c
commit eb85079f74
6 changed files with 25 additions and 7 deletions

View File

@ -175,6 +175,16 @@ For more information on the various values across different distributions, see h
Depending on your OS and kernel configuration, you may need to change the `/proc/sys/kernel/perf_event_paranoid` kernel tunable. You can test the change by running `sudo sh -c 'echo 2 >/proc/sys/kernel/perf_event_paranoid'` which will persist until a reboot. Make it permanent by running `sudo sh -c 'echo kernel.perf_event_paranoid=2 >> /etc/sysctl.d/local.conf'`
#### Stats for SR-IOV devices
When using virtualized GPUs via SR-IOV, `intel_gpu_top` needs additional arguments for GPU stats to function. Enable them with the following config:
```yaml
telemetry:
  stats:
    sriov: True
```
## AMD/ATI GPUs (Radeon HD 2000 and newer GPUs) via libva-mesa-driver
VAAPI supports automatic profile selection so it will work automatically with both H.264 and H.265 streams.

View File

@ -817,11 +817,13 @@ telemetry:
- lo
# Optional: Configure system stats
stats:
# Enable AMD GPU stats (default: shown below)
# Optional: Enable AMD GPU stats (default: shown below)
amd_gpu_stats: True
# Enable Intel GPU stats (default: shown below)
# Optional: Enable Intel GPU stats (default: shown below)
intel_gpu_stats: True
# Enable network bandwidth stats monitoring for camera ffmpeg processes, go2rtc, and object detectors. (default: shown below)
# Optional: Treat GPU as SR-IOV to fix GPU stats (default: shown below)
sriov: False
# Optional: Enable network bandwidth stats monitoring for camera ffmpeg processes, go2rtc, and object detectors. (default: shown below)
# NOTE: The container must either be privileged or have cap_net_admin, cap_net_raw capabilities enabled.
network_bandwidth: False
# Optional: Enable the latest version outbound check (default: shown below)

View File

@ -11,6 +11,9 @@ class StatsConfig(FrigateBaseModel):
network_bandwidth: bool = Field(
default=False, title="Enable network bandwidth for ffmpeg processes."
)
sriov: bool = Field(
default=False, title="Treat device as SR-IOV to support GPU stats."
)
class TelemetryConfig(FrigateBaseModel):

View File

@ -195,7 +195,7 @@ async def set_gpu_stats(
continue
# intel QSV GPU
intel_usage = get_intel_gpu_stats()
intel_usage = get_intel_gpu_stats(config.telemetry.stats.sriov)
if intel_usage is not None:
stats["intel-qsv"] = intel_usage or {"gpu": "", "mem": ""}
@ -220,7 +220,7 @@ async def set_gpu_stats(
continue
# intel VAAPI GPU
intel_usage = get_intel_gpu_stats()
intel_usage = get_intel_gpu_stats(config.telemetry.stats.sriov)
if intel_usage is not None:
stats["intel-vaapi"] = intel_usage or {"gpu": "", "mem": ""}

View File

@ -38,7 +38,7 @@ class TestGpuStats(unittest.TestCase):
process.returncode = 124
process.stdout = self.intel_results
sp.return_value = process
intel_stats = get_intel_gpu_stats()
intel_stats = get_intel_gpu_stats(False)
print(f"the intel stats are {intel_stats}")
assert intel_stats == {
"gpu": "1.13%",

View File

@ -255,7 +255,7 @@ def get_amd_gpu_stats() -> dict[str, str]:
return results
def get_intel_gpu_stats() -> dict[str, str]:
def get_intel_gpu_stats(sriov: bool) -> dict[str, str]:
"""Get stats using intel_gpu_top."""
def get_stats_manually(output: str) -> dict[str, str]:
@ -302,6 +302,9 @@ def get_intel_gpu_stats() -> dict[str, str]:
"1",
]
if sriov:
intel_gpu_top_command += ["-d", "drm:/dev/dri/card0"]
p = sp.run(
intel_gpu_top_command,
encoding="ascii",