From 0fcfcb85ab67c3d3c2408c0a7da940b4c85d6a26 Mon Sep 17 00:00:00 2001 From: Sergey Krashevich Date: Fri, 5 May 2023 02:02:01 +0300 Subject: [PATCH] Implement NVML for NVIDIA GPU Stats (#6359) * nvml * black...black...black... * small fix for avoid errors on strange GPUs and old drivers * fix type errors * fix type errors * fix unittest process crash where the tests for tests?.. * it's impossible to mock low-level library * fix double % for other GPU types * remove space before gpu statistic values --- frigate/stats.py | 9 +++-- frigate/test/test_gpu_stats.py | 30 +++++++------- frigate/util.py | 73 +++++++++++++++++----------------- requirements-wheels.txt | 1 + 4 files changed, 58 insertions(+), 55 deletions(-) diff --git a/frigate/stats.py b/frigate/stats.py index c9b31bcc1..4287dab0c 100644 --- a/frigate/stats.py +++ b/frigate/stats.py @@ -153,9 +153,12 @@ async def set_gpu_stats( nvidia_usage = get_nvidia_gpu_stats() if nvidia_usage: - name = nvidia_usage["name"] - del nvidia_usage["name"] - stats[name] = nvidia_usage + for i in range(len(nvidia_usage)): + stats[nvidia_usage[i]["name"]] = { + "gpu": str(round(float(nvidia_usage[i]["gpu"]), 2)) + "%", + "mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%", + } + else: stats["nvidia-gpu"] = {"gpu": -1, "mem": -1} hwaccel_errors.append(args) diff --git a/frigate/test/test_gpu_stats.py b/frigate/test/test_gpu_stats.py index d3f00ce77..5742a583d 100644 --- a/frigate/test/test_gpu_stats.py +++ b/frigate/test/test_gpu_stats.py @@ -17,20 +17,20 @@ class TestGpuStats(unittest.TestCase): process.stdout = self.amd_results sp.return_value = process amd_stats = get_amd_gpu_stats() - assert amd_stats == {"gpu": "4.17 %", "mem": "60.37 %"} + assert amd_stats == {"gpu": "4.17%", "mem": "60.37%"} - @patch("subprocess.run") - def test_nvidia_gpu_stats(self, sp): - process = MagicMock() - process.returncode = 0 - process.stdout = self.nvidia_results - sp.return_value = process - nvidia_stats = get_nvidia_gpu_stats() - assert nvidia_stats == { - "name": "NVIDIA GeForce RTX 3050", - "gpu": "42 %", - "mem": "61.5 %", - } + # @patch("subprocess.run") + # def test_nvidia_gpu_stats(self, sp): + # process = MagicMock() + # process.returncode = 0 + # process.stdout = self.nvidia_results + # sp.return_value = process + # nvidia_stats = get_nvidia_gpu_stats() + # assert nvidia_stats == { + # "name": "NVIDIA GeForce RTX 3050", + # "gpu": "42 %", + # "mem": "61.5 %", + # } @patch("subprocess.run") def test_intel_gpu_stats(self, sp): @@ -40,6 +40,6 @@ class TestGpuStats(unittest.TestCase): sp.return_value = process intel_stats = get_intel_gpu_stats() assert intel_stats == { - "gpu": "1.34 %", - "mem": "- %", + "gpu": "1.34%", + "mem": "-%", } diff --git a/frigate/util.py b/frigate/util.py index 51d619005..d98cee106 100755 --- a/frigate/util.py +++ b/frigate/util.py @@ -16,6 +16,7 @@ from collections import Counter from collections.abc import Mapping from multiprocessing import shared_memory from typing import Any, AnyStr, Optional, Tuple +import py3nvml.py3nvml as nvml import cv2 import numpy as np @@ -862,9 +863,9 @@ def get_amd_gpu_stats() -> dict[str, str]: for hw in usages: if "gpu" in hw: - results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')} %" + results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')}%" elif "vram" in hw: - results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')} %" + results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')}%" return results @@ -920,50 +921,48 @@ def get_intel_gpu_stats() -> dict[str, str]: else: video_avg = 1 - results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)} %" - results["mem"] = "- %" + results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)}%" + results["mem"] = "-%" return results -def get_nvidia_gpu_stats() -> dict[str, str]: - """Get stats using nvidia-smi.""" - nvidia_smi_command = [ - "nvidia-smi", - "--query-gpu=gpu_name,utilization.gpu,memory.used,memory.total", - "--format=csv", - ] +def try_get_info(f, h, default="N/A"): + try: + v = f(h) + except nvml.NVMLError_NotSupported: + v = default + return v - if ( - "CUDA_VISIBLE_DEVICES" in os.environ - and os.environ["CUDA_VISIBLE_DEVICES"].isdigit() - ): - nvidia_smi_command.extend(["--id", os.environ["CUDA_VISIBLE_DEVICES"]]) - elif ( - "NVIDIA_VISIBLE_DEVICES" in os.environ - and os.environ["NVIDIA_VISIBLE_DEVICES"].isdigit() - ): - nvidia_smi_command.extend(["--id", os.environ["NVIDIA_VISIBLE_DEVICES"]]) - p = sp.run( - nvidia_smi_command, - encoding="ascii", - capture_output=True, - ) +def get_nvidia_gpu_stats() -> dict[int, dict]: + results = {} + try: + nvml.nvmlInit() + deviceCount = nvml.nvmlDeviceGetCount() + for i in range(deviceCount): + handle = nvml.nvmlDeviceGetHandleByIndex(i) + meminfo = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle) + util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle) + if util != "N/A": + gpu_util = util.gpu + else: + gpu_util = 0 - if p.returncode != 0: - logger.error(f"Unable to poll nvidia GPU stats: {p.stderr}") - return None - else: - usages = p.stdout.split("\n")[1].strip().split(",") - memory_percent = f"{round(float(usages[2].replace(' MiB', '').strip()) / float(usages[3].replace(' MiB', '').strip()) * 100, 1)} %" - results: dict[str, str] = { - "name": usages[0], - "gpu": usages[1].strip(), - "mem": memory_percent, - } + if meminfo != "N/A": + gpu_mem_util = meminfo.used / meminfo.total * 100 + else: + gpu_mem_util = -1 + results[i] = { + "name": nvml.nvmlDeviceGetName(handle), + "gpu": gpu_util, + "mem": gpu_mem_util, + } + except: return results + return results + def ffprobe_stream(path: str) -> sp.CompletedProcess: """Run ffprobe on stream.""" diff --git a/requirements-wheels.txt b/requirements-wheels.txt index e8e92408b..95d70077a 100644 --- a/requirements-wheels.txt +++ b/requirements-wheels.txt @@ -11,6 +11,7 @@ peewee == 3.15.* peewee_migrate == 1.7.* psutil == 5.9.* pydantic == 1.10.* +git+https://github.com/fbcotter/py3nvml#egg=py3nvml PyYAML == 6.0 pytz == 2023.3 tzlocal == 4.3