Implement NVML for NVIDIA GPU Stats (#6359)

* nvml

* black...black...black...

* small fix to avoid errors on strange GPUs and old drivers

* fix type errors

* fix type errors

* fix unittest process crash

where are the tests for the tests?..

* it's impossible to mock the low-level library

* fix double % for other GPU types

* remove space before gpu statistic values
Sergey Krashevich 2023-05-05 02:02:01 +03:00 committed by GitHub
parent ef50af03f2
commit 0fcfcb85ab
4 changed files with 58 additions and 55 deletions
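
For orientation, here is a minimal, self-contained sketch (not part of the commit) of the py3nvml calls the new helper builds on; py3nvml mirrors the NVML C API under names such as nvmlInit and nvmlDeviceGetUtilizationRates, and the output formatting below is purely illustrative:

import py3nvml.py3nvml as nvml

nvml.nvmlInit()
try:
    # Walk every visible NVIDIA device and print utilization and memory use.
    for i in range(nvml.nvmlDeviceGetCount()):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = nvml.nvmlDeviceGetName(handle)
        util = nvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu / .memory are percentages
        mem = nvml.nvmlDeviceGetMemoryInfo(handle)         # .used / .total are bytes
        print(f"{name}: gpu={util.gpu}% mem={mem.used / mem.total * 100:.2f}%")
finally:
    nvml.nvmlShutdown()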


@@ -153,9 +153,12 @@ async def set_gpu_stats(
             nvidia_usage = get_nvidia_gpu_stats()
 
             if nvidia_usage:
-                name = nvidia_usage["name"]
-                del nvidia_usage["name"]
-                stats[name] = nvidia_usage
+                for i in range(len(nvidia_usage)):
+                    stats[nvidia_usage[i]["name"]] = {
+                        "gpu": str(round(float(nvidia_usage[i]["gpu"]), 2)) + "%",
+                        "mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
+                    }
             else:
                 stats["nvidia-gpu"] = {"gpu": -1, "mem": -1}
                 hwaccel_errors.append(args)
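
An illustration (not from the commit) of the new data flow: get_nvidia_gpu_stats now returns one entry per device index carrying raw numbers, and the loop above turns each into a name-keyed entry with "%"-suffixed strings. The device names and values below are made up:

# Hypothetical return value of get_nvidia_gpu_stats() on a two-GPU host.
nvidia_usage = {
    0: {"name": "NVIDIA GeForce RTX 3050", "gpu": 42, "mem": 61.5},
    1: {"name": "NVIDIA T400", "gpu": 3, "mem": 12.25},
}

stats = {}
for i in range(len(nvidia_usage)):
    stats[nvidia_usage[i]["name"]] = {
        "gpu": str(round(float(nvidia_usage[i]["gpu"]), 2)) + "%",
        "mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
    }

# stats == {
#     "NVIDIA GeForce RTX 3050": {"gpu": "42.0%", "mem": "61.5%"},
#     "NVIDIA T400": {"gpu": "3.0%", "mem": "12.25%"},
# }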


@@ -17,20 +17,20 @@ class TestGpuStats(unittest.TestCase):
         process.stdout = self.amd_results
         sp.return_value = process
         amd_stats = get_amd_gpu_stats()
-        assert amd_stats == {"gpu": "4.17 %", "mem": "60.37 %"}
+        assert amd_stats == {"gpu": "4.17%", "mem": "60.37%"}
 
-    @patch("subprocess.run")
-    def test_nvidia_gpu_stats(self, sp):
-        process = MagicMock()
-        process.returncode = 0
-        process.stdout = self.nvidia_results
-        sp.return_value = process
-        nvidia_stats = get_nvidia_gpu_stats()
-        assert nvidia_stats == {
-            "name": "NVIDIA GeForce RTX 3050",
-            "gpu": "42 %",
-            "mem": "61.5 %",
-        }
+    # @patch("subprocess.run")
+    # def test_nvidia_gpu_stats(self, sp):
+    #     process = MagicMock()
+    #     process.returncode = 0
+    #     process.stdout = self.nvidia_results
+    #     sp.return_value = process
+    #     nvidia_stats = get_nvidia_gpu_stats()
+    #     assert nvidia_stats == {
+    #         "name": "NVIDIA GeForce RTX 3050",
+    #         "gpu": "42 %",
+    #         "mem": "61.5 %",
+    #     }
 
     @patch("subprocess.run")
     def test_intel_gpu_stats(self, sp):
@@ -40,6 +40,6 @@ class TestGpuStats(unittest.TestCase):
         sp.return_value = process
         intel_stats = get_intel_gpu_stats()
         assert intel_stats == {
-            "gpu": "1.34 %",
-            "mem": "- %",
+            "gpu": "1.34%",
+            "mem": "-%",
         }
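
The nvidia-smi based test above is disabled because, as the commit message notes, the low-level library cannot be mocked through subprocess.run. One possible way to restore coverage, sketched here as an assumption rather than anything in this commit, is to patch the py3nvml module attributes directly and feed the helper fake NVML structs; the import path and all values are illustrative:

from types import SimpleNamespace
from unittest.mock import patch

from frigate.util import get_nvidia_gpu_stats  # assumed module path, adjust as needed


# Decorators are applied bottom-up, so the first test argument is the
# utilization mock and the last is the nvmlInit mock.
@patch("py3nvml.py3nvml.nvmlInit")
@patch("py3nvml.py3nvml.nvmlDeviceGetCount", return_value=1)
@patch("py3nvml.py3nvml.nvmlDeviceGetHandleByIndex", return_value=object())
@patch("py3nvml.py3nvml.nvmlDeviceGetName", return_value="NVIDIA GeForce RTX 3050")
@patch(
    "py3nvml.py3nvml.nvmlDeviceGetMemoryInfo",
    return_value=SimpleNamespace(used=4_000, total=8_000),
)
@patch(
    "py3nvml.py3nvml.nvmlDeviceGetUtilizationRates",
    return_value=SimpleNamespace(gpu=42, memory=5),
)
def test_nvidia_gpu_stats(util_mock, mem_mock, name_mock, handle_mock, count_mock, init_mock):
    nvidia_stats = get_nvidia_gpu_stats()
    # 4_000 / 8_000 * 100 == 50.0, so the memory figure is exact.
    assert nvidia_stats == {
        0: {"name": "NVIDIA GeForce RTX 3050", "gpu": 42, "mem": 50.0}
    }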


@@ -16,6 +16,7 @@ from collections import Counter
 from collections.abc import Mapping
 from multiprocessing import shared_memory
 from typing import Any, AnyStr, Optional, Tuple
+import py3nvml.py3nvml as nvml
 
 import cv2
 import numpy as np
@@ -862,9 +863,9 @@ def get_amd_gpu_stats() -> dict[str, str]:
     for hw in usages:
         if "gpu" in hw:
-            results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
+            results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')}%"
         elif "vram" in hw:
-            results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
+            results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')}%"
 
     return results
@@ -920,47 +921,45 @@ def get_intel_gpu_stats() -> dict[str, str]:
     else:
         video_avg = 1
 
-    results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)} %"
-    results["mem"] = "- %"
+    results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)}%"
+    results["mem"] = "-%"
     return results
 
 
-def get_nvidia_gpu_stats() -> dict[str, str]:
-    """Get stats using nvidia-smi."""
-    nvidia_smi_command = [
-        "nvidia-smi",
-        "--query-gpu=gpu_name,utilization.gpu,memory.used,memory.total",
-        "--format=csv",
-    ]
-
-    if (
-        "CUDA_VISIBLE_DEVICES" in os.environ
-        and os.environ["CUDA_VISIBLE_DEVICES"].isdigit()
-    ):
-        nvidia_smi_command.extend(["--id", os.environ["CUDA_VISIBLE_DEVICES"]])
-    elif (
-        "NVIDIA_VISIBLE_DEVICES" in os.environ
-        and os.environ["NVIDIA_VISIBLE_DEVICES"].isdigit()
-    ):
-        nvidia_smi_command.extend(["--id", os.environ["NVIDIA_VISIBLE_DEVICES"]])
-
-    p = sp.run(
-        nvidia_smi_command,
-        encoding="ascii",
-        capture_output=True,
-    )
-
-    if p.returncode != 0:
-        logger.error(f"Unable to poll nvidia GPU stats: {p.stderr}")
-        return None
-    else:
-        usages = p.stdout.split("\n")[1].strip().split(",")
-        memory_percent = f"{round(float(usages[2].replace(' MiB', '').strip()) / float(usages[3].replace(' MiB', '').strip()) * 100, 1)} %"
-        results: dict[str, str] = {
-            "name": usages[0],
-            "gpu": usages[1].strip(),
-            "mem": memory_percent,
-        }
-
-        return results
+def try_get_info(f, h, default="N/A"):
+    try:
+        v = f(h)
+    except nvml.NVMLError_NotSupported:
+        v = default
+    return v
+
+
+def get_nvidia_gpu_stats() -> dict[int, dict]:
+    results = {}
+    try:
+        nvml.nvmlInit()
+        deviceCount = nvml.nvmlDeviceGetCount()
+        for i in range(deviceCount):
+            handle = nvml.nvmlDeviceGetHandleByIndex(i)
+            meminfo = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
+            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
+            if util != "N/A":
+                gpu_util = util.gpu
+            else:
+                gpu_util = 0
+
+            if meminfo != "N/A":
+                gpu_mem_util = meminfo.used / meminfo.total * 100
+            else:
+                gpu_mem_util = -1
+
+            results[i] = {
+                "name": nvml.nvmlDeviceGetName(handle),
+                "gpu": gpu_util,
+                "mem": gpu_mem_util,
+            }
+    except:
+        return results
+
+    return results
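
To make the fallback behaviour concrete, a small hedged example of what a caller might see on an old driver; the import path and device name are assumptions for illustration, not part of the commit:

from frigate.util import get_nvidia_gpu_stats  # assumed module path

stats = get_nvidia_gpu_stats()

# If the driver rejects both the utilization and memory queries with
# NVMLError_NotSupported, the branches above yield placeholder values, e.g.:
#   {0: {"name": "NVIDIA GeForce GT 710", "gpu": 0, "mem": -1}}
# Any other NVML failure (for example, no NVIDIA driver at all) hits the
# bare except, and whatever was collected so far, possibly {}, is returned.
print(stats)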


@@ -11,6 +11,7 @@ peewee == 3.15.*
 peewee_migrate == 1.7.*
 psutil == 5.9.*
 pydantic == 1.10.*
+git+https://github.com/fbcotter/py3nvml#egg=py3nvml
 PyYAML == 6.0
 pytz == 2023.3
 tzlocal == 4.3