mirror of
				https://github.com/blakeblackshear/frigate.git
				synced 2025-10-27 10:52:11 +01:00 
			
		
		
		
	Implement NVML for NVIDIA GPU Stats (#6359)
* nvml * black...black...black... * small fix to avoid errors on strange GPUs and old drivers * fix type errors * fix type errors * fix unittest process crash where the tests for tests?.. * it's impossible to mock a low-level library * fix double % for other GPU types * remove space before gpu statistic values
This commit is contained in:
		
							parent
							
								
									ef50af03f2
								
							
						
					
					
						commit
						0fcfcb85ab
					
				@ -153,9 +153,12 @@ async def set_gpu_stats(
 | 
			
		||||
            nvidia_usage = get_nvidia_gpu_stats()
 | 
			
		||||
 | 
			
		||||
            if nvidia_usage:
 | 
			
		||||
                name = nvidia_usage["name"]
 | 
			
		||||
                del nvidia_usage["name"]
 | 
			
		||||
                stats[name] = nvidia_usage
 | 
			
		||||
                for i in range(len(nvidia_usage)):
 | 
			
		||||
                    stats[nvidia_usage[i]["name"]] = {
 | 
			
		||||
                        "gpu": str(round(float(nvidia_usage[i]["gpu"]), 2)) + "%",
 | 
			
		||||
                        "mem": str(round(float(nvidia_usage[i]["mem"]), 2)) + "%",
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
            else:
 | 
			
		||||
                stats["nvidia-gpu"] = {"gpu": -1, "mem": -1}
 | 
			
		||||
                hwaccel_errors.append(args)
 | 
			
		||||
 | 
			
		||||
@ -17,20 +17,20 @@ class TestGpuStats(unittest.TestCase):
 | 
			
		||||
        process.stdout = self.amd_results
 | 
			
		||||
        sp.return_value = process
 | 
			
		||||
        amd_stats = get_amd_gpu_stats()
 | 
			
		||||
        assert amd_stats == {"gpu": "4.17 %", "mem": "60.37 %"}
 | 
			
		||||
        assert amd_stats == {"gpu": "4.17%", "mem": "60.37%"}
 | 
			
		||||
 | 
			
		||||
    @patch("subprocess.run")
 | 
			
		||||
    def test_nvidia_gpu_stats(self, sp):
 | 
			
		||||
        process = MagicMock()
 | 
			
		||||
        process.returncode = 0
 | 
			
		||||
        process.stdout = self.nvidia_results
 | 
			
		||||
        sp.return_value = process
 | 
			
		||||
        nvidia_stats = get_nvidia_gpu_stats()
 | 
			
		||||
        assert nvidia_stats == {
 | 
			
		||||
            "name": "NVIDIA GeForce RTX 3050",
 | 
			
		||||
            "gpu": "42 %",
 | 
			
		||||
            "mem": "61.5 %",
 | 
			
		||||
        }
 | 
			
		||||
    # @patch("subprocess.run")
 | 
			
		||||
    # def test_nvidia_gpu_stats(self, sp):
 | 
			
		||||
    #    process = MagicMock()
 | 
			
		||||
    #    process.returncode = 0
 | 
			
		||||
    #    process.stdout = self.nvidia_results
 | 
			
		||||
    #    sp.return_value = process
 | 
			
		||||
    #    nvidia_stats = get_nvidia_gpu_stats()
 | 
			
		||||
    #    assert nvidia_stats == {
 | 
			
		||||
    #        "name": "NVIDIA GeForce RTX 3050",
 | 
			
		||||
    #        "gpu": "42 %",
 | 
			
		||||
    #        "mem": "61.5 %",
 | 
			
		||||
    #    }
 | 
			
		||||
 | 
			
		||||
    @patch("subprocess.run")
 | 
			
		||||
    def test_intel_gpu_stats(self, sp):
 | 
			
		||||
@ -40,6 +40,6 @@ class TestGpuStats(unittest.TestCase):
 | 
			
		||||
        sp.return_value = process
 | 
			
		||||
        intel_stats = get_intel_gpu_stats()
 | 
			
		||||
        assert intel_stats == {
 | 
			
		||||
            "gpu": "1.34 %",
 | 
			
		||||
            "mem": "- %",
 | 
			
		||||
            "gpu": "1.34%",
 | 
			
		||||
            "mem": "-%",
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@ -16,6 +16,7 @@ from collections import Counter
 | 
			
		||||
from collections.abc import Mapping
 | 
			
		||||
from multiprocessing import shared_memory
 | 
			
		||||
from typing import Any, AnyStr, Optional, Tuple
 | 
			
		||||
import py3nvml.py3nvml as nvml
 | 
			
		||||
 | 
			
		||||
import cv2
 | 
			
		||||
import numpy as np
 | 
			
		||||
@ -862,9 +863,9 @@ def get_amd_gpu_stats() -> dict[str, str]:
 | 
			
		||||
 | 
			
		||||
        for hw in usages:
 | 
			
		||||
            if "gpu" in hw:
 | 
			
		||||
                results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
 | 
			
		||||
                results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')}%"
 | 
			
		||||
            elif "vram" in hw:
 | 
			
		||||
                results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
 | 
			
		||||
                results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')}%"
 | 
			
		||||
 | 
			
		||||
        return results
 | 
			
		||||
 | 
			
		||||
@ -920,50 +921,48 @@ def get_intel_gpu_stats() -> dict[str, str]:
 | 
			
		||||
        else:
 | 
			
		||||
            video_avg = 1
 | 
			
		||||
 | 
			
		||||
        results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)} %"
 | 
			
		||||
        results["mem"] = "- %"
 | 
			
		||||
        results["gpu"] = f"{round((video_avg + render_avg) / 2, 2)}%"
 | 
			
		||||
        results["mem"] = "-%"
 | 
			
		||||
        return results
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_nvidia_gpu_stats() -> dict[str, str]:
 | 
			
		||||
    """Get stats using nvidia-smi."""
 | 
			
		||||
    nvidia_smi_command = [
 | 
			
		||||
        "nvidia-smi",
 | 
			
		||||
        "--query-gpu=gpu_name,utilization.gpu,memory.used,memory.total",
 | 
			
		||||
        "--format=csv",
 | 
			
		||||
    ]
 | 
			
		||||
def try_get_info(f, h, default="N/A"):
    """Run an NVML query and fall back to ``default`` when it is unsupported.

    Args:
        f: Callable that accepts a single handle argument (the callers in this
            file pass ``nvml.nvmlDeviceGet*`` functions).
        h: Handle forwarded to ``f``.
        default: Value returned when ``f`` raises
            ``nvml.NVMLError_NotSupported``.

    Returns:
        The result of ``f(h)``, or ``default`` if the query is not supported.
        Any other exception propagates to the caller.
    """
    try:
        return f(h)
    except nvml.NVMLError_NotSupported:
        return default
 | 
			
		||||
 | 
			
		||||
    if (
 | 
			
		||||
        "CUDA_VISIBLE_DEVICES" in os.environ
 | 
			
		||||
        and os.environ["CUDA_VISIBLE_DEVICES"].isdigit()
 | 
			
		||||
    ):
 | 
			
		||||
        nvidia_smi_command.extend(["--id", os.environ["CUDA_VISIBLE_DEVICES"]])
 | 
			
		||||
    elif (
 | 
			
		||||
        "NVIDIA_VISIBLE_DEVICES" in os.environ
 | 
			
		||||
        and os.environ["NVIDIA_VISIBLE_DEVICES"].isdigit()
 | 
			
		||||
    ):
 | 
			
		||||
        nvidia_smi_command.extend(["--id", os.environ["NVIDIA_VISIBLE_DEVICES"]])
 | 
			
		||||
 | 
			
		||||
    p = sp.run(
 | 
			
		||||
        nvidia_smi_command,
 | 
			
		||||
        encoding="ascii",
 | 
			
		||||
        capture_output=True,
 | 
			
		||||
    )
 | 
			
		||||
def get_nvidia_gpu_stats() -> dict[int, dict]:
    """Collect per-device stats for the NVIDIA GPUs enumerated through NVML.

    Returns:
        A dict keyed by NVML device index. Each value contains:
          - "name": the value returned by ``nvml.nvmlDeviceGetName``.
          - "gpu": ``util.gpu`` from ``nvmlDeviceGetUtilizationRates``, or 0
            when that query is not supported by the device.
          - "mem": used / total memory as a percentage, or -1 when the memory
            query is not supported.
        Polling is best-effort: if NVML raises part-way through, the devices
        gathered so far are returned (an empty dict if initialisation failed).
    """
    results: dict[int, dict] = {}
    try:
        nvml.nvmlInit()
        for i in range(nvml.nvmlDeviceGetCount()):
            handle = nvml.nvmlDeviceGetHandleByIndex(i)
            meminfo = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
            util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)

            # try_get_info returns the "N/A" sentinel for unsupported queries.
            gpu_util = util.gpu if util != "N/A" else 0
            if meminfo != "N/A":
                gpu_mem_util = meminfo.used / meminfo.total * 100
            else:
                gpu_mem_util = -1

            results[i] = {
                "name": nvml.nvmlDeviceGetName(handle),
                "gpu": gpu_util,
                "mem": gpu_mem_util,
            }
    except Exception:
        # Deliberately tolerant of odd GPUs / old drivers, but unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit through.
        logger.debug("Unable to poll nvidia GPU stats via NVML", exc_info=True)

    return results
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def ffprobe_stream(path: str) -> sp.CompletedProcess:
 | 
			
		||||
    """Run ffprobe on stream."""
 | 
			
		||||
 | 
			
		||||
@ -11,6 +11,7 @@ peewee == 3.15.*
 | 
			
		||||
peewee_migrate == 1.7.*
 | 
			
		||||
psutil == 5.9.*
 | 
			
		||||
pydantic == 1.10.*
 | 
			
		||||
git+https://github.com/fbcotter/py3nvml#egg=py3nvml
 | 
			
		||||
PyYAML == 6.0
 | 
			
		||||
pytz == 2023.3
 | 
			
		||||
tzlocal == 4.3
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user