Add GPU stats to the /stats API and debug screen (#3931)

* Add ffprobe endpoint

* Get ffprobe for multiple inputs

* Copy ffprobe in output

* Fix bad if statement

* Return full output of ffprobe process

* Return full output of ffprobe process

* Make ffprobe button show dialog with output and option to copy

* Add driver names to consts

* Add driver env var name

* Setup general tracking for GPU stats

* Catch RPi args as well

* Add util to get radeontop results

* Add real amd GPU stats

* Fix missed arg

* pass config

* Use only the values

* Fix vram

* Add nvidia gpu stats

* Use nvidia stats

* Add chart for gpu stats

* Format AMD with space between percent

* Get correct nvidia %

* Start to add support for intel GPU stats

* Block out RPi as util is not currently available

* Formatting

* Fix mypy

* Strip for float conversion

* Strip for float conversion

* Fix percent formatting

* Remove name from gpu map

* Add tests and fix AMD formatting

* Add nvidia gpu stats test

* Formatting

* Add intel_gpu_top for testing

* Formatting

* Handle case where hwaccel is not setup

* Formatting

* Check to remove none

* Don't use set

* Cleanup and fix types

* Handle case where args is list

* Fix mypy

* Cast to str

* Fix type checking

* Return none instead of empty

* Fix organization

* Make keys consistent

* Make gpu match style

* Get support for vainfo

* Add vainfo endpoint

* Set vainfo output in error correctly

* Remove duplicate function

* Fix errors

* Do cpu & gpu work asynchonously

* Fix async

* Fix event loop

* Fix crash

* Fix naming

* Send empty data for gpu if error occurs

* Show error if gpu stats could not be retrieved

* Fix mypy

* Fix test

* Don't use json for vainfo

* Fix cross references

* Strip unicode still

* await vainfo response

* Add gpu deps

* Formatting

* remove comments

* Use empty string

* Add vainfo back in
This commit is contained in:
Nicolas Mowen 2022-11-28 18:24:20 -07:00 committed by GitHub
parent 3cb6d43fac
commit aaedd24f37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 365 additions and 11 deletions

View File

@ -8,7 +8,7 @@ apt-get -qq install --no-install-recommends -y \
apt-transport-https \
gnupg \
wget \
procps \
procps vainfo \
unzip locales tzdata libxml2 xz-utils \
python3-pip
@ -53,7 +53,7 @@ if [[ "${TARGETARCH}" == "amd64" ]]; then
echo 'deb http://deb.debian.org/debian testing main non-free' >/etc/apt/sources.list.d/debian-testing.list
apt-get -qq update
apt-get -qq install --no-install-recommends --no-install-suggests -y \
mesa-va-drivers libva-drm2 intel-media-va-driver-non-free i965-va-driver libmfx1
mesa-va-drivers libva-drm2 intel-media-va-driver-non-free i965-va-driver libmfx1 radeontop intel-gpu-tools
rm -f /etc/apt/sources.list.d/debian-testing.list
fi

View File

@ -11,3 +11,10 @@ PLUS_API_HOST = "https://api.frigate.video"
REGEX_CAMERA_NAME = "^[a-zA-Z0-9_-]+$"
REGEX_RTSP_CAMERA_USER_PASS = ":\/\/[a-zA-Z0-9_-]+:[\S]+@"
REGEX_HTTP_CAMERA_USER_PASS = "user=[a-zA-Z0-9_-]+&password=[\S]+"
# Known Driver Names
DRIVER_ENV_VAR = "LIBVA_DRIVER_NAME"
DRIVER_AMD = "radeonsi"
DRIVER_INTEL_i965 = "i965"
DRIVER_INTEL_iHD = "iHD"

View File

@ -26,11 +26,12 @@ from flask import (
from peewee import SqliteDatabase, operator, fn, DoesNotExist
from playhouse.shortcuts import model_to_dict
from frigate.config import CameraConfig
from frigate.const import CLIPS_DIR
from frigate.models import Event, Recordings
from frigate.object_processing import TrackedObject
from frigate.stats import stats_snapshot
from frigate.util import clean_camera_user_pass, ffprobe_stream
from frigate.util import clean_camera_user_pass, ffprobe_stream, vainfo_hwaccel
from frigate.version import VERSION
logger = logging.getLogger(__name__)
@ -608,7 +609,7 @@ def version():
@bp.route("/stats")
def stats():
stats = stats_snapshot(current_app.stats_tracking)
stats = stats_snapshot(current_app.frigate_config, current_app.stats_tracking)
return jsonify(stats)
@ -996,3 +997,19 @@ def ffprobe():
)
return jsonify(output)
@bp.route("/vainfo", methods=["GET"])
def vainfo():
vainfo = vainfo_hwaccel()
return jsonify(
{
"return_code": vainfo.returncode,
"stderr": vainfo.stderr.decode("unicode_escape").strip()
if vainfo.stderr.decode()
else "",
"stdout": vainfo.stdout.decode("unicode_escape").strip()
if vainfo.stdout.decode()
else "",
}
)

View File

@ -1,3 +1,4 @@
import asyncio
import json
import logging
import threading
@ -11,8 +12,9 @@ from multiprocessing.synchronize import Event as MpEvent
from frigate.comms.dispatcher import Dispatcher
from frigate.config import FrigateConfig
from frigate.const import RECORD_DIR, CLIPS_DIR, CACHE_DIR
from frigate.const import DRIVER_AMD, DRIVER_ENV_VAR, RECORD_DIR, CLIPS_DIR, CACHE_DIR
from frigate.types import StatsTrackingTypes, CameraMetricsTypes
from frigate.util import get_amd_gpu_stats, get_intel_gpu_stats, get_nvidia_gpu_stats
from frigate.version import VERSION
from frigate.util import get_cpu_stats
from frigate.object_detection import ObjectDetectProcess
@ -82,7 +84,96 @@ def get_temperatures() -> dict[str, float]:
return temps
def stats_snapshot(stats_tracking: StatsTrackingTypes) -> dict[str, Any]:
def get_processing_stats(config: FrigateConfig, stats: dict[str, str]) -> None:
"""Get stats for cpu / gpu."""
async def run_tasks() -> None:
await asyncio.wait(
[
asyncio.create_task(set_gpu_stats(config, stats)),
asyncio.create_task(set_cpu_stats(stats)),
]
)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(run_tasks())
loop.close()
async def set_cpu_stats(all_stats: dict[str, Any]) -> None:
"""Set cpu usage from top."""
cpu_stats = get_cpu_stats()
if cpu_stats:
all_stats["cpu_usages"] = cpu_stats
async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> None:
"""Parse GPUs from hwaccel args and use for stats."""
hwaccel_args = []
for camera in config.cameras.values():
args = camera.ffmpeg.hwaccel_args
if isinstance(args, list):
args = " ".join(args)
if args and args not in hwaccel_args:
hwaccel_args.append(args)
stats: dict[str, dict] = {}
for args in hwaccel_args:
if "cuvid" in args:
# nvidia GPU
nvidia_usage = get_nvidia_gpu_stats()
if nvidia_usage:
name = nvidia_usage["name"]
del nvidia_usage["name"]
stats[name] = nvidia_usage
else:
stats["nvidia-gpu"] = {"gpu": -1, "mem": -1}
elif "qsv" in args:
# intel QSV GPU
intel_usage = get_intel_gpu_stats()
if intel_usage:
stats["intel-qsv"] = intel_usage
else:
stats["intel-qsv"] = {"gpu": -1, "mem": -1}
elif "vaapi" in args:
driver = os.environ.get(DRIVER_ENV_VAR)
if driver == DRIVER_AMD:
# AMD VAAPI GPU
amd_usage = get_amd_gpu_stats()
if amd_usage:
stats["amd-vaapi"] = amd_usage
else:
stats["amd-vaapi"] = {"gpu": -1, "mem": -1}
else:
# intel VAAPI GPU
intel_usage = get_intel_gpu_stats()
if intel_usage:
stats["intel-vaapi"] = intel_usage
else:
stats["intel-vaapi"] = {"gpu": -1, "mem": -1}
elif "v4l2m2m" in args:
# RPi v4l2m2m is currently not able to get usage stats
stats["rpi-v4l2m2m"] = {"gpu": -1, "mem": -1}
if stats:
all_stats["gpu_usages"] = stats
def stats_snapshot(
config: FrigateConfig, stats_tracking: StatsTrackingTypes
) -> dict[str, Any]:
"""Get a snapshot of the current stats that are being tracked."""
camera_metrics = stats_tracking["camera_metrics"]
stats: dict[str, Any] = {}
@ -119,7 +210,7 @@ def stats_snapshot(stats_tracking: StatsTrackingTypes) -> dict[str, Any]:
}
stats["detection_fps"] = round(total_detection_fps, 2)
stats["cpu_usages"] = get_cpu_stats()
get_processing_stats(config, stats)
stats["service"] = {
"uptime": (int(time.time()) - stats_tracking["started"]),
@ -159,6 +250,6 @@ class StatsEmitter(threading.Thread):
def run(self) -> None:
time.sleep(10)
while not self.stop_event.wait(self.config.mqtt.stats_interval):
stats = stats_snapshot(self.stats_tracking)
stats = stats_snapshot(self.config, self.stats_tracking)
self.dispatcher.publish("stats", json.dumps(stats), retain=False)
logger.info(f"Exiting watchdog...")

View File

@ -0,0 +1,45 @@
import unittest
from unittest.mock import MagicMock, patch
from frigate.util import get_amd_gpu_stats, get_intel_gpu_stats, get_nvidia_gpu_stats
class TestGpuStats(unittest.TestCase):
def setUp(self):
self.amd_results = "Unknown Radeon card. <= R500 won't work, new cards might.\nDumping to -, line limit 1.\n1664070990.607556: bus 10, gpu 4.17%, ee 0.00%, vgt 0.00%, ta 0.00%, tc 0.00%, sx 0.00%, sh 0.00%, spi 0.83%, smx 0.00%, cr 0.00%, sc 0.00%, pa 0.00%, db 0.00%, cb 0.00%, vram 60.37% 294.04mb, gtt 0.33% 52.21mb, mclk 100.00% 1.800ghz, sclk 26.65% 0.533ghz\n"
self.intel_results = """{"period":{"duration":1.194033,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":3349.991164,"unit":"irq/s"},"rc6":{"value":47.844741,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":4.533124,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":6.194385,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}},{"period":{"duration":1.189291,"unit":"ms"},"frequency":{"requested":0.000000,"actual":0.000000,"unit":"MHz"},"interrupts":{"count":0.000000,"unit":"irq/s"},"rc6":{"value":100.000000,"unit":"%"},"engines":{"Render/3D/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Blitter/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"Video/1":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"},"VideoEnhance/0":{"busy":0.000000,"sema":0.000000,"wait":0.000000,"unit":"%"}}}"""
self.nvidia_results = "name, utilization.gpu [%], memory.used [MiB], memory.total [MiB]\nNVIDIA GeForce RTX 3050, 42 %, 5036 MiB, 8192 MiB\n"
@patch("subprocess.run")
def test_amd_gpu_stats(self, sp):
process = MagicMock()
process.returncode = 0
process.stdout = self.amd_results
sp.return_value = process
amd_stats = get_amd_gpu_stats()
assert amd_stats == {"gpu": "4.17 %", "mem": "60.37 %"}
@patch("subprocess.run")
def test_nvidia_gpu_stats(self, sp):
process = MagicMock()
process.returncode = 0
process.stdout = self.nvidia_results
sp.return_value = process
nvidia_stats = get_nvidia_gpu_stats()
assert nvidia_stats == {
"name": "NVIDIA GeForce RTX 3050",
"gpu": "42 %",
"mem": "61.5 %",
}
@patch("subprocess.run")
def test_intel_gpu_stats(self, sp):
process = MagicMock()
process.returncode = 0
process.stdout = self.intel_results
sp.return_value = process
intel_stats = get_intel_gpu_stats()
assert intel_stats == {
"gpu": "10.73 %",
"mem": "- %",
}

View File

@ -766,6 +766,105 @@ def get_cpu_stats() -> dict[str, dict]:
return usages
def get_amd_gpu_stats() -> dict[str, str]:
"""Get stats using radeontop."""
radeontop_command = ["radeontop", "-d", "-", "-l", "1"]
p = sp.run(
radeontop_command,
encoding="ascii",
capture_output=True,
)
if p.returncode != 0:
logger.error(p.stderr)
return None
else:
usages = p.stdout.split(",")
results: dict[str, str] = {}
for hw in usages:
if "gpu" in hw:
results["gpu"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
elif "vram" in hw:
results["mem"] = f"{hw.strip().split(' ')[1].replace('%', '')} %"
return results
def get_intel_gpu_stats() -> dict[str, str]:
"""Get stats using intel_gpu_top."""
intel_gpu_top_command = [
"timeout",
"0.1s",
"intel_gpu_top",
"-J",
"-o",
"-",
"-s",
"1",
]
p = sp.run(
intel_gpu_top_command,
encoding="ascii",
capture_output=True,
)
if p.returncode != 0:
logger.error(p.stderr)
return None
else:
readings = json.loads(f"[{p.stdout}]")
results: dict[str, str] = {}
for reading in readings:
if reading.get("engines", {}).get("Video/0", {}).get(
"busy", 0
) or reading.get("engines", {}).get("Video/1", {}).get("busy", 0):
gpu_usage = round(
float(reading.get("engines", {}).get("Video/0", {}).get("busy", 0))
+ float(
reading.get("engines", {}).get("Video/1", {}).get("busy", 0)
),
2,
)
results["gpu"] = f"{gpu_usage} %"
break
results["mem"] = "- %"
return results
def get_nvidia_gpu_stats() -> dict[str, str]:
"""Get stats using nvidia-smi."""
nvidia_smi_command = [
"nvidia-smi",
"--query-gpu=gpu_name,utilization.gpu,memory.used,memory.total",
"--format=csv",
]
p = sp.run(
nvidia_smi_command,
encoding="ascii",
capture_output=True,
)
if p.returncode != 0:
logger.error(p.stderr)
return None
else:
usages = p.stdout.split("\n")[1].strip().split(",")
memory_percent = f"{round(float(usages[2].replace(' MiB', '').strip()) / float(usages[3].replace(' MiB', '').strip()) * 100, 1)} %"
results: dict[str, str] = {
"name": usages[0],
"gpu": usages[1].strip(),
"mem": memory_percent,
}
return results
def ffprobe_stream(path: str) -> sp.CompletedProcess:
"""Run ffprobe on stream."""
ffprobe_cmd = [
@ -781,6 +880,12 @@ def ffprobe_stream(path: str) -> sp.CompletedProcess:
return sp.run(ffprobe_cmd, capture_output=True)
def vainfo_hwaccel() -> sp.CompletedProcess:
"""Run vainfo."""
ffprobe_cmd = ["vainfo"]
return sp.run(ffprobe_cmd, capture_output=True)
class FrameManager(ABC):
@abstractmethod
def create(self, name, size) -> AnyStr:

View File

@ -21,9 +21,17 @@ export default function System() {
} = useWs('stats');
const { data: initialStats } = useSWR('stats');
const { cpu_usages, detectors, service = {}, detection_fps: _, ...cameras } = stats || initialStats || emptyObject;
const {
cpu_usages,
gpu_usages,
detectors,
service = {},
detection_fps: _,
...cameras
} = stats || initialStats || emptyObject;
const detectorNames = Object.keys(detectors || emptyObject);
const gpuNames = Object.keys(gpu_usages || emptyObject);
const cameraNames = Object.keys(cameras || emptyObject);
const handleCopyConfig = useCallback(() => {
@ -55,9 +63,9 @@ export default function System() {
});
if (response.status === 200) {
setState({ showFfprobe: true, ffprobe: JSON.stringify(response.data, null, 2) });
setState({ ...state, showFfprobe: true, ffprobe: JSON.stringify(response.data, null, 2) });
} else {
setState({ ...state, ffprobe: 'There was an error getting the ffprobe output.' });
setState({ ...state, showFfprobe: true, ffprobe: 'There was an error getting the ffprobe output.' });
}
};
@ -66,11 +74,31 @@ export default function System() {
setState({ ...state, ffprobe: '', showFfprobe: false });
};
const onHandleVainfo = async (e) => {
if (e) {
e.stopPropagation();
}
const response = await axios.get('vainfo');
if (response.status === 200) {
setState({ ...state, showVainfo: true, vainfo: JSON.stringify(response.data, null, 2) });
} else {
setState({ ...state, showVainfo: true, vainfo: 'There was an error getting the vainfo output.' });
}
};
const onCopyVainfo = async () => {
await window.navigator.clipboard.writeText(JSON.stringify(state.vaifp, null, 2));
setState({ ...state, vainfo: '', showVainfo: false });
};
return (
<div className="space-y-4 p-2 px-4">
<Heading>
System <span className="text-sm">{service.version}</span>
</Heading>
{state.showFfprobe && (
<Dialog>
<div className="p-4">
@ -92,6 +120,23 @@ export default function System() {
</Dialog>
)}
{state.showVainfo && (
<Dialog>
<div className="p-4">
<Heading size="lg">Vainfo Output</Heading>
{state.vainfo != '' ? <p className="mb-2">{state.vainfo}</p> : <ActivityIndicator />}
</div>
<div className="p-2 flex justify-start flex-row-reverse space-x-2">
<Button className="ml-2" onClick={() => onCopyVainfo()} type="text">
Copy
</Button>
<Button className="ml-2" onClick={() => setState({ ...state, vainfo: '', showVainfo: false })} type="text">
Close
</Button>
</div>
</Dialog>
)}
{!detectors ? (
<div>
<ActivityIndicator />
@ -125,6 +170,50 @@ export default function System() {
))}
</div>
<div className="text-lg flex justify-between p-4">
<Heading size="lg">GPUs</Heading>
<Button onClick={(e) => onHandleVainfo(e)}>vainfo</Button>
</div>
{!gpu_usages ? (
<div className="p-4">
<Link href={'https://docs.frigate.video/configuration/hardware_acceleration'}>
Hardware acceleration has not been setup, see the docs to setup hardware acceleration.
</Link>
</div>
) : (
<div data-testid="gpus" className="grid grid-cols-1 3xl:grid-cols-3 md:grid-cols-2 gap-4">
{gpuNames.map((gpu) => (
<div key={gpu} className="dark:bg-gray-800 shadow-md hover:shadow-lg rounded-lg transition-shadow">
<div className="text-lg flex justify-between p-4">{gpu}</div>
<div className="p-2">
{gpu_usages[gpu]['gpu'] == -1 ? (
<div className="p-4">
There was an error getting usage stats. Either your GPU does not support this or frigate does
not have proper access.
</div>
) : (
<Table className="w-full">
<Thead>
<Tr>
<Th>Gpu %</Th>
<Th>Memory %</Th>
</Tr>
</Thead>
<Tbody>
<Tr>
<Td>{gpu_usages[gpu]['gpu']}</Td>
<Td>{gpu_usages[gpu]['mem']}</Td>
</Tr>
</Tbody>
</Table>
)}
</div>
</div>
))}
</div>
)}
<Heading size="lg">Cameras</Heading>
<div data-testid="cameras" className="grid grid-cols-1 3xl:grid-cols-3 md:grid-cols-2 gap-4">
{cameraNames.map((camera) => (