Don't keep attempting gpu usage stats after failure (#4904)

* Don't log intel gpu top errors

* Keep list of errored hwaccel and don't send again

* Can log on first time

* Formatting & mypy
This commit is contained in:
Nicolas Mowen 2023-01-04 17:12:51 -07:00 committed by GitHub
parent 5e71d95cb1
commit ffa98a138b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -84,13 +84,15 @@ def get_temperatures() -> dict[str, float]:
return temps return temps
def get_processing_stats(config: FrigateConfig, stats: dict[str, str]) -> None: def get_processing_stats(
config: FrigateConfig, stats: dict[str, str], hwaccel_errors: list[str]
) -> None:
"""Get stats for cpu / gpu.""" """Get stats for cpu / gpu."""
async def run_tasks() -> None: async def run_tasks() -> None:
await asyncio.wait( await asyncio.wait(
[ [
asyncio.create_task(set_gpu_stats(config, stats)), asyncio.create_task(set_gpu_stats(config, stats, hwaccel_errors)),
asyncio.create_task(set_cpu_stats(stats)), asyncio.create_task(set_cpu_stats(stats)),
] ]
) )
@ -109,7 +111,9 @@ async def set_cpu_stats(all_stats: dict[str, Any]) -> None:
all_stats["cpu_usages"] = cpu_stats all_stats["cpu_usages"] = cpu_stats
async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> None: async def set_gpu_stats(
config: FrigateConfig, all_stats: dict[str, Any], hwaccel_errors: list[str]
) -> None:
"""Parse GPUs from hwaccel args and use for stats.""" """Parse GPUs from hwaccel args and use for stats."""
hwaccel_args = [] hwaccel_args = []
@ -119,7 +123,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
if isinstance(args, list): if isinstance(args, list):
args = " ".join(args) args = " ".join(args)
if args and args not in hwaccel_args: if args and args not in hwaccel_args and args not in hwaccel_errors:
hwaccel_args.append(args) hwaccel_args.append(args)
for stream_input in camera.ffmpeg.inputs: for stream_input in camera.ffmpeg.inputs:
@ -144,6 +148,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
stats[name] = nvidia_usage stats[name] = nvidia_usage
else: else:
stats["nvidia-gpu"] = {"gpu": -1, "mem": -1} stats["nvidia-gpu"] = {"gpu": -1, "mem": -1}
hwaccel_errors.append(args)
elif "qsv" in args: elif "qsv" in args:
# intel QSV GPU # intel QSV GPU
intel_usage = get_intel_gpu_stats() intel_usage = get_intel_gpu_stats()
@ -152,6 +157,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
stats["intel-qsv"] = intel_usage stats["intel-qsv"] = intel_usage
else: else:
stats["intel-qsv"] = {"gpu": -1, "mem": -1} stats["intel-qsv"] = {"gpu": -1, "mem": -1}
hwaccel_errors.append(args)
elif "vaapi" in args: elif "vaapi" in args:
driver = os.environ.get(DRIVER_ENV_VAR) driver = os.environ.get(DRIVER_ENV_VAR)
@ -163,6 +169,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
stats["amd-vaapi"] = amd_usage stats["amd-vaapi"] = amd_usage
else: else:
stats["amd-vaapi"] = {"gpu": -1, "mem": -1} stats["amd-vaapi"] = {"gpu": -1, "mem": -1}
hwaccel_errors.append(args)
else: else:
# intel VAAPI GPU # intel VAAPI GPU
intel_usage = get_intel_gpu_stats() intel_usage = get_intel_gpu_stats()
@ -171,6 +178,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
stats["intel-vaapi"] = intel_usage stats["intel-vaapi"] = intel_usage
else: else:
stats["intel-vaapi"] = {"gpu": -1, "mem": -1} stats["intel-vaapi"] = {"gpu": -1, "mem": -1}
hwaccel_errors.append(args)
elif "v4l2m2m" in args or "rpi" in args: elif "v4l2m2m" in args or "rpi" in args:
# RPi v4l2m2m is currently not able to get usage stats # RPi v4l2m2m is currently not able to get usage stats
stats["rpi-v4l2m2m"] = {"gpu": -1, "mem": -1} stats["rpi-v4l2m2m"] = {"gpu": -1, "mem": -1}
@ -180,7 +188,7 @@ async def set_gpu_stats(config: FrigateConfig, all_stats: dict[str, Any]) -> Non
def stats_snapshot( def stats_snapshot(
config: FrigateConfig, stats_tracking: StatsTrackingTypes config: FrigateConfig, stats_tracking: StatsTrackingTypes, hwaccel_errors: list[str]
) -> dict[str, Any]: ) -> dict[str, Any]:
"""Get a snapshot of the current stats that are being tracked.""" """Get a snapshot of the current stats that are being tracked."""
camera_metrics = stats_tracking["camera_metrics"] camera_metrics = stats_tracking["camera_metrics"]
@ -219,7 +227,7 @@ def stats_snapshot(
} }
stats["detection_fps"] = round(total_detection_fps, 2) stats["detection_fps"] = round(total_detection_fps, 2)
get_processing_stats(config, stats) get_processing_stats(config, stats, hwaccel_errors)
stats["service"] = { stats["service"] = {
"uptime": (int(time.time()) - stats_tracking["started"]), "uptime": (int(time.time()) - stats_tracking["started"]),
@ -255,10 +263,13 @@ class StatsEmitter(threading.Thread):
self.stats_tracking = stats_tracking self.stats_tracking = stats_tracking
self.dispatcher = dispatcher self.dispatcher = dispatcher
self.stop_event = stop_event self.stop_event = stop_event
self.hwaccel_errors: list[str] = []
def run(self) -> None: def run(self) -> None:
time.sleep(10) time.sleep(10)
while not self.stop_event.wait(self.config.mqtt.stats_interval): while not self.stop_event.wait(self.config.mqtt.stats_interval):
stats = stats_snapshot(self.config, self.stats_tracking) stats = stats_snapshot(
self.config, self.stats_tracking, self.hwaccel_errors
)
self.dispatcher.publish("stats", json.dumps(stats), retain=False) self.dispatcher.publish("stats", json.dumps(stats), retain=False)
logger.info(f"Exiting watchdog...") logger.info(f"Exiting watchdog...")