2024-01-19 15:51:29 +01:00
|
|
|
import { collectDefaultMetrics } from 'prom-client';
|
2024-01-09 16:33:00 +01:00
|
|
|
import memoizee from 'memoizee';
|
2024-03-18 13:58:05 +01:00
|
|
|
import type EventEmitter from 'events';
|
|
|
|
import type { Knex } from 'knex';
|
2021-04-29 10:21:29 +02:00
|
|
|
import * as events from './metric-events';
|
2021-02-16 14:30:08 +01:00
|
|
|
import {
|
2021-11-02 15:13:46 +01:00
|
|
|
DB_POOL_UPDATE,
|
2018-05-23 11:24:24 +02:00
|
|
|
FEATURE_ARCHIVED,
|
2021-11-02 15:13:46 +01:00
|
|
|
FEATURE_CREATED,
|
2018-05-23 11:24:24 +02:00
|
|
|
FEATURE_REVIVED,
|
2021-11-12 13:15:51 +01:00
|
|
|
FEATURE_STRATEGY_ADD,
|
|
|
|
FEATURE_STRATEGY_REMOVE,
|
|
|
|
FEATURE_STRATEGY_UPDATE,
|
2022-09-08 11:01:27 +02:00
|
|
|
FEATURE_ENVIRONMENT_ENABLED,
|
|
|
|
FEATURE_ENVIRONMENT_DISABLED,
|
|
|
|
FEATURE_VARIANTS_UPDATED,
|
|
|
|
FEATURE_METADATA_UPDATED,
|
2021-11-02 15:13:46 +01:00
|
|
|
FEATURE_UPDATED,
|
2021-12-09 21:02:58 +01:00
|
|
|
CLIENT_METRICS,
|
2022-07-22 11:00:22 +02:00
|
|
|
CLIENT_REGISTER,
|
chore: Establish a baseline for the number of envs disabled per project (#6807)
This PR adds a counter in Prometheus for counting the number of
"environment disabled" events we get per project. The purpose of this is
to establish a baseline for one of the "project management UI" project's
key results.
## On gauges vs counters
This PR uses a counter. Using a gauge would give you the total number of
envs disabled, not the number of disable events. The difference is
subtle, but important.
For projects that were created before the new feature, the gauge might
be appropriate. Because each disabled env would require at least one
disabled event, we can get a floor of how many events were triggered for
each project.
However, for projects created after we introduce the planned change,
we're not interested in the total envs anymore, because you can disable
a hundred envs on creation with a single action. In this case, a gauge
showing 100 disabled envs would be misleading, because it didn't take
100 events to disable them.
So the interesting metric here is how many times did you specifically
disable an environment in project settings, hence the counter.
## Assumptions and future plans
To make this easier on ourselves, we make the following assumption: people
primarily disable envs **when creating a project**.
This means that there might be a few lagging indicators granting some
projects a smaller number of events than expected, but we may be able to
filter those out.
Further, if we had a metric for each project and its creation date, we
could correlate that with the metrics to answer the question "how many
envs do people disable in the first week? Two weeks? A month?". Or
worded differently: after creating a project, how long does it take for
people to configure environments?
Similarly, if we gather that data, it will also make filtering out the
number of events for projects created **after** the new changes have
been released much easier.
The good news: Because the project creation metric with dates is a
static aggregate, it can be applied at any time, even retroactively, to
see the effects.
2024-04-10 08:49:15 +02:00
|
|
|
PROJECT_ENVIRONMENT_REMOVED,
|
2021-04-29 10:21:29 +02:00
|
|
|
} from './types/events';
|
2024-03-18 13:58:05 +01:00
|
|
|
import type { IUnleashConfig } from './types/option';
|
2024-05-13 14:41:28 +02:00
|
|
|
import type { ISettingStore, IUnleashStores } from './types/stores';
|
2021-11-02 15:13:46 +01:00
|
|
|
import { hoursToMilliseconds, minutesToMilliseconds } from 'date-fns';
|
2024-03-18 13:58:05 +01:00
|
|
|
import type { InstanceStatsService } from './features/instance-stats/instance-stats-service';
|
|
|
|
import type { ValidatedClientMetrics } from './features/metrics/shared/schema';
|
|
|
|
import type { IEnvironment } from './types';
|
2024-02-22 14:29:21 +01:00
|
|
|
import {
|
|
|
|
createCounter,
|
|
|
|
createGauge,
|
|
|
|
createSummary,
|
|
|
|
createHistogram,
|
|
|
|
} from './util/metrics';
|
2024-03-18 13:58:05 +01:00
|
|
|
import type { SchedulerService } from './services';
|
2020-04-14 22:29:11 +02:00
|
|
|
|
2021-04-22 10:07:10 +02:00
|
|
|
export default class MetricsMonitor {
|
2024-02-08 17:15:42 +01:00
|
|
|
// No-op constructor: takes no arguments and performs no initialisation.
constructor() {}
|
2021-05-28 11:10:24 +02:00
|
|
|
|
2024-02-08 17:15:42 +01:00
|
|
|
async startMonitoring(
|
2021-04-22 10:07:10 +02:00
|
|
|
config: IUnleashConfig,
|
|
|
|
stores: IUnleashStores,
|
|
|
|
version: string,
|
|
|
|
eventBus: EventEmitter,
|
2022-10-25 13:10:27 +02:00
|
|
|
instanceStatsService: InstanceStatsService,
|
2024-02-08 17:15:42 +01:00
|
|
|
schedulerService: SchedulerService,
|
2021-08-12 15:04:37 +02:00
|
|
|
db: Knex,
|
2021-04-22 10:07:10 +02:00
|
|
|
): Promise<void> {
|
|
|
|
if (!config.server.serverMetrics) {
|
2023-11-29 13:09:30 +01:00
|
|
|
return Promise.resolve();
|
2020-12-16 14:49:11 +01:00
|
|
|
}
|
2020-08-03 13:34:10 +02:00
|
|
|
|
2024-01-09 16:33:00 +01:00
|
|
|
const { eventStore, environmentStore } = stores;
|
|
|
|
|
|
|
|
const cachedEnvironments: () => Promise<IEnvironment[]> = memoizee(
|
|
|
|
async () => environmentStore.getAll(),
|
|
|
|
{
|
|
|
|
promise: true,
|
|
|
|
maxAge: hoursToMilliseconds(1),
|
|
|
|
},
|
|
|
|
);
|
2017-06-28 14:21:05 +02:00
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
collectDefaultMetrics();
|
2020-02-28 14:50:32 +01:00
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const requestDuration = createSummary({
|
2020-12-16 14:49:11 +01:00
|
|
|
name: 'http_request_duration_milliseconds',
|
|
|
|
help: 'App response time',
|
2022-09-30 15:28:50 +02:00
|
|
|
labelNames: ['path', 'method', 'status', 'appName'],
|
2021-06-07 10:34:32 +02:00
|
|
|
percentiles: [0.1, 0.5, 0.9, 0.95, 0.99],
|
2022-08-30 13:53:28 +02:00
|
|
|
maxAgeSeconds: 600,
|
|
|
|
ageBuckets: 5,
|
2020-12-16 14:49:11 +01:00
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const schedulerDuration = createSummary({
|
2023-11-21 13:42:38 +01:00
|
|
|
name: 'scheduler_duration_seconds',
|
|
|
|
help: 'Scheduler duration time',
|
|
|
|
labelNames: ['jobId'],
|
|
|
|
percentiles: [0.1, 0.5, 0.9, 0.95, 0.99],
|
|
|
|
maxAgeSeconds: 600,
|
|
|
|
ageBuckets: 5,
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const dbDuration = createSummary({
|
2020-12-16 14:49:11 +01:00
|
|
|
name: 'db_query_duration_seconds',
|
|
|
|
help: 'DB query duration time',
|
|
|
|
labelNames: ['store', 'action'],
|
2021-06-07 10:34:32 +02:00
|
|
|
percentiles: [0.1, 0.5, 0.9, 0.95, 0.99],
|
2022-08-30 13:53:28 +02:00
|
|
|
maxAgeSeconds: 600,
|
|
|
|
ageBuckets: 5,
|
2020-12-16 14:49:11 +01:00
|
|
|
});
|
2024-03-12 13:27:04 +01:00
|
|
|
const functionDuration = createSummary({
|
|
|
|
name: 'function_duration_seconds',
|
|
|
|
help: 'Function duration time',
|
|
|
|
labelNames: ['functionName', 'className'],
|
2024-03-12 11:30:30 +01:00
|
|
|
percentiles: [0.1, 0.5, 0.9, 0.95, 0.99],
|
|
|
|
maxAgeSeconds: 600,
|
|
|
|
ageBuckets: 5,
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const featureToggleUpdateTotal = createCounter({
|
2020-12-16 14:49:11 +01:00
|
|
|
name: 'feature_toggle_update_total',
|
2022-09-08 11:01:27 +02:00
|
|
|
help: 'Number of times a toggle has been updated. Environment label would be "n/a" when it is not available, e.g. when a feature toggle is created.',
|
2024-01-09 16:33:00 +01:00
|
|
|
labelNames: ['toggle', 'project', 'environment', 'environmentType'],
|
2020-12-16 14:49:11 +01:00
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const featureToggleUsageTotal = createCounter({
|
2020-12-16 14:49:11 +01:00
|
|
|
name: 'feature_toggle_usage_total',
|
|
|
|
help: 'Number of times a feature toggle has been used',
|
|
|
|
labelNames: ['toggle', 'active', 'appName'],
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const featureTogglesTotal = createGauge({
|
2020-12-16 14:49:11 +01:00
|
|
|
name: 'feature_toggles_total',
|
|
|
|
help: 'Number of feature toggles',
|
|
|
|
labelNames: ['version'],
|
|
|
|
});
|
2024-03-28 12:40:30 +01:00
|
|
|
|
|
|
|
const featureTogglesArchivedTotal = createGauge({
|
|
|
|
name: 'feature_toggles_archived_total',
|
|
|
|
help: 'Number of archived feature toggles',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const usersTotal = createGauge({
|
2021-08-27 10:10:14 +02:00
|
|
|
name: 'users_total',
|
|
|
|
help: 'Number of users',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const serviceAccounts = createGauge({
|
2023-11-29 13:09:30 +01:00
|
|
|
name: 'service_accounts_total',
|
|
|
|
help: 'Number of service accounts',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const apiTokens = createGauge({
|
2023-11-29 13:09:30 +01:00
|
|
|
name: 'api_tokens_total',
|
|
|
|
help: 'Number of API tokens',
|
|
|
|
labelNames: ['type'],
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const enabledMetricsBucketsPreviousDay = createGauge({
|
2024-01-15 15:31:38 +01:00
|
|
|
name: 'enabled_metrics_buckets_previous_day',
|
|
|
|
help: 'Number of hourly enabled/disabled metric buckets in the previous day',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const variantMetricsBucketsPreviousDay = createGauge({
|
2024-01-15 15:31:38 +01:00
|
|
|
name: 'variant_metrics_buckets_previous_day',
|
|
|
|
help: 'Number of hourly variant metric buckets in the previous day',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const usersActive7days = createGauge({
|
2023-09-18 15:05:17 +02:00
|
|
|
name: 'users_active_7',
|
|
|
|
help: 'Number of users active in the last 7 days',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const usersActive30days = createGauge({
|
2023-09-18 15:05:17 +02:00
|
|
|
name: 'users_active_30',
|
|
|
|
help: 'Number of users active in the last 30 days',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const usersActive60days = createGauge({
|
2023-09-18 15:05:17 +02:00
|
|
|
name: 'users_active_60',
|
|
|
|
help: 'Number of users active in the last 60 days',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const usersActive90days = createGauge({
|
2023-09-18 15:05:17 +02:00
|
|
|
name: 'users_active_90',
|
|
|
|
help: 'Number of users active in the last 90 days',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const projectsTotal = createGauge({
|
2021-08-27 10:10:14 +02:00
|
|
|
name: 'projects_total',
|
|
|
|
help: 'Number of projects',
|
2023-09-25 11:07:59 +02:00
|
|
|
labelNames: ['mode'],
|
2021-08-27 10:10:14 +02:00
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const environmentsTotal = createGauge({
|
2022-09-06 13:24:13 +02:00
|
|
|
name: 'environments_total',
|
|
|
|
help: 'Number of environments',
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const groupsTotal = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'groups_total',
|
|
|
|
help: 'Number of groups',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const rolesTotal = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'roles_total',
|
|
|
|
help: 'Number of roles',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const customRootRolesTotal = createGauge({
|
2023-08-07 15:59:29 +02:00
|
|
|
name: 'custom_root_roles_total',
|
|
|
|
help: 'Number of custom root roles',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const customRootRolesInUseTotal = createGauge({
|
feat: add prom metric for total custom root roles in use (#4438)
https://linear.app/unleash/issue/2-1311/add-a-new-prometheus-metric-with-custom-root-roles-in-use
As a follow-up to https://github.com/Unleash/unleash/pull/4435, this PR
adds a metric for total custom root roles in use by at least one entity:
users, service accounts, groups.
`custom_root_roles_in_use_total`
Output from `http://localhost:4242/internal-backstage/prometheus`:
```
# HELP process_cpu_user_seconds_total Total user CPU time spent in seconds.
# TYPE process_cpu_user_seconds_total counter
process_cpu_user_seconds_total 0.060755
# HELP process_cpu_system_seconds_total Total system CPU time spent in seconds.
# TYPE process_cpu_system_seconds_total counter
process_cpu_system_seconds_total 0.01666
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.077415
# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1691420275
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 199196672
# HELP nodejs_eventloop_lag_seconds Lag of event loop in seconds.
# TYPE nodejs_eventloop_lag_seconds gauge
nodejs_eventloop_lag_seconds 0
# HELP nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay.
# TYPE nodejs_eventloop_lag_min_seconds gauge
nodejs_eventloop_lag_min_seconds 0.009076736
# HELP nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay.
# TYPE nodejs_eventloop_lag_max_seconds gauge
nodejs_eventloop_lag_max_seconds 0.037683199
# HELP nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_mean_seconds gauge
nodejs_eventloop_lag_mean_seconds 0.011063251638989169
# HELP nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_stddev_seconds gauge
nodejs_eventloop_lag_stddev_seconds 0.0013618102764025837
# HELP nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p50_seconds gauge
nodejs_eventloop_lag_p50_seconds 0.011051007
# HELP nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p90_seconds gauge
nodejs_eventloop_lag_p90_seconds 0.011321343
# HELP nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p99_seconds gauge
nodejs_eventloop_lag_p99_seconds 0.013688831
# HELP nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type.
# TYPE nodejs_active_resources gauge
nodejs_active_resources{type="FSReqCallback"} 1
nodejs_active_resources{type="TTYWrap"} 3
nodejs_active_resources{type="TCPSocketWrap"} 5
nodejs_active_resources{type="TCPServerWrap"} 1
nodejs_active_resources{type="Timeout"} 1
nodejs_active_resources{type="Immediate"} 1
# HELP nodejs_active_resources_total Total number of active resources.
# TYPE nodejs_active_resources_total gauge
nodejs_active_resources_total 12
# HELP nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name.
# TYPE nodejs_active_handles gauge
nodejs_active_handles{type="WriteStream"} 2
nodejs_active_handles{type="ReadStream"} 1
nodejs_active_handles{type="Socket"} 5
nodejs_active_handles{type="Server"} 1
# HELP nodejs_active_handles_total Total number of active handles.
# TYPE nodejs_active_handles_total gauge
nodejs_active_handles_total 9
# HELP nodejs_active_requests Number of active libuv requests grouped by request type. Every request type is C++ class name.
# TYPE nodejs_active_requests gauge
nodejs_active_requests{type="FSReqCallback"} 1
# HELP nodejs_active_requests_total Total number of active requests.
# TYPE nodejs_active_requests_total gauge
nodejs_active_requests_total 1
# HELP nodejs_heap_size_total_bytes Process heap size from Node.js in bytes.
# TYPE nodejs_heap_size_total_bytes gauge
nodejs_heap_size_total_bytes 118587392
# HELP nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes.
# TYPE nodejs_heap_size_used_bytes gauge
nodejs_heap_size_used_bytes 89642552
# HELP nodejs_external_memory_bytes Node.js external memory size in bytes.
# TYPE nodejs_external_memory_bytes gauge
nodejs_external_memory_bytes 1601594
# HELP nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes.
# TYPE nodejs_heap_space_size_total_bytes gauge
nodejs_heap_space_size_total_bytes{space="read_only"} 0
nodejs_heap_space_size_total_bytes{space="old"} 70139904
nodejs_heap_space_size_total_bytes{space="code"} 3588096
nodejs_heap_space_size_total_bytes{space="map"} 2899968
nodejs_heap_space_size_total_bytes{space="large_object"} 7258112
nodejs_heap_space_size_total_bytes{space="code_large_object"} 1146880
nodejs_heap_space_size_total_bytes{space="new_large_object"} 0
nodejs_heap_space_size_total_bytes{space="new"} 33554432
# HELP nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes.
# TYPE nodejs_heap_space_size_used_bytes gauge
nodejs_heap_space_size_used_bytes{space="read_only"} 0
nodejs_heap_space_size_used_bytes{space="old"} 66992120
nodejs_heap_space_size_used_bytes{space="code"} 2892640
nodejs_heap_space_size_used_bytes{space="map"} 2519280
nodejs_heap_space_size_used_bytes{space="large_object"} 7026824
nodejs_heap_space_size_used_bytes{space="code_large_object"} 983200
nodejs_heap_space_size_used_bytes{space="new_large_object"} 0
nodejs_heap_space_size_used_bytes{space="new"} 9236136
# HELP nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes.
# TYPE nodejs_heap_space_size_available_bytes gauge
nodejs_heap_space_size_available_bytes{space="read_only"} 0
nodejs_heap_space_size_available_bytes{space="old"} 1898360
nodejs_heap_space_size_available_bytes{space="code"} 7328
nodejs_heap_space_size_available_bytes{space="map"} 327888
nodejs_heap_space_size_available_bytes{space="large_object"} 0
nodejs_heap_space_size_available_bytes{space="code_large_object"} 0
nodejs_heap_space_size_available_bytes{space="new_large_object"} 16495616
nodejs_heap_space_size_available_bytes{space="new"} 7259480
# HELP nodejs_version_info Node.js version info.
# TYPE nodejs_version_info gauge
nodejs_version_info{version="v18.16.0",major="18",minor="16",patch="0"} 1
# HELP nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb.
# TYPE nodejs_gc_duration_seconds histogram
# HELP http_request_duration_milliseconds App response time
# TYPE http_request_duration_milliseconds summary
# HELP db_query_duration_seconds DB query duration time
# TYPE db_query_duration_seconds summary
db_query_duration_seconds{quantile="0.1",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.5",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.9",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.95",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.99",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds_sum{store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds_count{store="api-tokens",action="getAllActive"} 1
# HELP feature_toggle_update_total Number of times a toggle has been updated. Environment label would be "n/a" when it is not available, e.g. when a feature toggle is created.
# TYPE feature_toggle_update_total counter
# HELP feature_toggle_usage_total Number of times a feature toggle has been used
# TYPE feature_toggle_usage_total counter
# HELP feature_toggles_total Number of feature toggles
# TYPE feature_toggles_total gauge
feature_toggles_total{version="5.3.0"} 31
# HELP users_total Number of users
# TYPE users_total gauge
users_total 1011
# HELP projects_total Number of projects
# TYPE projects_total gauge
projects_total 4
# HELP environments_total Number of environments
# TYPE environments_total gauge
environments_total 10
# HELP groups_total Number of groups
# TYPE groups_total gauge
groups_total 5
# HELP roles_total Number of roles
# TYPE roles_total gauge
roles_total 11
# HELP custom_root_roles_total Number of custom root roles
# TYPE custom_root_roles_total gauge
custom_root_roles_total 3
# HELP custom_root_roles_in_use_total Number of custom root roles in use
# TYPE custom_root_roles_in_use_total gauge
custom_root_roles_in_use_total 2
# HELP segments_total Number of segments
# TYPE segments_total gauge
segments_total 5
# HELP context_total Number of context
# TYPE context_total gauge
context_total 7
# HELP strategies_total Number of strategies
# TYPE strategies_total gauge
strategies_total 5
# HELP client_apps_total Number of registered client apps aggregated by range by last seen
# TYPE client_apps_total gauge
client_apps_total{range="allTime"} 0
client_apps_total{range="30d"} 0
client_apps_total{range="7d"} 0
# HELP saml_enabled Whether SAML is enabled
# TYPE saml_enabled gauge
saml_enabled 1
# HELP oidc_enabled Whether OIDC is enabled
# TYPE oidc_enabled gauge
oidc_enabled 0
# HELP client_sdk_versions Which sdk versions are being used
# TYPE client_sdk_versions counter
# HELP optimal_304_diffing Count the Optimal 304 diffing with status
# TYPE optimal_304_diffing counter
# HELP db_pool_min Minimum DB pool size
# TYPE db_pool_min gauge
db_pool_min 0
# HELP db_pool_max Maximum DB pool size
# TYPE db_pool_max gauge
db_pool_max 4
# HELP db_pool_free Current free connections in DB pool
# TYPE db_pool_free gauge
db_pool_free 0
# HELP db_pool_used Current connections in use in DB pool
# TYPE db_pool_used gauge
db_pool_used 4
# HELP db_pool_pending_creates how many asynchronous create calls are running in DB pool
# TYPE db_pool_pending_creates gauge
db_pool_pending_creates 0
# HELP db_pool_pending_acquires how many acquires are waiting for a resource to be released in DB pool
# TYPE db_pool_pending_acquires gauge
db_pool_pending_acquires 24
```
2023-08-08 09:14:40 +02:00
|
|
|
name: 'custom_root_roles_in_use_total',
|
|
|
|
help: 'Number of custom root roles in use',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const segmentsTotal = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'segments_total',
|
|
|
|
help: 'Number of segments',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const contextTotal = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'context_total',
|
|
|
|
help: 'Number of context',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const strategiesTotal = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'strategies_total',
|
|
|
|
help: 'Number of strategies',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const clientAppsTotal = createGauge({
|
2022-12-16 12:16:51 +01:00
|
|
|
name: 'client_apps_total',
|
|
|
|
help: 'Number of registered client apps aggregated by range by last seen',
|
|
|
|
labelNames: ['range'],
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const samlEnabled = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'saml_enabled',
|
|
|
|
help: 'Whether SAML is enabled',
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const oidcEnabled = createGauge({
|
2022-10-25 13:10:27 +02:00
|
|
|
name: 'oidc_enabled',
|
|
|
|
help: 'Whether OIDC is enabled',
|
|
|
|
});
|
2020-02-28 14:50:32 +01:00
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const clientSdkVersionUsage = createCounter({
|
2022-07-22 11:00:22 +02:00
|
|
|
name: 'client_sdk_versions',
|
|
|
|
help: 'Which sdk versions are being used',
|
|
|
|
labelNames: ['sdk_name', 'sdk_version'],
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const productionChanges30 = createGauge({
|
2023-10-10 12:32:23 +02:00
|
|
|
name: 'production_changes_30',
|
|
|
|
help: 'Changes made to production environment last 30 days',
|
|
|
|
labelNames: ['environment'],
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const productionChanges60 = createGauge({
|
2023-10-10 12:32:23 +02:00
|
|
|
name: 'production_changes_60',
|
|
|
|
help: 'Changes made to production environment last 60 days',
|
|
|
|
labelNames: ['environment'],
|
|
|
|
});
|
2024-01-19 15:51:29 +01:00
|
|
|
const productionChanges90 = createGauge({
|
2023-10-10 12:32:23 +02:00
|
|
|
name: 'production_changes_90',
|
|
|
|
help: 'Changes made to production environment last 90 days',
|
|
|
|
labelNames: ['environment'],
|
|
|
|
});
|
|
|
|
|
2024-01-19 15:51:29 +01:00
|
|
|
const rateLimits = createGauge({
|
2023-10-26 09:20:29 +02:00
|
|
|
name: 'rate_limits',
|
|
|
|
help: 'Rate limits (per minute) for METHOD/ENDPOINT pairs',
|
|
|
|
labelNames: ['endpoint', 'method'],
|
|
|
|
});
|
2024-01-31 12:30:42 +01:00
|
|
|
const featureCreatedByMigration = createCounter({
|
|
|
|
name: 'feature_created_by_migration_count',
|
|
|
|
help: 'Feature createdBy migration count',
|
|
|
|
});
|
|
|
|
const eventCreatedByMigration = createCounter({
|
|
|
|
name: 'event_created_by_migration_count',
|
|
|
|
help: 'Event createdBy migration count',
|
|
|
|
});
|
2024-02-15 14:58:48 +01:00
|
|
|
const proxyRepositoriesCreated = createCounter({
|
|
|
|
name: 'proxy_repositories_created',
|
|
|
|
help: 'Proxy repositories created',
|
|
|
|
});
|
2024-03-12 10:15:24 +01:00
|
|
|
const frontendApiRepositoriesCreated = createCounter({
|
|
|
|
name: 'frontend_api_repositories_created',
|
|
|
|
help: 'Frontend API repositories created',
|
|
|
|
});
|
2024-02-22 14:29:21 +01:00
|
|
|
const mapFeaturesForClientDuration = createHistogram({
|
|
|
|
name: 'map_features_for_client_duration',
|
|
|
|
help: 'Duration of mapFeaturesForClient function',
|
|
|
|
});
|
2023-10-26 09:20:29 +02:00
|
|
|
|
2024-05-08 10:33:51 +02:00
|
|
|
const featureLifecycleStageDuration = createHistogram({
|
|
|
|
name: 'feature_lifecycle_stage_duration',
|
2024-05-10 14:24:27 +02:00
|
|
|
labelNames: ['stage', 'project_id'],
|
2024-05-08 10:33:51 +02:00
|
|
|
help: 'Duration of feature lifecycle stages',
|
|
|
|
});
|
|
|
|
|
chore: Establish a baseline for the number of envs disabled per project (#6807)
This PR adds a counter in Prometheus for counting the number of
"environment disabled" events we get per project. The purpose of this is
to establish a baseline for one of the "project management UI" project's
key results.
## On gauges vs counters
This PR uses a counter. Using a gauge would give you the total number of
envs disabled, not the number of disable events. The difference is
subtle, but important.
For projects that were created before the new feature, the gauge might
be appropriate. Because each disabled env would require at least one
disabled event, we can get a floor of how many events were triggered for
each project.
However, for projects created after we introduce the planned change,
we're not interested in the total envs anymore, because you can disable
a hundred envs on creation with a single action. In this case, a gauge
showing 100 disabled envs would be misleading, because it didn't take
100 events to disable them.
So the interesting metric here is how many times did you specifically
disable an environment in project settings, hence the counter.
## Assumptions and future plans
To make this easier on ourselves, we make the following assumption: people
primarily disable envs **when creating a project**.
This means that there might be a few lagging indicators granting some
projects a smaller number of events than expected, but we may be able to
filter those out.
Further, if we had a metric for each project and its creation date, we
could correlate that with the metrics to answer the question "how many
envs do people disable in the first week? Two weeks? A month?". Or
worded differently: after creating a project, how long does it take for
people to configure environments?
Similarly, if we gather that data, it will also make filtering out the
number of events for projects created **after** the new changes have
been released much easier.
The good news: Because the project creation metric with dates is a
static aggregate, it can be applied at any time, even retroactively, to
see the effects.
2024-04-10 08:49:15 +02:00
|
|
|
const projectEnvironmentsDisabled = createCounter({
|
|
|
|
name: 'project_environments_disabled',
|
|
|
|
help: 'How many "environment disabled" events we have received for each project',
|
|
|
|
labelNames: ['project_id'],
|
|
|
|
});
|
|
|
|
|
2021-08-27 10:10:14 +02:00
|
|
|
async function collectStaticCounters() {
|
2021-02-17 15:24:43 +01:00
|
|
|
try {
|
2022-10-25 13:10:27 +02:00
|
|
|
const stats = await instanceStatsService.getStats();
|
|
|
|
|
|
|
|
featureTogglesTotal.reset();
|
2024-01-19 15:51:29 +01:00
|
|
|
featureTogglesTotal
|
|
|
|
.labels({ version })
|
|
|
|
.set(stats.featureToggles);
|
2021-02-17 15:24:43 +01:00
|
|
|
|
2024-03-28 12:40:30 +01:00
|
|
|
featureTogglesArchivedTotal.reset();
|
|
|
|
featureTogglesArchivedTotal.set(stats.archivedFeatureToggles);
|
|
|
|
|
2021-08-27 10:10:14 +02:00
|
|
|
usersTotal.reset();
|
2022-10-25 13:10:27 +02:00
|
|
|
usersTotal.set(stats.users);
|
|
|
|
|
2023-11-29 13:09:30 +01:00
|
|
|
serviceAccounts.reset();
|
|
|
|
serviceAccounts.set(stats.serviceAccounts);
|
|
|
|
|
2024-05-08 10:33:51 +02:00
|
|
|
stats.featureLifeCycles.forEach((stage) => {
|
|
|
|
featureLifecycleStageDuration
|
|
|
|
.labels({
|
|
|
|
stage: stage.stage,
|
2024-05-08 14:19:23 +02:00
|
|
|
project_id: stage.project,
|
2024-05-08 10:33:51 +02:00
|
|
|
})
|
|
|
|
.observe(stage.duration);
|
|
|
|
});
|
|
|
|
|
2023-11-29 13:09:30 +01:00
|
|
|
apiTokens.reset();
|
|
|
|
|
|
|
|
for (const [type, value] of stats.apiTokens) {
|
2024-01-19 15:51:29 +01:00
|
|
|
apiTokens.labels({ type }).set(value);
|
2023-11-29 13:09:30 +01:00
|
|
|
}
|
|
|
|
|
2024-01-15 15:31:38 +01:00
|
|
|
enabledMetricsBucketsPreviousDay.reset();
|
|
|
|
enabledMetricsBucketsPreviousDay.set(
|
|
|
|
stats.previousDayMetricsBucketsCount.enabledCount,
|
|
|
|
);
|
|
|
|
variantMetricsBucketsPreviousDay.reset();
|
|
|
|
variantMetricsBucketsPreviousDay.set(
|
|
|
|
stats.previousDayMetricsBucketsCount.variantCount,
|
|
|
|
);
|
|
|
|
|
2023-09-18 15:05:17 +02:00
|
|
|
usersActive7days.reset();
|
|
|
|
usersActive7days.set(stats.activeUsers.last7);
|
|
|
|
usersActive30days.reset();
|
|
|
|
usersActive30days.set(stats.activeUsers.last30);
|
|
|
|
usersActive60days.reset();
|
|
|
|
usersActive60days.set(stats.activeUsers.last60);
|
|
|
|
usersActive90days.reset();
|
|
|
|
usersActive90days.set(stats.activeUsers.last90);
|
|
|
|
|
2023-10-10 12:32:23 +02:00
|
|
|
productionChanges30.reset();
|
|
|
|
productionChanges30.set(stats.productionChanges.last30);
|
|
|
|
productionChanges60.reset();
|
|
|
|
productionChanges60.set(stats.productionChanges.last60);
|
|
|
|
productionChanges90.reset();
|
|
|
|
productionChanges90.set(stats.productionChanges.last90);
|
|
|
|
|
2021-08-27 10:10:14 +02:00
|
|
|
projectsTotal.reset();
|
2023-09-25 11:07:59 +02:00
|
|
|
stats.projects.forEach((projectStat) => {
|
|
|
|
projectsTotal
|
|
|
|
.labels({ mode: projectStat.mode })
|
|
|
|
.set(projectStat.count);
|
|
|
|
});
|
2022-10-25 13:10:27 +02:00
|
|
|
|
2022-09-06 13:24:13 +02:00
|
|
|
environmentsTotal.reset();
|
2022-10-25 13:10:27 +02:00
|
|
|
environmentsTotal.set(stats.environments);
|
|
|
|
|
|
|
|
groupsTotal.reset();
|
|
|
|
groupsTotal.set(stats.groups);
|
|
|
|
|
|
|
|
rolesTotal.reset();
|
|
|
|
rolesTotal.set(stats.roles);
|
|
|
|
|
2023-08-07 15:59:29 +02:00
|
|
|
customRootRolesTotal.reset();
|
|
|
|
customRootRolesTotal.set(stats.customRootRoles);
|
|
|
|
|
feat: add prom metric for total custom root roles in use (#4438)
https://linear.app/unleash/issue/2-1311/add-a-new-prometheus-metric-with-custom-root-roles-in-use
As a follow-up to https://github.com/Unleash/unleash/pull/4435, this PR
adds a metric for total custom root roles in use by at least one entity:
users, service accounts, groups.
`custom_root_roles_in_use_total`
Output from `http://localhost:4242/internal-backstage/prometheus`:
```
# HELP process_cpu_user_seconds_total Total user CPU time spent in seconds.
# TYPE process_cpu_user_seconds_total counter
process_cpu_user_seconds_total 0.060755
# HELP process_cpu_system_seconds_total Total system CPU time spent in seconds.
# TYPE process_cpu_system_seconds_total counter
process_cpu_system_seconds_total 0.01666
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 0.077415
# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1691420275
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 199196672
# HELP nodejs_eventloop_lag_seconds Lag of event loop in seconds.
# TYPE nodejs_eventloop_lag_seconds gauge
nodejs_eventloop_lag_seconds 0
# HELP nodejs_eventloop_lag_min_seconds The minimum recorded event loop delay.
# TYPE nodejs_eventloop_lag_min_seconds gauge
nodejs_eventloop_lag_min_seconds 0.009076736
# HELP nodejs_eventloop_lag_max_seconds The maximum recorded event loop delay.
# TYPE nodejs_eventloop_lag_max_seconds gauge
nodejs_eventloop_lag_max_seconds 0.037683199
# HELP nodejs_eventloop_lag_mean_seconds The mean of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_mean_seconds gauge
nodejs_eventloop_lag_mean_seconds 0.011063251638989169
# HELP nodejs_eventloop_lag_stddev_seconds The standard deviation of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_stddev_seconds gauge
nodejs_eventloop_lag_stddev_seconds 0.0013618102764025837
# HELP nodejs_eventloop_lag_p50_seconds The 50th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p50_seconds gauge
nodejs_eventloop_lag_p50_seconds 0.011051007
# HELP nodejs_eventloop_lag_p90_seconds The 90th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p90_seconds gauge
nodejs_eventloop_lag_p90_seconds 0.011321343
# HELP nodejs_eventloop_lag_p99_seconds The 99th percentile of the recorded event loop delays.
# TYPE nodejs_eventloop_lag_p99_seconds gauge
nodejs_eventloop_lag_p99_seconds 0.013688831
# HELP nodejs_active_resources Number of active resources that are currently keeping the event loop alive, grouped by async resource type.
# TYPE nodejs_active_resources gauge
nodejs_active_resources{type="FSReqCallback"} 1
nodejs_active_resources{type="TTYWrap"} 3
nodejs_active_resources{type="TCPSocketWrap"} 5
nodejs_active_resources{type="TCPServerWrap"} 1
nodejs_active_resources{type="Timeout"} 1
nodejs_active_resources{type="Immediate"} 1
# HELP nodejs_active_resources_total Total number of active resources.
# TYPE nodejs_active_resources_total gauge
nodejs_active_resources_total 12
# HELP nodejs_active_handles Number of active libuv handles grouped by handle type. Every handle type is C++ class name.
# TYPE nodejs_active_handles gauge
nodejs_active_handles{type="WriteStream"} 2
nodejs_active_handles{type="ReadStream"} 1
nodejs_active_handles{type="Socket"} 5
nodejs_active_handles{type="Server"} 1
# HELP nodejs_active_handles_total Total number of active handles.
# TYPE nodejs_active_handles_total gauge
nodejs_active_handles_total 9
# HELP nodejs_active_requests Number of active libuv requests grouped by request type. Every request type is C++ class name.
# TYPE nodejs_active_requests gauge
nodejs_active_requests{type="FSReqCallback"} 1
# HELP nodejs_active_requests_total Total number of active requests.
# TYPE nodejs_active_requests_total gauge
nodejs_active_requests_total 1
# HELP nodejs_heap_size_total_bytes Process heap size from Node.js in bytes.
# TYPE nodejs_heap_size_total_bytes gauge
nodejs_heap_size_total_bytes 118587392
# HELP nodejs_heap_size_used_bytes Process heap size used from Node.js in bytes.
# TYPE nodejs_heap_size_used_bytes gauge
nodejs_heap_size_used_bytes 89642552
# HELP nodejs_external_memory_bytes Node.js external memory size in bytes.
# TYPE nodejs_external_memory_bytes gauge
nodejs_external_memory_bytes 1601594
# HELP nodejs_heap_space_size_total_bytes Process heap space size total from Node.js in bytes.
# TYPE nodejs_heap_space_size_total_bytes gauge
nodejs_heap_space_size_total_bytes{space="read_only"} 0
nodejs_heap_space_size_total_bytes{space="old"} 70139904
nodejs_heap_space_size_total_bytes{space="code"} 3588096
nodejs_heap_space_size_total_bytes{space="map"} 2899968
nodejs_heap_space_size_total_bytes{space="large_object"} 7258112
nodejs_heap_space_size_total_bytes{space="code_large_object"} 1146880
nodejs_heap_space_size_total_bytes{space="new_large_object"} 0
nodejs_heap_space_size_total_bytes{space="new"} 33554432
# HELP nodejs_heap_space_size_used_bytes Process heap space size used from Node.js in bytes.
# TYPE nodejs_heap_space_size_used_bytes gauge
nodejs_heap_space_size_used_bytes{space="read_only"} 0
nodejs_heap_space_size_used_bytes{space="old"} 66992120
nodejs_heap_space_size_used_bytes{space="code"} 2892640
nodejs_heap_space_size_used_bytes{space="map"} 2519280
nodejs_heap_space_size_used_bytes{space="large_object"} 7026824
nodejs_heap_space_size_used_bytes{space="code_large_object"} 983200
nodejs_heap_space_size_used_bytes{space="new_large_object"} 0
nodejs_heap_space_size_used_bytes{space="new"} 9236136
# HELP nodejs_heap_space_size_available_bytes Process heap space size available from Node.js in bytes.
# TYPE nodejs_heap_space_size_available_bytes gauge
nodejs_heap_space_size_available_bytes{space="read_only"} 0
nodejs_heap_space_size_available_bytes{space="old"} 1898360
nodejs_heap_space_size_available_bytes{space="code"} 7328
nodejs_heap_space_size_available_bytes{space="map"} 327888
nodejs_heap_space_size_available_bytes{space="large_object"} 0
nodejs_heap_space_size_available_bytes{space="code_large_object"} 0
nodejs_heap_space_size_available_bytes{space="new_large_object"} 16495616
nodejs_heap_space_size_available_bytes{space="new"} 7259480
# HELP nodejs_version_info Node.js version info.
# TYPE nodejs_version_info gauge
nodejs_version_info{version="v18.16.0",major="18",minor="16",patch="0"} 1
# HELP nodejs_gc_duration_seconds Garbage collection duration by kind, one of major, minor, incremental or weakcb.
# TYPE nodejs_gc_duration_seconds histogram
# HELP http_request_duration_milliseconds App response time
# TYPE http_request_duration_milliseconds summary
# HELP db_query_duration_seconds DB query duration time
# TYPE db_query_duration_seconds summary
db_query_duration_seconds{quantile="0.1",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.5",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.9",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.95",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds{quantile="0.99",store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds_sum{store="api-tokens",action="getAllActive"} 0.03091475
db_query_duration_seconds_count{store="api-tokens",action="getAllActive"} 1
# HELP feature_toggle_update_total Number of times a toggle has been updated. Environment label would be "n/a" when it is not available, e.g. when a feature toggle is created.
# TYPE feature_toggle_update_total counter
# HELP feature_toggle_usage_total Number of times a feature toggle has been used
# TYPE feature_toggle_usage_total counter
# HELP feature_toggles_total Number of feature toggles
# TYPE feature_toggles_total gauge
feature_toggles_total{version="5.3.0"} 31
# HELP users_total Number of users
# TYPE users_total gauge
users_total 1011
# HELP projects_total Number of projects
# TYPE projects_total gauge
projects_total 4
# HELP environments_total Number of environments
# TYPE environments_total gauge
environments_total 10
# HELP groups_total Number of groups
# TYPE groups_total gauge
groups_total 5
# HELP roles_total Number of roles
# TYPE roles_total gauge
roles_total 11
# HELP custom_root_roles_total Number of custom root roles
# TYPE custom_root_roles_total gauge
custom_root_roles_total 3
# HELP custom_root_roles_in_use_total Number of custom root roles in use
# TYPE custom_root_roles_in_use_total gauge
custom_root_roles_in_use_total 2
# HELP segments_total Number of segments
# TYPE segments_total gauge
segments_total 5
# HELP context_total Number of context
# TYPE context_total gauge
context_total 7
# HELP strategies_total Number of strategies
# TYPE strategies_total gauge
strategies_total 5
# HELP client_apps_total Number of registered client apps aggregated by range by last seen
# TYPE client_apps_total gauge
client_apps_total{range="allTime"} 0
client_apps_total{range="30d"} 0
client_apps_total{range="7d"} 0
# HELP saml_enabled Whether SAML is enabled
# TYPE saml_enabled gauge
saml_enabled 1
# HELP oidc_enabled Whether OIDC is enabled
# TYPE oidc_enabled gauge
oidc_enabled 0
# HELP client_sdk_versions Which sdk versions are being used
# TYPE client_sdk_versions counter
# HELP optimal_304_diffing Count the Optimal 304 diffing with status
# TYPE optimal_304_diffing counter
# HELP db_pool_min Minimum DB pool size
# TYPE db_pool_min gauge
db_pool_min 0
# HELP db_pool_max Maximum DB pool size
# TYPE db_pool_max gauge
db_pool_max 4
# HELP db_pool_free Current free connections in DB pool
# TYPE db_pool_free gauge
db_pool_free 0
# HELP db_pool_used Current connections in use in DB pool
# TYPE db_pool_used gauge
db_pool_used 4
# HELP db_pool_pending_creates how many asynchronous create calls are running in DB pool
# TYPE db_pool_pending_creates gauge
db_pool_pending_creates 0
# HELP db_pool_pending_acquires how many acquires are waiting for a resource to be released in DB pool
# TYPE db_pool_pending_acquires gauge
db_pool_pending_acquires 24
```
2023-08-08 09:14:40 +02:00
|
|
|
customRootRolesInUseTotal.reset();
|
|
|
|
customRootRolesInUseTotal.set(stats.customRootRolesInUse);
|
|
|
|
|
2022-10-25 13:10:27 +02:00
|
|
|
segmentsTotal.reset();
|
|
|
|
segmentsTotal.set(stats.segments);
|
|
|
|
|
|
|
|
contextTotal.reset();
|
|
|
|
contextTotal.set(stats.contextFields);
|
|
|
|
|
|
|
|
strategiesTotal.reset();
|
|
|
|
strategiesTotal.set(stats.strategies);
|
|
|
|
|
|
|
|
samlEnabled.reset();
|
|
|
|
samlEnabled.set(stats.SAMLenabled ? 1 : 0);
|
|
|
|
|
|
|
|
oidcEnabled.reset();
|
|
|
|
oidcEnabled.set(stats.OIDCenabled ? 1 : 0);
|
2022-12-16 12:16:51 +01:00
|
|
|
|
|
|
|
clientAppsTotal.reset();
|
2024-02-08 17:15:42 +01:00
|
|
|
stats.clientApps.forEach(({ range, count }) =>
|
|
|
|
clientAppsTotal.labels({ range }).set(count),
|
2022-12-16 12:16:51 +01:00
|
|
|
);
|
2023-10-26 09:20:29 +02:00
|
|
|
|
|
|
|
rateLimits.reset();
|
|
|
|
rateLimits
|
2024-05-08 10:33:51 +02:00
|
|
|
.labels({
|
|
|
|
endpoint: '/api/client/metrics',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
2023-10-26 09:20:29 +02:00
|
|
|
.set(config.metricsRateLimiting.clientMetricsMaxPerMinute);
|
|
|
|
rateLimits
|
|
|
|
.labels({
|
|
|
|
endpoint: '/api/client/register',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(config.metricsRateLimiting.clientRegisterMaxPerMinute);
|
|
|
|
rateLimits
|
|
|
|
.labels({
|
|
|
|
endpoint: '/api/frontend/metrics',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(
|
|
|
|
config.metricsRateLimiting.frontendMetricsMaxPerMinute,
|
|
|
|
);
|
|
|
|
rateLimits
|
|
|
|
.labels({
|
|
|
|
endpoint: '/api/frontend/register',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(
|
|
|
|
config.metricsRateLimiting.frontendRegisterMaxPerMinute,
|
|
|
|
);
|
|
|
|
rateLimits
|
|
|
|
.labels({
|
|
|
|
endpoint: '/api/admin/user-admin',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(config.rateLimiting.createUserMaxPerMinute);
|
|
|
|
rateLimits
|
2024-05-08 10:33:51 +02:00
|
|
|
.labels({
|
|
|
|
endpoint: '/auth/simple',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
2023-10-26 09:20:29 +02:00
|
|
|
.set(config.rateLimiting.simpleLoginMaxPerMinute);
|
2024-02-21 08:49:54 +01:00
|
|
|
rateLimits
|
|
|
|
.labels({
|
|
|
|
endpoint: '/auth/reset/password-email',
|
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(config.rateLimiting.passwordResetMaxPerMinute);
|
2024-02-15 18:05:52 +01:00
|
|
|
rateLimits
|
|
|
|
.labels({
|
2024-03-04 13:08:05 +01:00
|
|
|
endpoint: '/api/signal-endpoint/:name',
|
2024-02-15 18:05:52 +01:00
|
|
|
method: 'POST',
|
|
|
|
})
|
|
|
|
.set(
|
2024-03-04 13:08:05 +01:00
|
|
|
config.rateLimiting.callSignalEndpointMaxPerSecond * 60,
|
2024-02-15 18:05:52 +01:00
|
|
|
);
|
2022-10-25 13:10:27 +02:00
|
|
|
} catch (e) {}
|
2020-12-16 14:49:11 +01:00
|
|
|
}
|
2024-02-08 17:15:42 +01:00
|
|
|
await schedulerService.schedule(
|
|
|
|
collectStaticCounters.bind(this),
|
|
|
|
hoursToMilliseconds(2),
|
|
|
|
'collectStaticCounters',
|
|
|
|
0, // no jitter
|
|
|
|
);
|
2018-05-23 11:24:24 +02:00
|
|
|
|
2020-12-16 14:49:11 +01:00
|
|
|
eventBus.on(
|
|
|
|
events.REQUEST_TIME,
|
2022-09-30 15:28:50 +02:00
|
|
|
({ path, method, time, statusCode, appName }) => {
|
|
|
|
requestDuration
|
2024-05-08 10:33:51 +02:00
|
|
|
.labels({
|
|
|
|
path,
|
|
|
|
method,
|
|
|
|
status: statusCode,
|
|
|
|
appName,
|
|
|
|
})
|
2022-09-30 15:28:50 +02:00
|
|
|
.observe(time);
|
2020-12-16 14:49:11 +01:00
|
|
|
},
|
|
|
|
);
|
2019-08-04 11:10:51 +02:00
|
|
|
|
2023-11-21 13:42:38 +01:00
|
|
|
eventBus.on(events.SCHEDULER_JOB_TIME, ({ jobId, time }) => {
|
|
|
|
schedulerDuration.labels(jobId).observe(time);
|
|
|
|
});
|
|
|
|
|
2024-03-12 13:27:04 +01:00
|
|
|
eventBus.on(
|
|
|
|
events.FUNCTION_TIME,
|
|
|
|
({ functionName, className, time }) => {
|
|
|
|
functionDuration
|
2024-05-08 10:33:51 +02:00
|
|
|
.labels({
|
|
|
|
functionName,
|
|
|
|
className,
|
|
|
|
})
|
2024-03-12 13:27:04 +01:00
|
|
|
.observe(time);
|
|
|
|
},
|
|
|
|
);
|
2024-03-12 11:30:30 +01:00
|
|
|
|
2024-01-31 12:30:42 +01:00
|
|
|
eventBus.on(events.EVENTS_CREATED_BY_PROCESSED, ({ updated }) => {
|
|
|
|
eventCreatedByMigration.inc(updated);
|
|
|
|
});
|
|
|
|
|
|
|
|
eventBus.on(events.FEATURES_CREATED_BY_PROCESSED, ({ updated }) => {
|
|
|
|
featureCreatedByMigration.inc(updated);
|
|
|
|
});
|
|
|
|
|
2020-12-16 14:49:11 +01:00
|
|
|
eventBus.on(events.DB_TIME, ({ store, action, time }) => {
|
2024-05-08 10:33:51 +02:00
|
|
|
dbDuration
|
|
|
|
.labels({
|
|
|
|
store,
|
|
|
|
action,
|
|
|
|
})
|
|
|
|
.observe(time);
|
2020-12-16 14:49:11 +01:00
|
|
|
});
|
2018-11-28 15:50:49 +01:00
|
|
|
|
2024-02-15 14:58:48 +01:00
|
|
|
eventBus.on(events.PROXY_REPOSITORY_CREATED, () => {
|
|
|
|
proxyRepositoriesCreated.inc();
|
|
|
|
});
|
|
|
|
|
2024-03-12 10:15:24 +01:00
|
|
|
eventBus.on(events.FRONTEND_API_REPOSITORY_CREATED, () => {
|
|
|
|
frontendApiRepositoriesCreated.inc();
|
|
|
|
});
|
|
|
|
|
2024-02-22 14:29:21 +01:00
|
|
|
eventBus.on(events.PROXY_FEATURES_FOR_TOKEN_TIME, ({ duration }) => {
|
|
|
|
mapFeaturesForClientDuration.observe(duration);
|
|
|
|
});
|
|
|
|
|
2022-09-08 11:01:27 +02:00
|
|
|
eventStore.on(FEATURE_CREATED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'n/a',
|
|
|
|
environmentType: 'n/a',
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
});
|
|
|
|
eventStore.on(FEATURE_VARIANTS_UPDATED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'n/a',
|
|
|
|
environmentType: 'n/a',
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
});
|
|
|
|
eventStore.on(FEATURE_METADATA_UPDATED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'n/a',
|
|
|
|
environmentType: 'n/a',
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
});
|
|
|
|
eventStore.on(FEATURE_UPDATED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'default',
|
|
|
|
environmentType: 'production',
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
});
|
|
|
|
eventStore.on(
|
|
|
|
FEATURE_STRATEGY_ADD,
|
2024-01-09 16:33:00 +01:00
|
|
|
async ({ featureName, project, environment }) => {
|
|
|
|
const environmentType = await this.resolveEnvironmentType(
|
|
|
|
environment,
|
|
|
|
cachedEnvironments,
|
|
|
|
);
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment,
|
|
|
|
environmentType,
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
},
|
|
|
|
);
|
|
|
|
eventStore.on(
|
|
|
|
FEATURE_STRATEGY_REMOVE,
|
2024-01-09 16:33:00 +01:00
|
|
|
async ({ featureName, project, environment }) => {
|
|
|
|
const environmentType = await this.resolveEnvironmentType(
|
|
|
|
environment,
|
|
|
|
cachedEnvironments,
|
|
|
|
);
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment,
|
|
|
|
environmentType,
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
},
|
|
|
|
);
|
|
|
|
eventStore.on(
|
|
|
|
FEATURE_STRATEGY_UPDATE,
|
2024-01-09 16:33:00 +01:00
|
|
|
async ({ featureName, project, environment }) => {
|
|
|
|
const environmentType = await this.resolveEnvironmentType(
|
|
|
|
environment,
|
|
|
|
cachedEnvironments,
|
|
|
|
);
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment,
|
|
|
|
environmentType,
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
},
|
|
|
|
);
|
|
|
|
eventStore.on(
|
|
|
|
FEATURE_ENVIRONMENT_DISABLED,
|
2024-01-09 16:33:00 +01:00
|
|
|
async ({ featureName, project, environment }) => {
|
|
|
|
const environmentType = await this.resolveEnvironmentType(
|
|
|
|
environment,
|
|
|
|
cachedEnvironments,
|
|
|
|
);
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment,
|
|
|
|
environmentType,
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
},
|
|
|
|
);
|
|
|
|
eventStore.on(
|
|
|
|
FEATURE_ENVIRONMENT_ENABLED,
|
2024-01-09 16:33:00 +01:00
|
|
|
async ({ featureName, project, environment }) => {
|
|
|
|
const environmentType = await this.resolveEnvironmentType(
|
|
|
|
environment,
|
|
|
|
cachedEnvironments,
|
|
|
|
);
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment,
|
|
|
|
environmentType,
|
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
},
|
|
|
|
);
|
|
|
|
eventStore.on(FEATURE_ARCHIVED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'n/a',
|
|
|
|
environmentType: 'n/a',
|
|
|
|
});
|
2021-11-12 13:15:51 +01:00
|
|
|
});
|
2022-09-08 11:01:27 +02:00
|
|
|
eventStore.on(FEATURE_REVIVED, ({ featureName, project }) => {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUpdateTotal.increment({
|
|
|
|
toggle: featureName,
|
|
|
|
project,
|
|
|
|
environment: 'n/a',
|
|
|
|
environmentType: 'n/a',
|
|
|
|
});
|
2020-12-16 14:49:11 +01:00
|
|
|
});
|
|
|
|
|
2023-02-15 09:13:32 +01:00
|
|
|
eventBus.on(CLIENT_METRICS, (m: ValidatedClientMetrics) => {
|
2021-04-22 10:07:10 +02:00
|
|
|
for (const entry of Object.entries(m.bucket.toggles)) {
|
2024-01-19 15:51:29 +01:00
|
|
|
featureToggleUsageTotal.increment(
|
|
|
|
{
|
|
|
|
toggle: entry[0],
|
|
|
|
active: 'true',
|
|
|
|
appName: m.appName,
|
|
|
|
},
|
|
|
|
entry[1].yes,
|
|
|
|
);
|
|
|
|
featureToggleUsageTotal.increment(
|
|
|
|
{
|
|
|
|
toggle: entry[0],
|
|
|
|
active: 'false',
|
|
|
|
appName: m.appName,
|
|
|
|
},
|
|
|
|
entry[1].no,
|
|
|
|
);
|
2020-12-16 14:49:11 +01:00
|
|
|
}
|
|
|
|
});
|
2022-09-27 11:06:06 +02:00
|
|
|
eventStore.on(CLIENT_REGISTER, (m) => {
|
2022-07-22 11:00:22 +02:00
|
|
|
if (m.sdkVersion && m.sdkVersion.indexOf(':') > -1) {
|
|
|
|
const [sdkName, sdkVersion] = m.sdkVersion.split(':');
|
2024-01-19 15:51:29 +01:00
|
|
|
clientSdkVersionUsage.increment({
|
|
|
|
sdk_name: sdkName,
|
|
|
|
sdk_version: sdkVersion,
|
|
|
|
});
|
2022-07-22 11:00:22 +02:00
|
|
|
}
|
|
|
|
});
|
chore: Establish a baseline for the number of envs disabled per project (#6807)
This PR adds a counter in Prometheus for counting the number of
"environment disabled" events we get per project. The purpose of this is
to establish a baseline for one of the "project management UI" project's
key results.
## On gauges vs counters
This PR uses a counter. Using a gauge would give you the total number of
envs disabled, not the number of disable events. The difference is
subtle, but important.
For projects that were created before the new feature, the gauge might
be appropriate. Because each disabled env would require at least one
disabled event, we can get a floor of how many events were triggered for
each project.
However, for projects created after we introduce the planned change,
we're not interested in the total envs anymore, because you can disable
a hundred envs on creation with a single action. In this case, a gauge
showing 100 disabled envs would be misleading, because it didn't take
100 events to disable them.
So the interesting metric here is how many times did you specifically
disable an environment in project settings, hence the counter.
## Assumptions and future plans
To make this easier on ourselves, we make the following assumption: people
primarily disable envs **when creating a project**.
This means that there might be a few lagging indicators granting some
projects a smaller number of events than expected, but we may be able to
filter those out.
Further, if we had a metric for each project and its creation date, we
could correlate that with the metrics to answer the question "how many
envs do people disable in the first week? Two weeks? A month?". Or
worded differently: after creating a project, how long does it take for
people to configure environments?
Similarly, if we gather that data, it will also make filtering out the
number of events for projects created **after** the new changes have
been released much easier.
The good news: Because the project creation metric with dates is a
static aggregate, it can be applied at any time, even retroactively, to
see the effects.
2024-04-10 08:49:15 +02:00
|
|
|
eventStore.on(PROJECT_ENVIRONMENT_REMOVED, ({ project }) => {
|
|
|
|
projectEnvironmentsDisabled.increment({ project_id: project });
|
|
|
|
});
|
2021-02-04 14:14:46 +01:00
|
|
|
|
2024-05-13 14:41:28 +02:00
|
|
|
await this.configureDbMetrics(
|
|
|
|
db,
|
|
|
|
eventBus,
|
|
|
|
schedulerService,
|
|
|
|
stores.settingStore,
|
|
|
|
);
|
2023-11-29 13:09:30 +01:00
|
|
|
|
|
|
|
return Promise.resolve();
|
2020-12-16 14:49:11 +01:00
|
|
|
}
|
|
|
|
|
2024-02-08 17:15:42 +01:00
|
|
|
/**
 * Registers the DB connection-pool gauges and schedules periodic pool sampling.
 *
 * Does nothing when `db` has no `client`. Otherwise it:
 *  - creates `db_pool_min` / `db_pool_max` and sets them once from
 *    `db.client.pool.min` / `db.client.pool.max`;
 *  - creates the free / used / pending-creates / pending-acquires gauges and
 *    updates them from every `DB_POOL_UPDATE` event received on `eventBus`;
 *  - schedules `registerPoolMetrics` (interval `minutesToMilliseconds(1)`,
 *    no jitter), which emits that event;
 *  - creates the `postgres_version` gauge and sets it to 1 with the version
 *    returned by `settingStore.postgresVersion()` as the `version` label.
 *
 * @param db Knex instance whose `client.pool` is inspected.
 * @param eventBus Emitter on which `DB_POOL_UPDATE` is both emitted and consumed.
 * @param schedulerService Scheduler that runs the periodic sampling job.
 * @param settingStore Store queried for the Postgres version.
 */
async configureDbMetrics(
    db: Knex,
    eventBus: EventEmitter,
    schedulerService: SchedulerService,
    settingStore: ISettingStore,
): Promise<void> {
    if (!db?.client) {
        return;
    }

    const dbPoolMin = createGauge({
        name: 'db_pool_min',
        help: 'Minimum DB pool size',
    });
    dbPoolMin.set(db.client.pool.min);

    const dbPoolMax = createGauge({
        name: 'db_pool_max',
        help: 'Maximum DB pool size',
    });
    dbPoolMax.set(db.client.pool.max);

    const dbPoolFree = createGauge({
        name: 'db_pool_free',
        help: 'Current free connections in DB pool',
    });
    const dbPoolUsed = createGauge({
        name: 'db_pool_used',
        help: 'Current connections in use in DB pool',
    });
    const dbPoolPendingCreates = createGauge({
        name: 'db_pool_pending_creates',
        help: 'how many asynchronous create calls are running in DB pool',
    });
    const dbPoolPendingAcquires = createGauge({
        name: 'db_pool_pending_acquires',
        help: 'how many acquires are waiting for a resource to be released in DB pool',
    });

    // The gauges are driven by events so the sampling job and the gauge
    // updates stay decoupled.
    eventBus.on(DB_POOL_UPDATE, (data) => {
        dbPoolFree.set(data.free);
        dbPoolUsed.set(data.used);
        dbPoolPendingCreates.set(data.pendingCreates);
        dbPoolPendingAcquires.set(data.pendingAcquires);
    });

    await schedulerService.schedule(
        // Call the sampler on every tick. The previous callback returned
        // `this.registerPoolMetrics.bind(...)` without invoking it, so this
        // job never emitted DB_POOL_UPDATE.
        async () => this.registerPoolMetrics(db.client.pool, eventBus),
        minutesToMilliseconds(1),
        'registerPoolMetrics',
        0, // no jitter
    );

    const postgresVersion = await settingStore.postgresVersion();
    const postgresVersionGauge = createGauge({
        name: 'postgres_version',
        help: 'Which version of postgres is running (SHOW server_version)',
        labelNames: ['version'],
    });
    postgresVersionGauge.labels({ version: postgresVersion }).set(1);
}
|
|
|
|
|
2021-04-22 10:07:10 +02:00
|
|
|
/**
 * Emits a `DB_POOL_UPDATE` event on `eventBus` carrying the pool's current
 * used, free, pending-create and pending-acquire counts.
 *
 * Best effort: anything thrown while reading the pool or emitting the event
 * is ignored.
 *
 * @param pool Object exposing `numUsed`, `numFree`, `numPendingCreates` and
 *   `numPendingAcquires`.
 * @param eventBus Emitter the event is published on.
 */
// eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types
registerPoolMetrics(pool: any, eventBus: EventEmitter) {
    try {
        const used = pool.numUsed();
        const free = pool.numFree();
        const pendingCreates = pool.numPendingCreates();
        const pendingAcquires = pool.numPendingAcquires();
        eventBus.emit(DB_POOL_UPDATE, {
            used,
            free,
            pendingCreates,
            pendingAcquires,
        });
        // eslint-disable-next-line no-empty
    } catch (e) {}
}
|
2024-01-09 16:33:00 +01:00
|
|
|
|
|
|
|
/**
 * Looks up the type of the environment called `environment`.
 *
 * @param environment Name compared against each environment's `name`.
 * @param cachedEnvironments Async supplier of the environment list.
 * @returns The matching environment's `type`, or `'unknown'` when no
 *   environment has that name.
 */
async resolveEnvironmentType(
    environment: string,
    cachedEnvironments: () => Promise<IEnvironment[]>,
): Promise<string> {
    const allEnvironments = await cachedEnvironments();
    const match = allEnvironments.find(({ name }) => name === environment);

    return match ? match.type : 'unknown';
}
|
2020-12-16 14:49:11 +01:00
|
|
|
}
|
2024-05-08 10:33:51 +02:00
|
|
|
|
2021-04-22 10:07:10 +02:00
|
|
|
/**
 * Factory that builds a new `MetricsMonitor`.
 *
 * @returns A freshly constructed monitor instance.
 */
export function createMetricsMonitor(): MetricsMonitor {
    const monitor = new MetricsMonitor();
    return monitor;
}
|
2020-12-16 14:49:11 +01:00
|
|
|
|
|
|
|
// CommonJS-style export exposing the factory under `createMetricsMonitor`.
module.exports = { createMetricsMonitor };
|