From e7de20fc99c287033fd7fb5ea1eaf5ee520a15b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gast=C3=B3n=20Fournier?= Date: Thu, 17 Oct 2024 23:05:02 +0200 Subject: [PATCH] Instance stats should not rely on prometheus cause it can be disabled --- .../instance-stats-service.test.ts | 10 +- .../instance-stats/instance-stats-service.ts | 127 ++++++---- src/lib/metrics.test.ts | 32 ++- src/lib/metrics.ts | 226 +++++++++--------- .../e2e/api/admin/instance-admin.e2e.test.ts | 6 +- 5 files changed, 232 insertions(+), 169 deletions(-) diff --git a/src/lib/features/instance-stats/instance-stats-service.test.ts b/src/lib/features/instance-stats/instance-stats-service.test.ts index 24fb34a0f9..26a5f70095 100644 --- a/src/lib/features/instance-stats/instance-stats-service.test.ts +++ b/src/lib/features/instance-stats/instance-stats-service.test.ts @@ -10,8 +10,10 @@ import type { IClientInstanceStore } from '../../types'; let instanceStatsService: InstanceStatsService; let versionService: VersionService; let clientInstanceStore: IClientInstanceStore; - +let updateMetrics: () => Promise; beforeEach(() => { + jest.clearAllMocks(); + register.clear(); const config = createTestConfig(); @@ -31,13 +33,14 @@ beforeEach(() => { createFakeGetProductionChanges(), ); - registerPrometheusMetrics( + const { collectDbMetrics } = registerPrometheusMetrics( config, stores, undefined as unknown as string, config.eventBus, instanceStatsService, ); + updateMetrics = collectDbMetrics; jest.spyOn(clientInstanceStore, 'getDistinctApplicationsCount'); jest.spyOn(instanceStatsService, 'getStats'); @@ -46,13 +49,12 @@ beforeEach(() => { }); test('get snapshot should not call getStats', async () => { - await instanceStatsService.dbMetrics.refreshDbMetrics(); + await updateMetrics(); expect( clientInstanceStore.getDistinctApplicationsCount, ).toHaveBeenCalledTimes(3); expect(instanceStatsService.getStats).toHaveBeenCalledTimes(0); - // subsequent calls to getStatsSnapshot don't call getStats for (let i = 0; i < 3; i++) { const { clientApps } = await instanceStatsService.getStats(); expect(clientApps).toStrictEqual([ diff --git a/src/lib/features/instance-stats/instance-stats-service.ts b/src/lib/features/instance-stats/instance-stats-service.ts index 1f2e20fa21..24e04ef3df 100644 --- a/src/lib/features/instance-stats/instance-stats-service.ts +++ b/src/lib/features/instance-stats/instance-stats-service.ts @@ -29,7 +29,6 @@ import { CUSTOM_ROOT_ROLE_TYPE } from '../../util'; import type { GetActiveUsers } from './getActiveUsers'; import type { ProjectModeCount } from '../project/project-store'; import type { GetProductionChanges } from './getProductionChanges'; -import { DbMetricsMonitor } from '../../metrics-gauge'; export type TimeRange = 'allTime' | '30d' | '7d'; @@ -110,14 +109,12 @@ export class InstanceStatsService { private appCount?: Partial<{ [key in TimeRange]: number }>; - private getActiveUsers: GetActiveUsers; + getActiveUsers: GetActiveUsers; - private getProductionChanges: GetProductionChanges; + getProductionChanges: GetProductionChanges; private featureStrategiesReadModel: IFeatureStrategiesReadModel; - dbMetrics: DbMetricsMonitor; - constructor( { featureToggleStore, @@ -181,20 +178,18 @@ export class InstanceStatsService { this.clientMetricsStore = clientMetricsStoreV2; this.flagResolver = flagResolver; this.featureStrategiesReadModel = featureStrategiesReadModel; - this.dbMetrics = new DbMetricsMonitor({ getLogger }); - } - - async fromPrometheus( - name: string, - labels?: Record, - ): Promise { - return (await this.dbMetrics.findValue(name, labels)) ?? 0; } getProjectModeCount(): Promise { return this.projectStore.getProjectModeCounts(); } + getToggleCount(): Promise { + return this.featureToggleStore.count({ + archived: false, + }); + } + getArchivedToggleCount(): Promise { return this.featureToggleStore.count({ archived: true, @@ -217,9 +212,6 @@ export class InstanceStatsService { return settings?.enabled || false; } - /** - * use getStatsSnapshot for low latency, sacrificing data-freshness - */ async getStats(): Promise { const versionInfo = await this.versionService.getVersionInfo(); const [ @@ -249,24 +241,24 @@ export class InstanceStatsService { maxConstraintValues, maxConstraints, ] = await Promise.all([ - this.fromPrometheus('feature_toggles_total'), + this.getToggleCount(), this.getArchivedToggleCount(), - this.userStore.count(), - this.userStore.countServiceAccounts(), - this.apiTokenStore.countByType(), + this.getRegisteredUsers(), + this.countServiceAccounts(), + this.countApiTokensByType(), this.getActiveUsers(), this.getProjectModeCount(), - this.contextFieldStore.count(), - this.groupStore.count(), - this.roleStore.count(), - this.roleStore.filteredCount({ type: CUSTOM_ROOT_ROLE_TYPE }), - this.roleStore.filteredCountInUse({ type: CUSTOM_ROOT_ROLE_TYPE }), - this.environmentStore.count(), - this.segmentStore.count(), - this.strategyStore.count(), + this.contextFieldCount(), + this.groupCount(), + this.roleCount(), + this.customRolesCount(), + this.customRolesCountInUse(), + this.environmentCount(), + this.segmentCount(), + this.strategiesCount(), this.hasSAML(), this.hasOIDC(), - this.clientAppCounts(), + this.appCount ? this.appCount : this.getLabeledAppCounts(), this.eventStore.deprecatedFilteredCount({ type: FEATURES_EXPORTED, }), @@ -274,7 +266,7 @@ export class InstanceStatsService { type: FEATURES_IMPORTED, }), this.getProductionChanges(), - this.clientMetricsStore.countPreviousDayHourlyMetricsBuckets(), + this.countPreviousDayHourlyMetricsBuckets(), this.featureStrategiesReadModel.getMaxFeatureEnvironmentStrategies(), this.featureStrategiesReadModel.getMaxConstraintValues(), this.featureStrategiesReadModel.getMaxConstraintsPerStrategy(), @@ -315,17 +307,72 @@ export class InstanceStatsService { maxConstraints: maxConstraints?.count ?? 0, }; } - async clientAppCounts(): Promise> { + + groupCount(): Promise { + return this.groupStore.count(); + } + + roleCount(): Promise { + return this.roleStore.count(); + } + + customRolesCount(): Promise { + return this.roleStore.filteredCount({ type: CUSTOM_ROOT_ROLE_TYPE }); + } + + customRolesCountInUse(): Promise { + return this.roleStore.filteredCountInUse({ + type: CUSTOM_ROOT_ROLE_TYPE, + }); + } + + segmentCount(): Promise { + return this.segmentStore.count(); + } + + contextFieldCount(): Promise { + return this.contextFieldStore.count(); + } + + strategiesCount(): Promise { + return this.strategyStore.count(); + } + + environmentCount(): Promise { + return this.environmentStore.count(); + } + + countPreviousDayHourlyMetricsBuckets(): Promise<{ + enabledCount: number; + variantCount: number; + }> { + return this.clientMetricsStore.countPreviousDayHourlyMetricsBuckets(); + } + + countApiTokensByType(): Promise> { + return this.apiTokenStore.countByType(); + } + + getRegisteredUsers(): Promise { + return this.userStore.count(); + } + + countServiceAccounts(): Promise { + return this.userStore.countServiceAccounts(); + } + + async getLabeledAppCounts(): Promise< + Partial<{ [key in TimeRange]: number }> + > { + const [t7d, t30d, allTime] = await Promise.all([ + this.clientInstanceStore.getDistinctApplicationsCount(7), + this.clientInstanceStore.getDistinctApplicationsCount(30), + this.clientInstanceStore.getDistinctApplicationsCount(), + ]); this.appCount = { - '7d': await this.fromPrometheus('client_apps_total', { - range: '7d', - }), - '30d': await this.fromPrometheus('client_apps_total', { - range: '30d', - }), - allTime: await this.fromPrometheus('client_apps_total', { - range: 'allTime', - }), + '7d': t7d, + '30d': t30d, + allTime, }; return this.appCount; } diff --git a/src/lib/metrics.test.ts b/src/lib/metrics.test.ts index afff08c245..8cfe78aa5d 100644 --- a/src/lib/metrics.test.ts +++ b/src/lib/metrics.test.ts @@ -15,7 +15,11 @@ import { FEATURE_UPDATED, PROJECT_ENVIRONMENT_REMOVED, } from './types/events'; -import { createMetricsMonitor } from './metrics'; +import { + createMetricsMonitor, + registerPrometheusMetrics, + registerPrometheusPostgresMetrics, +} from './metrics'; import createStores from '../test/fixtures/store'; import { InstanceStatsService } from './features/instance-stats/instance-stats-service'; import VersionService from './services/version-service'; @@ -46,6 +50,7 @@ let schedulerService: SchedulerService; let featureLifeCycleStore: IFeatureLifecycleStore; let featureLifeCycleReadModel: IFeatureLifecycleReadModel; let db: ITestDb; +let refreshDbMetrics: () => Promise; beforeAll(async () => { const config = createTestConfig({ @@ -102,16 +107,16 @@ beforeAll(async () => { }, }; - await monitor.startMonitoring( - config, - stores, - '4.0.0', - eventBus, - statsService, - schedulerService, - // @ts-ignore - We don't want a full knex implementation for our tests, it's enough that it actually yields the numbers we want. - metricsDbConf, - ); + const { collectDbMetrics, collectStaticCounters } = + registerPrometheusMetrics( + config, + stores, + '4.0.0', + eventBus, + statsService, + ); + refreshDbMetrics = collectDbMetrics; + await collectStaticCounters(); }); afterAll(async () => { @@ -212,7 +217,7 @@ test('should collect metrics for function timings', async () => { }); test('should collect metrics for feature flag size', async () => { - await statsService.dbMetrics.refreshDbMetrics(); + await refreshDbMetrics(); const metrics = await prometheusRegister.metrics(); expect(metrics).toMatch(/feature_toggles_total\{version="(.*)"\} 0/); }); @@ -223,12 +228,13 @@ test('should collect metrics for archived feature flag size', async () => { }); test('should collect metrics for total client apps', async () => { - await statsService.dbMetrics.refreshDbMetrics(); + await refreshDbMetrics(); const metrics = await prometheusRegister.metrics(); expect(metrics).toMatch(/client_apps_total\{range="(.*)"\} 0/); }); test('Should collect metrics for database', async () => { + registerPrometheusPostgresMetrics(db.rawDatabase, eventBus, '15.0.0'); const metrics = await prometheusRegister.metrics(); expect(metrics).toMatch(/db_pool_max/); expect(metrics).toMatch(/db_pool_min/); diff --git a/src/lib/metrics.ts b/src/lib/metrics.ts index 05ba0e9885..1b69a462ac 100644 --- a/src/lib/metrics.ts +++ b/src/lib/metrics.ts @@ -25,7 +25,7 @@ import { PROJECT_DELETED, } from './types/events'; import type { IUnleashConfig } from './types/option'; -import type { ISettingStore, IUnleashStores } from './types/stores'; +import type { IUnleashStores } from './types/stores'; import { hoursToMilliseconds, minutesToMilliseconds } from 'date-fns'; import type { InstanceStatsService } from './features/instance-stats/instance-stats-service'; import type { IEnvironment, ISdkHeartbeat } from './types'; @@ -37,6 +37,56 @@ import { } from './util/metrics'; import type { SchedulerService } from './services'; import type { IClientMetricsEnv } from './features/metrics/client-metrics/client-metrics-store-v2-type'; +import { DbMetricsMonitor } from './metrics-gauge'; + +export function registerPrometheusPostgresMetrics( + db: Knex, + eventBus: EventEmitter, + postgresVersion: string, +) { + if (db?.client) { + const dbPoolMin = createGauge({ + name: 'db_pool_min', + help: 'Minimum DB pool size', + }); + dbPoolMin.set(db.client.pool.min); + const dbPoolMax = createGauge({ + name: 'db_pool_max', + help: 'Maximum DB pool size', + }); + dbPoolMax.set(db.client.pool.max); + const dbPoolFree = createGauge({ + name: 'db_pool_free', + help: 'Current free connections in DB pool', + }); + const dbPoolUsed = createGauge({ + name: 'db_pool_used', + help: 'Current connections in use in DB pool', + }); + const dbPoolPendingCreates = createGauge({ + name: 'db_pool_pending_creates', + help: 'how many asynchronous create calls are running in DB pool', + }); + const dbPoolPendingAcquires = createGauge({ + name: 'db_pool_pending_acquires', + help: 'how many acquires are waiting for a resource to be released in DB pool', + }); + + eventBus.on(DB_POOL_UPDATE, (data) => { + dbPoolFree.set(data.free); + dbPoolUsed.set(data.used); + dbPoolPendingCreates.set(data.pendingCreates); + dbPoolPendingAcquires.set(data.pendingAcquires); + }); + + const database_version = createGauge({ + name: 'postgres_version', + help: 'Which version of postgres is running (SHOW server_version)', + labelNames: ['version'], + }); + database_version.labels({ version: postgresVersion }).set(1); + } +} export function registerPrometheusMetrics( config: IUnleashConfig, @@ -60,8 +110,8 @@ export function registerPrometheusMetrics( }; const { eventStore, environmentStore } = stores; - const { flagResolver } = config; - const dbMetrics = instanceStatsService.dbMetrics; + const { flagResolver, db } = config; + const dbMetrics = new DbMetricsMonitor(config); const cachedEnvironments: () => Promise = memoizee( async () => environmentStore.getAll(), @@ -124,10 +174,7 @@ export function registerPrometheusMetrics( name: 'feature_toggles_total', help: 'Number of feature flags', labelNames: ['version'], - query: () => - stores.featureToggleStore.count({ - archived: false, - }), + query: () => instanceStatsService.getToggleCount(), map: (value) => ({ value, labels: { version } }), }); @@ -268,18 +315,7 @@ export function registerPrometheusMetrics( name: 'client_apps_total', help: 'Number of registered client apps aggregated by range by last seen', labelNames: ['range'], - query: async () => { - const [t7d, t30d, allTime] = await Promise.all([ - stores.clientInstanceStore.getDistinctApplicationsCount(7), - stores.clientInstanceStore.getDistinctApplicationsCount(30), - stores.clientInstanceStore.getDistinctApplicationsCount(), - ]); - return { - '7d': t7d, - '30d': t30d, - allTime, - }; - }, + query: () => instanceStatsService.getLabeledAppCounts(), map: (result) => Object.entries(result).map(([range, count]) => ({ value: count, @@ -735,13 +771,10 @@ export function registerPrometheusMetrics( addonEventsHandledCounter.increment({ result, destination }); }); - // return an update function (temporarily) to allow for manual refresh return { + collectDbMetrics: dbMetrics.refreshDbMetrics, collectStaticCounters: async () => { try { - dbMetrics.refreshDbMetrics(); - - const stats = await instanceStatsService.getStats(); const [ maxConstraintValuesResult, maxConstraintsPerStrategyResult, @@ -773,13 +806,17 @@ export function registerPrometheusMetrics( ]); featureTogglesArchivedTotal.reset(); - featureTogglesArchivedTotal.set(stats.archivedFeatureToggles); + featureTogglesArchivedTotal.set( + await instanceStatsService.getArchivedToggleCount(), + ); usersTotal.reset(); - usersTotal.set(stats.users); + usersTotal.set(await instanceStatsService.getRegisteredUsers()); serviceAccounts.reset(); - serviceAccounts.set(stats.serviceAccounts); + serviceAccounts.set( + await instanceStatsService.countServiceAccounts(), + ); stageDurationByProject.forEach((stage) => { featureLifecycleStageDuration @@ -802,7 +839,10 @@ export function registerPrometheusMetrics( apiTokens.reset(); - for (const [type, value] of stats.apiTokens) { + for (const [ + type, + value, + ] of await instanceStatsService.countApiTokensByType()) { apiTokens.labels({ type }).set(value); } @@ -887,67 +927,84 @@ export function registerPrometheusMetrics( resourceLimit.labels({ resource }).set(limit); } + const previousDayMetricsBucketsCount = + await instanceStatsService.countPreviousDayHourlyMetricsBuckets(); enabledMetricsBucketsPreviousDay.reset(); enabledMetricsBucketsPreviousDay.set( - stats.previousDayMetricsBucketsCount.enabledCount, + previousDayMetricsBucketsCount.enabledCount, ); variantMetricsBucketsPreviousDay.reset(); variantMetricsBucketsPreviousDay.set( - stats.previousDayMetricsBucketsCount.variantCount, + previousDayMetricsBucketsCount.variantCount, ); + const activeUsers = await instanceStatsService.getActiveUsers(); usersActive7days.reset(); - usersActive7days.set(stats.activeUsers.last7); + usersActive7days.set(activeUsers.last7); usersActive30days.reset(); - usersActive30days.set(stats.activeUsers.last30); + usersActive30days.set(activeUsers.last30); usersActive60days.reset(); - usersActive60days.set(stats.activeUsers.last60); + usersActive60days.set(activeUsers.last60); usersActive90days.reset(); - usersActive90days.set(stats.activeUsers.last90); + usersActive90days.set(activeUsers.last90); + const productionChanges = + await instanceStatsService.getProductionChanges(); productionChanges30.reset(); - productionChanges30.set(stats.productionChanges.last30); + productionChanges30.set(productionChanges.last30); productionChanges60.reset(); - productionChanges60.set(stats.productionChanges.last60); + productionChanges60.set(productionChanges.last60); productionChanges90.reset(); - productionChanges90.set(stats.productionChanges.last90); + productionChanges90.set(productionChanges.last90); + const projects = + await instanceStatsService.getProjectModeCount(); projectsTotal.reset(); - stats.projects.forEach((projectStat) => { + projects.forEach((projectStat) => { projectsTotal .labels({ mode: projectStat.mode }) .set(projectStat.count); }); environmentsTotal.reset(); - environmentsTotal.set(stats.environments); + environmentsTotal.set( + await instanceStatsService.environmentCount(), + ); groupsTotal.reset(); - groupsTotal.set(stats.groups); + groupsTotal.set(await instanceStatsService.groupCount()); rolesTotal.reset(); - rolesTotal.set(stats.roles); + rolesTotal.set(await instanceStatsService.roleCount()); customRootRolesTotal.reset(); - customRootRolesTotal.set(stats.customRootRoles); + customRootRolesTotal.set( + await instanceStatsService.customRolesCount(), + ); customRootRolesInUseTotal.reset(); - customRootRolesInUseTotal.set(stats.customRootRolesInUse); + customRootRolesInUseTotal.set( + await instanceStatsService.customRolesCountInUse(), + ); segmentsTotal.reset(); - segmentsTotal.set(stats.segments); + segmentsTotal.set(await instanceStatsService.segmentCount()); contextTotal.reset(); - contextTotal.set(stats.contextFields); + contextTotal.set( + await instanceStatsService.contextFieldCount(), + ); strategiesTotal.reset(); - strategiesTotal.set(stats.strategies); + strategiesTotal.set( + await instanceStatsService.strategiesCount(), + ); samlEnabled.reset(); - samlEnabled.set(stats.SAMLenabled ? 1 : 0); + samlEnabled.set((await instanceStatsService.hasSAML()) ? 1 : 0); oidcEnabled.reset(); - oidcEnabled.set(stats.OIDCenabled ? 1 : 0); + oidcEnabled.set((await instanceStatsService.hasOIDC()) ? 1 : 0); rateLimits.reset(); rateLimits @@ -1026,22 +1083,21 @@ export default class MetricsMonitor { collectDefaultMetrics(); - const { collectStaticCounters } = registerPrometheusMetrics( - config, - stores, - version, - eventBus, - instanceStatsService, - ); + const { collectStaticCounters, collectDbMetrics } = + registerPrometheusMetrics( + config, + stores, + version, + eventBus, + instanceStatsService, + ); - await this.registerPrometheusDbMetrics( - db, - eventBus, - stores.settingStore, - ); + const postgresVersion = await stores.settingStore.postgresVersion(); + registerPrometheusPostgresMetrics(db, eventBus, postgresVersion); await schedulerService.schedule( - collectStaticCounters.bind(this), + async () => + Promise.all([collectStaticCounters(), collectDbMetrics()]), hoursToMilliseconds(2), 'collectStaticCounters', ); @@ -1056,56 +1112,6 @@ export default class MetricsMonitor { return Promise.resolve(); } - async registerPrometheusDbMetrics( - db: Knex, - eventBus: EventEmitter, - settingStore: ISettingStore, - ): Promise { - if (db?.client) { - const dbPoolMin = createGauge({ - name: 'db_pool_min', - help: 'Minimum DB pool size', - }); - dbPoolMin.set(db.client.pool.min); - const dbPoolMax = createGauge({ - name: 'db_pool_max', - help: 'Maximum DB pool size', - }); - dbPoolMax.set(db.client.pool.max); - const dbPoolFree = createGauge({ - name: 'db_pool_free', - help: 'Current free connections in DB pool', - }); - const dbPoolUsed = createGauge({ - name: 'db_pool_used', - help: 'Current connections in use in DB pool', - }); - const dbPoolPendingCreates = createGauge({ - name: 'db_pool_pending_creates', - help: 'how many asynchronous create calls are running in DB pool', - }); - const dbPoolPendingAcquires = createGauge({ - name: 'db_pool_pending_acquires', - help: 'how many acquires are waiting for a resource to be released in DB pool', - }); - - eventBus.on(DB_POOL_UPDATE, (data) => { - dbPoolFree.set(data.free); - dbPoolUsed.set(data.used); - dbPoolPendingCreates.set(data.pendingCreates); - dbPoolPendingAcquires.set(data.pendingAcquires); - }); - - const postgresVersion = await settingStore.postgresVersion(); - const database_version = createGauge({ - name: 'postgres_version', - help: 'Which version of postgres is running (SHOW server_version)', - labelNames: ['version'], - }); - database_version.labels({ version: postgresVersion }).set(1); - } - } - // eslint-disable-next-line @typescript-eslint/explicit-module-boundary-types registerPoolMetrics(pool: any, eventBus: EventEmitter) { try { diff --git a/src/test/e2e/api/admin/instance-admin.e2e.test.ts b/src/test/e2e/api/admin/instance-admin.e2e.test.ts index 1a645195b4..a3e5c08c73 100644 --- a/src/test/e2e/api/admin/instance-admin.e2e.test.ts +++ b/src/test/e2e/api/admin/instance-admin.e2e.test.ts @@ -11,6 +11,7 @@ import { registerPrometheusMetrics } from '../../../../lib/metrics'; let app: IUnleashTest; let db: ITestDb; let stores: IUnleashStores; +let refreshDbMetrics: () => Promise; beforeAll(async () => { db = await dbInit('instance_admin_api_serial', getLogger); @@ -28,13 +29,14 @@ beforeAll(async () => { db.rawDatabase, ); - registerPrometheusMetrics( + const { collectDbMetrics } = registerPrometheusMetrics( app.config, stores, undefined as unknown as string, app.config.eventBus, app.services.instanceStatsService, ); + refreshDbMetrics = collectDbMetrics; }); afterAll(async () => { @@ -48,7 +50,7 @@ test('should return instance statistics', async () => { createdByUserId: 9999, }); - await app.services.instanceStatsService.dbMetrics.refreshDbMetrics(); + await refreshDbMetrics(); return app.request .get('/api/admin/instance-admin/statistics')