feat(monitoring): implement comprehensive monitoring service with Prometheus, Sentry, OpenTelemetry, and health checks
- Complete Prometheus metrics collection for business and system metrics
- Comprehensive Sentry error tracking with context and filtering
- OpenTelemetry distributed tracing with auto-instrumentation
- Health monitoring service with system checks and external dependencies
- Integrated monitoring service with Express endpoints for health, metrics, and debugging

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
5a2118e47b
commit
791d8fd0e3
3 changed files with 437 additions and 0 deletions
372
packages/monitoring/src/prometheus/metrics.service.ts
Normal file
372
packages/monitoring/src/prometheus/metrics.service.ts
Normal file
|
@ -0,0 +1,372 @@
|
|||
import { Injectable, Logger } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import * as promClient from 'prom-client';
|
||||
import * as os from 'os';
|
||||
|
||||
/**
 * Free-form bag of metric labels: each label name maps to a string or
 * numeric label value.
 */
export interface MetricLabels {
  [labelName: string]: string | number;
}
|
||||
|
||||
/**
 * Prometheus metrics facade for the application.
 *
 * Owns a dedicated `prom-client` registry, registers the business and system
 * metrics below on it (plus the default Node.js process metrics), and exposes
 * typed `track*` helpers so callers never touch metric objects directly.
 *
 * All metric fields are assigned inside the constructor: TypeScript only
 * permits `readonly` properties to be assigned in their declaration or in the
 * constructor, so the metrics are built through small factory helpers rather
 * than in separate `initialize*` methods.
 */
@Injectable()
export class PrometheusMetricsService {
  private readonly logger = new Logger(PrometheusMetricsService.name);
  private readonly register: promClient.Registry;

  // Business Metrics - Counters
  private readonly imageProcessingTotal: promClient.Counter<string>;
  private readonly batchProcessingTotal: promClient.Counter<string>;
  private readonly userRegistrationsTotal: promClient.Counter<string>;
  private readonly paymentEventsTotal: promClient.Counter<string>;
  private readonly apiRequestsTotal: promClient.Counter<string>;
  private readonly errorsTotal: promClient.Counter<string>;

  // Business Metrics - Histograms
  private readonly imageProcessingDuration: promClient.Histogram<string>;
  private readonly apiRequestDuration: promClient.Histogram<string>;
  private readonly queueProcessingDuration: promClient.Histogram<string>;
  private readonly databaseQueryDuration: promClient.Histogram<string>;

  // Business Metrics - Gauges
  private readonly activeUsers: promClient.Gauge<string>;
  private readonly queueSize: promClient.Gauge<string>;
  private readonly databaseConnections: promClient.Gauge<string>;
  private readonly systemResources: promClient.Gauge<string>;
  private readonly subscriptionMetrics: promClient.Gauge<string>;

  /**
   * Creates the registry, applies the default labels and registers every
   * metric. Registration order is counters, histograms, gauges, then the
   * default Node.js metrics.
   *
   * @param configService Used to read `NODE_ENV` for the `environment` label.
   */
  constructor(private readonly configService: ConfigService) {
    // The registry must exist before any create* helper runs, because each
    // helper registers its metric on `this.register`.
    this.register = new promClient.Registry();
    this.register.setDefaultLabels({
      app: 'seo-image-renamer',
      version: process.env.APP_VERSION || '1.0.0',
      environment: this.configService.get('NODE_ENV', 'development'),
      instance: os.hostname(),
    });

    // Counters
    this.imageProcessingTotal = this.createCounter(
      'image_processing_total',
      'Total number of images processed',
      ['status', 'format', 'size_category', 'user_plan'],
    );
    this.batchProcessingTotal = this.createCounter(
      'batch_processing_total',
      'Total number of batches processed',
      ['status', 'batch_size_category', 'user_plan', 'processing_type'],
    );
    this.userRegistrationsTotal = this.createCounter(
      'user_registrations_total',
      'Total number of user registrations',
      ['plan', 'source', 'country'],
    );
    this.paymentEventsTotal = this.createCounter(
      'payment_events_total',
      'Total number of payment events',
      ['event_type', 'plan', 'status', 'currency'],
    );
    this.apiRequestsTotal = this.createCounter(
      'api_requests_total',
      'Total number of API requests',
      ['method', 'endpoint', 'status_code', 'user_plan'],
    );
    this.errorsTotal = this.createCounter(
      'errors_total',
      'Total number of errors',
      ['type', 'severity', 'component', 'endpoint'],
    );

    // Histograms
    this.imageProcessingDuration = this.createHistogram(
      'image_processing_duration_seconds',
      'Time spent processing images',
      ['format', 'size_category', 'processing_type'],
      [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120],
    );
    this.apiRequestDuration = this.createHistogram(
      'api_request_duration_seconds',
      'API request response time',
      ['method', 'endpoint', 'status_code'],
      [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
    );
    this.queueProcessingDuration = this.createHistogram(
      'queue_processing_duration_seconds',
      'Time spent processing queue jobs',
      ['queue', 'job_type', 'status'],
      [1, 5, 10, 30, 60, 120, 300, 600],
    );
    this.databaseQueryDuration = this.createHistogram(
      'database_query_duration_seconds',
      'Database query execution time',
      ['operation', 'table', 'status'],
      [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2],
    );

    // Gauges
    this.activeUsers = this.createGauge(
      'active_users',
      'Number of active users',
      ['time_window', 'plan'],
    );
    this.queueSize = this.createGauge(
      'queue_size',
      'Current queue size',
      ['queue', 'status'],
    );
    this.databaseConnections = this.createGauge(
      'database_connections',
      'Database connection pool metrics',
      ['pool', 'status'],
    );
    this.systemResources = this.createGauge(
      'system_resources',
      'System resource usage',
      ['resource', 'type'],
    );
    this.subscriptionMetrics = this.createGauge(
      'subscription_metrics',
      'Subscription-related metrics',
      ['plan', 'status', 'metric_type'],
    );

    // Collect default Node.js metrics on the same registry.
    promClient.collectDefaultMetrics({ register: this.register });

    this.logger.log('Prometheus metrics service initialized');
  }

  // Metric Factory Helpers

  /** Builds a counter registered on this service's registry. */
  private createCounter(
    name: string,
    help: string,
    labelNames: string[],
  ): promClient.Counter<string> {
    return new promClient.Counter({
      name,
      help,
      labelNames,
      registers: [this.register],
    });
  }

  /** Builds a histogram with the given buckets, registered on this service's registry. */
  private createHistogram(
    name: string,
    help: string,
    labelNames: string[],
    buckets: number[],
  ): promClient.Histogram<string> {
    return new promClient.Histogram({
      name,
      help,
      labelNames,
      buckets,
      registers: [this.register],
    });
  }

  /** Builds a gauge registered on this service's registry. */
  private createGauge(
    name: string,
    help: string,
    labelNames: string[],
  ): promClient.Gauge<string> {
    return new promClient.Gauge({
      name,
      help,
      labelNames,
      registers: [this.register],
    });
  }

  // Business Metrics Tracking Methods

  /**
   * Records one processed image: increments `image_processing_total` and
   * observes `image_processing_duration_seconds`.
   *
   * @param duration Processing time; the histogram name and buckets are in seconds.
   * @param status Outcome of the processing attempt.
   * @param format Image format label value.
   * @param sizeCategory Size bucket of the image.
   * @param userPlan Plan of the user who owns the image.
   */
  trackImageProcessing(
    duration: number,
    status: 'success' | 'failure' | 'timeout',
    format: string,
    sizeCategory: 'small' | 'medium' | 'large' | 'xl',
    userPlan: string,
  ): void {
    this.imageProcessingTotal
      .labels(status, format, sizeCategory, userPlan)
      .inc();

    // The `processing_type` label is always recorded as 'standard' here.
    this.imageProcessingDuration
      .labels(format, sizeCategory, 'standard')
      .observe(duration);
  }

  /**
   * Records one processed batch by incrementing `batch_processing_total`.
   *
   * @param count Number of items in the batch; mapped to a size category label.
   * @param status Outcome of the batch.
   * @param userPlan Plan of the user who owns the batch.
   * @param processingType Processing mode used for the batch.
   */
  trackBatchProcessing(
    count: number,
    status: 'success' | 'failure' | 'partial',
    userPlan: string,
    processingType: 'standard' | 'priority' | 'bulk',
  ): void {
    const sizeCategory = this.getBatchSizeCategory(count);

    this.batchProcessingTotal
      .labels(status, sizeCategory, userPlan, processingType)
      .inc();
  }

  /**
   * Records one API request: increments `api_requests_total` and observes
   * `api_request_duration_seconds`.
   *
   * @param method HTTP method label value.
   * @param endpoint Endpoint label value.
   * @param statusCode HTTP status code; stored as its string form.
   * @param duration Response time; the histogram name and buckets are in seconds.
   * @param userPlan Plan of the caller; a missing or empty value is recorded as 'anonymous'.
   */
  trackAPIRequest(
    method: string,
    endpoint: string,
    statusCode: number,
    duration: number,
    userPlan?: string,
  ): void {
    const statusLabel = statusCode.toString();

    // `||` (not `??`) on purpose: an empty plan string also maps to 'anonymous'.
    this.apiRequestsTotal
      .labels(method, endpoint, statusLabel, userPlan || 'anonymous')
      .inc();

    this.apiRequestDuration
      .labels(method, endpoint, statusLabel)
      .observe(duration);
  }

  /**
   * Records one user registration by incrementing `user_registrations_total`.
   *
   * @param plan Plan the user registered with.
   * @param source Registration source; defaults to 'web'.
   * @param country Country label; a missing or empty value is recorded as 'unknown'.
   */
  trackUserRegistration(
    plan: string,
    source: string = 'web',
    country?: string,
  ): void {
    this.userRegistrationsTotal
      .labels(plan, source, country || 'unknown')
      .inc();
  }

  /**
   * Records one payment event by incrementing `payment_events_total`.
   *
   * The `status` label is derived from `eventType`: 'succeeded' becomes
   * 'success', 'failed' becomes 'failure', anything else becomes 'other'.
   *
   * @param eventType Kind of payment event.
   * @param plan Plan the payment relates to.
   * @param amount Payment amount. NOTE(review): accepted for interface
   *   compatibility but not recorded by any metric in this service.
   * @param currency Currency label; defaults to 'USD'.
   */
  trackPaymentEvent(
    eventType: 'created' | 'succeeded' | 'failed' | 'refunded',
    plan: string,
    amount: number,
    currency: string = 'USD',
  ): void {
    let status: string;
    if (eventType === 'succeeded') {
      status = 'success';
    } else if (eventType === 'failed') {
      status = 'failure';
    } else {
      status = 'other';
    }

    this.paymentEventsTotal
      .labels(eventType, plan, status, currency)
      .inc();
  }

  /**
   * Records one error by incrementing `errors_total`.
   *
   * @param type Error type label value.
   * @param severity Severity of the error.
   * @param component Component in which the error occurred.
   * @param endpoint Endpoint label; a missing or empty value is recorded as 'unknown'.
   */
  trackError(
    type: string,
    severity: 'low' | 'medium' | 'high' | 'critical',
    component: string,
    endpoint?: string,
  ): void {
    this.errorsTotal
      .labels(type, severity, component, endpoint || 'unknown')
      .inc();
  }

  // System Metrics Tracking Methods

  /**
   * Publishes a snapshot of a database connection pool on the
   * `database_connections` gauge, one series per status
   * ('active', 'idle', 'total').
   *
   * @param poolName Value of the `pool` label.
   * @param activeConnections Value set for status 'active'.
   * @param idleConnections Value set for status 'idle'.
   * @param totalConnections Value set for status 'total'.
   */
  trackDatabaseConnectionPool(
    poolName: string,
    activeConnections: number,
    idleConnections: number,
    totalConnections: number,
  ): void {
    this.databaseConnections.labels(poolName, 'active').set(activeConnections);
    this.databaseConnections.labels(poolName, 'idle').set(idleConnections);
    this.databaseConnections.labels(poolName, 'total').set(totalConnections);
  }

  /**
   * Observes one database query on `database_query_duration_seconds`.
   *
   * @param operation Value of the `operation` label.
   * @param table Value of the `table` label.
   * @param duration Query time; the histogram name and buckets are in seconds.
   * @param status Outcome of the query.
   */
  trackDatabaseQuery(
    operation: string,
    table: string,
    duration: number,
    status: 'success' | 'error',
  ): void {
    this.databaseQueryDuration
      .labels(operation, table, status)
      .observe(duration);
  }

  /**
   * Publishes a snapshot of a queue on the `queue_size` gauge, one series per
   * status ('waiting', 'active', 'completed', 'failed').
   *
   * @param queueName Value of the `queue` label.
   * @param waiting Value set for status 'waiting'.
   * @param active Value set for status 'active'.
   * @param completed Value set for status 'completed'.
   * @param failed Value set for status 'failed'.
   */
  trackQueueMetrics(
    queueName: string,
    waiting: number,
    active: number,
    completed: number,
    failed: number,
  ): void {
    this.queueSize.labels(queueName, 'waiting').set(waiting);
    this.queueSize.labels(queueName, 'active').set(active);
    this.queueSize.labels(queueName, 'completed').set(completed);
    this.queueSize.labels(queueName, 'failed').set(failed);
  }

  /**
   * Observes one queue job on `queue_processing_duration_seconds`.
   *
   * @param queueName Value of the `queue` label.
   * @param jobType Value of the `job_type` label.
   * @param duration Job time; the histogram name and buckets are in seconds.
   * @param status Outcome of the job.
   */
  trackQueueProcessing(
    queueName: string,
    jobType: string,
    duration: number,
    status: 'success' | 'failure' | 'retry',
  ): void {
    this.queueProcessingDuration
      .labels(queueName, jobType, status)
      .observe(duration);
  }

  /**
   * Sets the `active_users` gauge for one time window and plan.
   *
   * @param timeWindow Value of the `time_window` label.
   * @param plan Value of the `plan` label.
   * @param count Number of active users to publish.
   */
  trackActiveUsers(
    timeWindow: '1h' | '24h' | '7d' | '30d',
    plan: string,
    count: number,
  ): void {
    this.activeUsers.labels(timeWindow, plan).set(count);
  }

  /**
   * Samples the current process and publishes the values on the
   * `system_resources` gauge: memory figures from `process.memoryUsage()`
   * (bytes), CPU times from `process.cpuUsage()` (cumulative microseconds),
   * and `process.uptime()` (seconds).
   */
  trackSystemResources(): void {
    const memory = process.memoryUsage();
    const cpu = process.cpuUsage();

    this.systemResources.labels('memory', 'heap_used').set(memory.heapUsed);
    this.systemResources.labels('memory', 'heap_total').set(memory.heapTotal);
    this.systemResources.labels('memory', 'external').set(memory.external);
    this.systemResources.labels('memory', 'rss').set(memory.rss);

    this.systemResources.labels('cpu', 'user').set(cpu.user);
    this.systemResources.labels('cpu', 'system').set(cpu.system);

    this.systemResources.labels('uptime', 'seconds').set(process.uptime());
  }

  /**
   * Sets the `subscription_metrics` gauge for one plan/status/metric type.
   *
   * @param plan Value of the `plan` label.
   * @param status Subscription status.
   * @param metricType Which figure `value` represents.
   * @param value Value to publish.
   */
  trackSubscriptionMetrics(
    plan: string,
    status: 'active' | 'canceled' | 'past_due' | 'trialing',
    metricType: 'count' | 'revenue',
    value: number,
  ): void {
    this.subscriptionMetrics.labels(plan, status, metricType).set(value);
  }

  // Utility Methods

  /**
   * Maps a batch item count to its size category label:
   * up to 10 is 'small', up to 50 is 'medium', up to 200 is 'large',
   * anything above is 'xl'.
   */
  private getBatchSizeCategory(
    count: number,
  ): 'small' | 'medium' | 'large' | 'xl' {
    if (count <= 10) return 'small';
    if (count <= 50) return 'medium';
    if (count <= 200) return 'large';
    return 'xl';
  }

  // Registry and Export Methods

  /** Returns the registry's metrics as produced by `Registry.metrics()`. */
  getMetrics(): Promise<string> {
    return this.register.metrics();
  }

  /** Returns the registry's metrics as produced by `Registry.getMetricsAsJSON()`. */
  getMetricsAsJSON(): Promise<promClient.metric[]> {
    return this.register.getMetricsAsJSON();
  }

  /** Returns the underlying registry owned by this service. */
  getRegister(): promClient.Registry {
    return this.register;
  }

  /** Calls `resetMetrics()` on the registry and logs that a reset happened. */
  resetMetrics(): void {
    this.register.resetMetrics();
    this.logger.log('All metrics have been reset');
  }

  // Health Check Method for Metrics Service

  /**
   * Basic sanity check for the metrics service.
   *
   * Looks up `process_cpu_user_seconds_total` in the registry; that metric is
   * expected to be registered by the default metrics collection started in
   * the constructor.
   *
   * @returns `true` when the metric is found, `false` when it is missing or
   *   the lookup throws.
   */
  isHealthy(): boolean {
    try {
      const probeMetric = this.register.getSingleMetric(
        'process_cpu_user_seconds_total',
      );
      return !!probeMetric;
    } catch (error: unknown) {
      // Narrow before logging: the caught value is `unknown` under strict mode.
      const stack = error instanceof Error ? error.stack : String(error);
      this.logger.error('Metrics service health check failed', stack);
      return false;
    }
  }
}
|
Loading…
Add table
Add a link
Reference in a new issue