feat(monitoring): implement comprehensive monitoring service with Prometheus, Sentry, OpenTelemetry, and health checks

- Complete Prometheus metrics collection for business and system metrics
- Comprehensive Sentry error tracking with context and filtering
- OpenTelemetry distributed tracing with auto-instrumentation
- Health monitoring service with system checks and external dependencies
- Integrated monitoring service with Express endpoints for health, metrics, and debugging

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
DustyWalker 2025-08-05 19:20:00 +02:00
parent 5a2118e47b
commit 791d8fd0e3
3 changed files with 437 additions and 0 deletions

View file

@@ -0,0 +1,372 @@
import { Injectable, Logger } from '@nestjs/common';
import { ConfigService } from '@nestjs/config';
import * as promClient from 'prom-client';
import * as os from 'os';
/**
 * Free-form set of metric labels: each label name maps to a string or
 * numeric label value.
 */
export interface MetricLabels {
  [labelName: string]: string | number;
}
/**
 * Owns a dedicated prom-client registry and exposes typed helper methods for
 * recording business metrics (image/batch processing, registrations, payments,
 * API traffic, errors) and system metrics (database pool, queues, process
 * resources).
 *
 * Every metric is created and registered in the constructor, so the
 * `readonly` metric fields are definitely assigned before any tracking
 * method can run.
 */
@Injectable()
export class PrometheusMetricsService {
  private readonly logger = new Logger(PrometheusMetricsService.name);
  private readonly register: promClient.Registry;

  // Business Metrics - Counters
  private readonly imageProcessingTotal: promClient.Counter<string>;
  private readonly batchProcessingTotal: promClient.Counter<string>;
  private readonly userRegistrationsTotal: promClient.Counter<string>;
  private readonly paymentEventsTotal: promClient.Counter<string>;
  private readonly apiRequestsTotal: promClient.Counter<string>;
  private readonly errorsTotal: promClient.Counter<string>;

  // Business Metrics - Histograms
  private readonly imageProcessingDuration: promClient.Histogram<string>;
  private readonly apiRequestDuration: promClient.Histogram<string>;
  private readonly queueProcessingDuration: promClient.Histogram<string>;
  private readonly databaseQueryDuration: promClient.Histogram<string>;

  // Business Metrics - Gauges
  private readonly activeUsers: promClient.Gauge<string>;
  private readonly queueSize: promClient.Gauge<string>;
  private readonly databaseConnections: promClient.Gauge<string>;
  private readonly systemResources: promClient.Gauge<string>;
  private readonly subscriptionMetrics: promClient.Gauge<string>;

  /**
   * Creates the registry, applies the default labels, registers every custom
   * metric and then enables prom-client's default Node.js metrics.
   *
   * @param configService Source of the `NODE_ENV` value used for the
   *   `environment` default label (falls back to `'development'`).
   */
  constructor(private readonly configService: ConfigService) {
    this.register = new promClient.Registry();
    this.register.setDefaultLabels({
      app: 'seo-image-renamer',
      version: process.env.APP_VERSION || '1.0.0',
      environment: this.configService.get('NODE_ENV', 'development'),
      instance: os.hostname(),
    });

    // Metrics are assigned here (not in helper methods) because TypeScript
    // only permits assignment to `readonly` fields inside the constructor.
    // Registration order is counters, histograms, gauges, default metrics.

    // Counters
    this.imageProcessingTotal = this.createCounter(
      'image_processing_total',
      'Total number of images processed',
      ['status', 'format', 'size_category', 'user_plan'],
    );
    this.batchProcessingTotal = this.createCounter(
      'batch_processing_total',
      'Total number of batches processed',
      ['status', 'batch_size_category', 'user_plan', 'processing_type'],
    );
    this.userRegistrationsTotal = this.createCounter(
      'user_registrations_total',
      'Total number of user registrations',
      ['plan', 'source', 'country'],
    );
    this.paymentEventsTotal = this.createCounter(
      'payment_events_total',
      'Total number of payment events',
      ['event_type', 'plan', 'status', 'currency'],
    );
    this.apiRequestsTotal = this.createCounter(
      'api_requests_total',
      'Total number of API requests',
      ['method', 'endpoint', 'status_code', 'user_plan'],
    );
    this.errorsTotal = this.createCounter(
      'errors_total',
      'Total number of errors',
      ['type', 'severity', 'component', 'endpoint'],
    );

    // Histograms
    this.imageProcessingDuration = this.createHistogram(
      'image_processing_duration_seconds',
      'Time spent processing images',
      ['format', 'size_category', 'processing_type'],
      [0.1, 0.5, 1, 2, 5, 10, 30, 60, 120],
    );
    this.apiRequestDuration = this.createHistogram(
      'api_request_duration_seconds',
      'API request response time',
      ['method', 'endpoint', 'status_code'],
      [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
    );
    this.queueProcessingDuration = this.createHistogram(
      'queue_processing_duration_seconds',
      'Time spent processing queue jobs',
      ['queue', 'job_type', 'status'],
      [1, 5, 10, 30, 60, 120, 300, 600],
    );
    this.databaseQueryDuration = this.createHistogram(
      'database_query_duration_seconds',
      'Database query execution time',
      ['operation', 'table', 'status'],
      [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5, 1, 2],
    );

    // Gauges
    this.activeUsers = this.createGauge(
      'active_users',
      'Number of active users',
      ['time_window', 'plan'],
    );
    this.queueSize = this.createGauge(
      'queue_size',
      'Current queue size',
      ['queue', 'status'],
    );
    this.databaseConnections = this.createGauge(
      'database_connections',
      'Database connection pool metrics',
      ['pool', 'status'],
    );
    this.systemResources = this.createGauge(
      'system_resources',
      'System resource usage',
      ['resource', 'type'],
    );
    this.subscriptionMetrics = this.createGauge(
      'subscription_metrics',
      'Subscription-related metrics',
      ['plan', 'status', 'metric_type'],
    );

    // Collect default Node.js metrics into the same registry
    promClient.collectDefaultMetrics({ register: this.register });

    this.logger.log('Prometheus metrics service initialized');
  }

  // Metric Factory Helpers

  /** Builds a counter registered on this service's registry. */
  private createCounter(
    name: string,
    help: string,
    labelNames: string[],
  ): promClient.Counter<string> {
    return new promClient.Counter({
      name,
      help,
      labelNames,
      registers: [this.register],
    });
  }

  /** Builds a histogram with explicit buckets, registered on this service's registry. */
  private createHistogram(
    name: string,
    help: string,
    labelNames: string[],
    buckets: number[],
  ): promClient.Histogram<string> {
    return new promClient.Histogram({
      name,
      help,
      labelNames,
      buckets,
      registers: [this.register],
    });
  }

  /** Builds a gauge registered on this service's registry. */
  private createGauge(
    name: string,
    help: string,
    labelNames: string[],
  ): promClient.Gauge<string> {
    return new promClient.Gauge({
      name,
      help,
      labelNames,
      registers: [this.register],
    });
  }

  // Business Metrics Tracking Methods

  /**
   * Counts one processed image and observes its processing time.
   *
   * @param duration Processing time; observed on
   *   `image_processing_duration_seconds`, so it is expected in seconds.
   * @param status Outcome of the processing attempt.
   * @param format Image format label value.
   * @param sizeCategory Size bucket of the image.
   * @param userPlan Plan label value for the owning user.
   * @param processingType Value for the histogram's `processing_type` label;
   *   defaults to `'standard'`.
   */
  trackImageProcessing(
    duration: number,
    status: 'success' | 'failure' | 'timeout',
    format: string,
    sizeCategory: 'small' | 'medium' | 'large' | 'xl',
    userPlan: string,
    processingType: string = 'standard',
  ): void {
    this.imageProcessingTotal
      .labels(status, format, sizeCategory, userPlan)
      .inc();

    this.imageProcessingDuration
      .labels(format, sizeCategory, processingType)
      .observe(duration);
  }

  /**
   * Counts one processed batch, labelled with a size category derived from
   * the number of items in the batch.
   *
   * @param count Number of items in the batch.
   * @param status Outcome of the batch.
   * @param userPlan Plan label value for the owning user.
   * @param processingType Processing mode label value.
   */
  trackBatchProcessing(
    count: number,
    status: 'success' | 'failure' | 'partial',
    userPlan: string,
    processingType: 'standard' | 'priority' | 'bulk',
  ): void {
    const batchSizeCategory = this.getBatchSizeCategory(count);

    this.batchProcessingTotal
      .labels(status, batchSizeCategory, userPlan, processingType)
      .inc();
  }

  /**
   * Counts one API request and observes its response time.
   *
   * @param method HTTP method label value.
   * @param endpoint Endpoint label value. NOTE(review): callers should
   *   presumably pass a route template rather than a raw URL to keep label
   *   cardinality bounded; confirm at the call sites.
   * @param statusCode HTTP status code; stored as a string label.
   * @param duration Response time; observed on
   *   `api_request_duration_seconds`, so it is expected in seconds.
   * @param userPlan Plan label value; a missing or empty value is recorded
   *   as `'anonymous'`.
   */
  trackAPIRequest(
    method: string,
    endpoint: string,
    statusCode: number,
    duration: number,
    userPlan?: string,
  ): void {
    const statusCodeLabel = statusCode.toString();

    this.apiRequestsTotal
      .labels(method, endpoint, statusCodeLabel, userPlan || 'anonymous')
      .inc();

    this.apiRequestDuration
      .labels(method, endpoint, statusCodeLabel)
      .observe(duration);
  }

  /**
   * Counts one user registration.
   *
   * @param plan Plan the user registered on.
   * @param source Registration source; defaults to `'web'`.
   * @param country Country label value; a missing or empty value is recorded
   *   as `'unknown'`.
   */
  trackUserRegistration(
    plan: string,
    source: string = 'web',
    country?: string,
  ): void {
    this.userRegistrationsTotal
      .labels(plan, source, country || 'unknown')
      .inc();
  }

  /**
   * Counts one payment event. The `status` label is derived from the event
   * type: `'succeeded'` maps to `'success'`, `'failed'` to `'failure'`, and
   * every other event type to `'other'`.
   *
   * @param eventType Kind of payment event.
   * @param plan Plan the payment relates to.
   * @param amount Accepted but not recorded by any metric in this class.
   * @param currency Currency label value; defaults to `'USD'`.
   */
  trackPaymentEvent(
    eventType: 'created' | 'succeeded' | 'failed' | 'refunded',
    plan: string,
    amount: number,
    currency: string = 'USD',
  ): void {
    let status: string;
    if (eventType === 'succeeded') {
      status = 'success';
    } else if (eventType === 'failed') {
      status = 'failure';
    } else {
      status = 'other';
    }

    this.paymentEventsTotal
      .labels(eventType, plan, status, currency)
      .inc();
  }

  /**
   * Counts one error.
   *
   * @param type Error type label value.
   * @param severity Severity of the error.
   * @param component Component in which the error occurred.
   * @param endpoint Endpoint label value; a missing or empty value is
   *   recorded as `'unknown'`.
   */
  trackError(
    type: string,
    severity: 'low' | 'medium' | 'high' | 'critical',
    component: string,
    endpoint?: string,
  ): void {
    this.errorsTotal
      .labels(type, severity, component, endpoint || 'unknown')
      .inc();
  }

  // System Metrics Tracking Methods

  /**
   * Sets the `database_connections` gauge for the `active`, `idle` and
   * `total` statuses of the given pool.
   */
  trackDatabaseConnectionPool(
    poolName: string,
    activeConnections: number,
    idleConnections: number,
    totalConnections: number,
  ): void {
    this.databaseConnections.labels(poolName, 'active').set(activeConnections);
    this.databaseConnections.labels(poolName, 'idle').set(idleConnections);
    this.databaseConnections.labels(poolName, 'total').set(totalConnections);
  }

  /**
   * Observes the execution time of one database query.
   *
   * @param operation Operation label value.
   * @param table Table label value.
   * @param duration Query time; observed on
   *   `database_query_duration_seconds`, so it is expected in seconds.
   * @param status Outcome of the query.
   */
  trackDatabaseQuery(
    operation: string,
    table: string,
    duration: number,
    status: 'success' | 'error',
  ): void {
    this.databaseQueryDuration
      .labels(operation, table, status)
      .observe(duration);
  }

  /**
   * Sets the `queue_size` gauge for the `waiting`, `active`, `completed` and
   * `failed` statuses of the given queue.
   */
  trackQueueMetrics(
    queueName: string,
    waiting: number,
    active: number,
    completed: number,
    failed: number,
  ): void {
    this.queueSize.labels(queueName, 'waiting').set(waiting);
    this.queueSize.labels(queueName, 'active').set(active);
    this.queueSize.labels(queueName, 'completed').set(completed);
    this.queueSize.labels(queueName, 'failed').set(failed);
  }

  /**
   * Observes the processing time of one queue job.
   *
   * @param queueName Queue label value.
   * @param jobType Job type label value.
   * @param duration Job time; observed on
   *   `queue_processing_duration_seconds`, so it is expected in seconds.
   * @param status Outcome of the job.
   */
  trackQueueProcessing(
    queueName: string,
    jobType: string,
    duration: number,
    status: 'success' | 'failure' | 'retry',
  ): void {
    this.queueProcessingDuration
      .labels(queueName, jobType, status)
      .observe(duration);
  }

  /** Sets the `active_users` gauge for the given time window and plan. */
  trackActiveUsers(
    timeWindow: '1h' | '24h' | '7d' | '30d',
    plan: string,
    count: number,
  ): void {
    this.activeUsers.labels(timeWindow, plan).set(count);
  }

  /**
   * Takes a snapshot of the current process's resource usage and writes it to
   * the `system_resources` gauge.
   *
   * Values are stored exactly as Node.js reports them: `process.memoryUsage()`
   * figures are in bytes, `process.cpuUsage()` figures are cumulative
   * microseconds, and `process.uptime()` is in seconds.
   */
  trackSystemResources(): void {
    const memory = process.memoryUsage();
    const cpu = process.cpuUsage();

    this.systemResources.labels('memory', 'heap_used').set(memory.heapUsed);
    this.systemResources.labels('memory', 'heap_total').set(memory.heapTotal);
    this.systemResources.labels('memory', 'external').set(memory.external);
    this.systemResources.labels('memory', 'rss').set(memory.rss);

    this.systemResources.labels('cpu', 'user').set(cpu.user);
    this.systemResources.labels('cpu', 'system').set(cpu.system);

    this.systemResources.labels('uptime', 'seconds').set(process.uptime());
  }

  /**
   * Sets the `subscription_metrics` gauge for one plan/status/metric-type
   * combination.
   */
  trackSubscriptionMetrics(
    plan: string,
    status: 'active' | 'canceled' | 'past_due' | 'trialing',
    metricType: 'count' | 'revenue',
    value: number,
  ): void {
    this.subscriptionMetrics.labels(plan, status, metricType).set(value);
  }

  // Utility Methods

  /**
   * Maps a batch item count to a size category: up to 10 is `'small'`, up to
   * 50 is `'medium'`, up to 200 is `'large'`, anything else is `'xl'`.
   */
  private getBatchSizeCategory(count: number): string {
    if (count <= 10) return 'small';
    if (count <= 50) return 'medium';
    if (count <= 200) return 'large';
    return 'xl';
  }

  // Registry and Export Methods

  /** Returns the registry's metrics as a string via `Registry.metrics()`. */
  getMetrics(): Promise<string> {
    return this.register.metrics();
  }

  /** Returns the registry's metrics as JSON objects. */
  getMetricsAsJSON(): Promise<promClient.metric[]> {
    return this.register.getMetricsAsJSON();
  }

  /** Returns the underlying registry instance. */
  getRegister(): promClient.Registry {
    return this.register;
  }

  /** Resets the values of every metric in the registry and logs the reset. */
  resetMetrics(): void {
    this.register.resetMetrics();
    this.logger.log('All metrics have been reset');
  }

  // Health Check Method for Metrics Service

  /**
   * Reports whether the registry looks usable.
   *
   * @returns `true` when the `process_cpu_user_seconds_total` metric (expected
   *   to be registered by `collectDefaultMetrics` in the constructor) can be
   *   found in the registry; `false` when it is missing or the lookup throws.
   */
  isHealthy(): boolean {
    try {
      // Basic sanity check - the registry exists and holds a default metric
      const defaultMetric = this.register.getSingleMetric(
        'process_cpu_user_seconds_total',
      );
      return !!defaultMetric;
    } catch (error) {
      // Pass a stack string rather than the raw caught value to the logger
      const details = error instanceof Error ? error.stack : String(error);
      this.logger.error('Metrics service health check failed', details);
      return false;
    }
  }
}