const winston = require('winston');

// Application-wide logger: structured JSON with timestamps so log
// aggregators (Datadog, CloudWatch, ELK) can parse every entry.
const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  ),
  transports: [
    // error.log receives only 'error'-level entries for quick triage
    new winston.transports.File({ filename: 'error.log', level: 'error' }),
    // combined.log receives everything at 'info' and above
    new winston.transports.File({ filename: 'combined.log' })
  ]
});
// In development, also log to console in a human-readable format
if (process.env.NODE_ENV !== 'production') {
  logger.add(new winston.transports.Console({
    format: winston.format.simple()
  }));
}
logger.info('Server started', { port: 3000 });
// FIX: the original logged `err.message` with no `err` in scope, which throws
// a ReferenceError. An error object must come from the failure itself —
// here via a rejection handler (connectToDatabase is an app-specific helper).
connectToDatabase().catch((err) => {
  logger.error('Database connection failed', { error: err.message });
});
Why it matters: In production, console.log is not enough — Winston provides level filtering, structured JSON output, file transports, and custom transports that ensure logs are usable in log aggregation tools.
Real applications: Express APIs use Winston to write structured JSON logs to files and stdout, enabling Datadog/CloudWatch agents to ingest and search them; error logs routed to a separate file allow quick incident triage.
Common mistakes: Using console.log in production loses log levels, lacks timestamps, and cannot be filtered or redirected; always set up Winston (or Pino) from the start of the project.
// Logger with winston's npm-style level hierarchy made explicit.
// Lower number = higher severity; `level` is the minimum severity emitted.
// NOTE(review): no transports are configured here — winston warns and drops
// messages without one; presumably omitted for brevity in this snippet.
const logger = winston.createLogger({
  levels: {
    error: 0, // Application errors that need immediate attention
    warn: 1, // Warning conditions that might cause issues
    info: 2, // General informational messages
    http: 3, // HTTP request logging
    verbose: 4, // Detailed informational messages
    debug: 5, // Debug information for development
    silly: 6 // Most verbose level
  },
  // LOG_LEVEL env var overrides; 'info' suppresses http/verbose/debug/silly
  level: process.env.LOG_LEVEL || 'info'
});
// Usage — orderId, err, user, and query are assumed to exist in the
// surrounding scope of a real handler; they are illustrative here.
logger.error('Payment processing failed', { orderId, error: err.message });
logger.warn('API rate limit approaching', { remaining: 10 });
logger.info('User registered', { userId: user.id });
logger.debug('Query executed', { sql: query, duration: '45ms' });
Why it matters: Incorrect level usage makes logs noisy or incomplete — too many errors cause alert fatigue, while insufficient logging hides real problems; consistent level discipline enables reliable alerting.
Real applications: Production services set LOG_LEVEL=info by default, enable debug only during incident investigation, and use warn for approaching rate limits or degraded states.
Common mistakes: Logging sensitive data (passwords, tokens, PII) at debug level — even if debug is off in production, these logs may appear in non-prod environments or during troubleshooting; always sanitize log fields.
const morgan = require('morgan');
const express = require('express');
const app = express();
// Predefined formats. NOTE(review): each app.use() below registers ANOTHER
// logger — as written, every request is logged by all of them. In a real app
// pick exactly one; they are stacked here only to show the options.
app.use(morgan('dev')); // Colored, concise for dev
app.use(morgan('combined')); // Apache combined format for production
app.use(morgan('tiny')); // Minimal output
// Custom format assembled from morgan tokens
app.use(morgan(':method :url :status :res[content-length] - :response-time ms'));
// Write to file ({ flags: 'a' } appends so restarts don't truncate the log)
const fs = require('fs');
const accessLog = fs.createWriteStream('access.log', { flags: 'a' });
app.use(morgan('combined', { stream: accessLog }));
// Integrate with Winston: morgan formats the line, winston stores it at 'http'
// level; trim() strips the trailing newline morgan appends.
app.use(morgan('combined', {
  stream: { write: (message) => logger.http(message.trim()) }
}));
Why it matters: HTTP request logs are essential for debugging slow endpoints, identifying error patterns, and auditing API usage; Morgan automates this without requiring manual logging in every route handler.
Real applications: REST APIs log every request with combined format in production and stream to a centralized log aggregator; security teams analyze Morgan logs for rate limit detection and abuse patterns.
Common mistakes: Logging health check endpoints (e.g., /health, /metrics) pollutes logs with hundreds of irrelevant entries per minute; always skip high-frequency internal endpoints with the skip option.
// BAD — unstructured plain text: no field can be filtered or aggregated on
console.log('User alice@example.com logged in from 192.168.1.1');
// GOOD — structured JSON: every field is independently searchable
logger.info('User login', {
  event: 'auth.login', // machine-readable event name used for querying
  userId: '12345',
  email: 'alice@example.com',
  ip: '192.168.1.1',
  userAgent: req.headers['user-agent'], // `req` comes from the enclosing route handler
  timestamp: new Date().toISOString()
});
// Output: {"level":"info","message":"User login","event":"auth.login",
// "userId":"12345","email":"alice@example.com","ip":"192.168.1.1",
// "timestamp":"2025-01-15T10:30:00.000Z"}
// Easily searchable in log aggregators
// e.g., search: event="auth.login" AND userId="12345"
Why it matters: Structured JSON logs enable powerful querying in Kibana, Datadog, and CloudWatch Insights — without structure you cannot filter by userId, correlate errors, or build meaningful dashboards.
Real applications: Auth services emit { event: 'auth.login', userId, ip, userAgent } structured events that security dashboards use to detect brute force attempts and anomalous login patterns.
Common mistakes: Using inconsistent field names across services (e.g., user_id vs userId vs uid) makes cross-service correlation impossible; define a shared logging schema across all services.
The debug module provides lightweight, namespaced diagnostic logging controlled by the DEBUG environment variable. It produces zero output unless explicitly enabled, making it perfect for library development and optional verbose logging. Namespaces use colon-separated conventions (e.g., app:db, app:http) and support wildcards for enabling multiple namespaces at once.
// app.js — namespaced diagnostics via the `debug` module.
// Every namespace is silent unless DEBUG enables it, so these calls
// cost almost nothing when disabled.
const debug = require('debug');

// One logger per subsystem, following the app:<area> convention
const logDb = debug('app:db');
const logHttp = debug('app:http');
const logAuth = debug('app:auth');

logDb('Connected to database'); // app:db Connected to database
logHttp('GET /api/users 200 45ms'); // app:http GET /api/users 200 45ms
logAuth('Token verified for user 1'); // app:auth Token verified for user 1

// Enable specific namespaces via env var:
// DEBUG=app:db,app:auth node app.js
// DEBUG=app:* node app.js (all app namespaces)
// DEBUG=* node app.js (everything)
Why it matters: The debug module is used internally by Express, Socket.IO, and hundreds of npm packages — knowing how to enable it unlocks deep diagnostics without reading source code.
Real applications: Library authors use debug to provide optional verbose output; developers run DEBUG=app:* node app.js locally to trace request flow across database, auth, and HTTP layers.
Common mistakes: Using the debug module for production-critical logging where low overhead and persistence matter; debug is for development diagnostics only — use Winston or Pino for production log storage.
// Basic liveness check — reports only process health, never external
// dependencies, so a DB outage cannot trigger a fleet-wide restart.
app.get('/health', (req, res) => {
  res.status(200).json({
    status: 'ok',
    uptime: process.uptime(),
    timestamp: new Date().toISOString()
  });
});
// Detailed readiness check with dependency verification.
// FIX: the original pinged the database and cache sequentially, making probe
// latency the SUM of all checks; Promise.allSettled runs them in parallel so
// latency is only the slowest single check, with identical results.
app.get('/health/ready', async (req, res) => {
  const health = { status: 'ok', checks: {} };
  let statusCode = 200;
  const [dbResult, cacheResult] = await Promise.allSettled([
    mongoose.connection.db.admin().ping(),
    redisClient.ping()
  ]);
  health.checks.database = dbResult.status === 'fulfilled' ? 'ok' : 'fail';
  health.checks.cache = cacheResult.status === 'fulfilled' ? 'ok' : 'fail';
  if (health.checks.database === 'fail' || health.checks.cache === 'fail') {
    health.status = 'degraded';
    statusCode = 503; // tells the orchestrator to stop routing traffic here
  }
  health.memory = process.memoryUsage();
  res.status(statusCode).json(health);
});
Why it matters: Kubernetes and ECS rely on liveness and readiness probes to restart unhealthy pods and stop routing traffic to pods that can't serve requests; without them, broken instances receive traffic silently.
Real applications: Kubernetes liveness probes ping /health every 10 seconds; readiness probes check /health/ready with database and cache pings before allowing traffic after a rolling deployment.
Common mistakes: Making the liveness probe check external dependencies (database ping) — if the DB goes down, all pods restart simultaneously causing a cascade failure; liveness should only check the process itself.
Use process.memoryUsage() and V8 heap statistics to monitor and profile memory consumption in your application. The key metrics are RSS (total memory allocated), heapTotal (V8 heap size), heapUsed (actual heap usage), and external (C++ object memory). Watch for steadily increasing heapUsed over time — it is the primary indicator of a memory leak.
// Basic memory usage snapshot — process.memoryUsage() returns bytes;
// converted to MB here for readability.
const mem = process.memoryUsage();
console.log({
  rss: `${(mem.rss / 1024 / 1024).toFixed(2)} MB`, // Total memory the OS has given the process
  heapTotal: `${(mem.heapTotal / 1024 / 1024).toFixed(2)} MB`, // V8 heap allocated
  heapUsed: `${(mem.heapUsed / 1024 / 1024).toFixed(2)} MB`, // V8 heap used — watch this for leaks
  external: `${(mem.external / 1024 / 1024).toFixed(2)} MB` // C++ objects (Buffers, native addons)
});
// Track memory over time — a steadily rising heapMB is the leak signal
setInterval(() => {
  const { heapUsed } = process.memoryUsage();
  logger.info('Memory', { heapMB: (heapUsed / 1024 / 1024).toFixed(2) });
}, 30000);
// Generate heap snapshot for debugging leaks (open the file in Chrome DevTools).
// NOTE(review): writeHeapSnapshot is synchronous, blocks the event loop, and
// writes a file roughly the size of the heap — trigger it on demand (e.g. via
// a signal handler), not unconditionally at startup as shown here.
const v8 = require('v8');
const fs = require('fs');
const snapshot = v8.writeHeapSnapshot();
console.log('Heap snapshot written to:', snapshot);
// Use --inspect flag for Chrome DevTools profiling
// node --inspect app.js
Why it matters: Memory leaks in Node.js accumulate over time and eventually cause out-of-memory crashes; proactive monitoring of heapUsed trends in Prometheus/Grafana enables detection before a crash.
Real applications: A Lambda or Express service that processes requests without leaking references keeps heapUsed flat; a WebSocket server accumulating listeners without cleanup shows a rising heap that restarts the pod daily.
Common mistakes: Only looking at RSS (which includes OS buffers and can be misleading) instead of heapUsed; also ignoring the external value which tracks memory held by Buffer and native addons outside the V8 heap.
// Method 1: Built-in profiler
// node --prof app.js
// node --prof-process isolate-0x*.log > profile.txt
// Method 2: V8 inspector (connect Chrome DevTools)
// node --inspect app.js
// Method 3: Programmatic measurement with perf_hooks
const { performance, PerformanceObserver } = require('perf_hooks');

// Log every completed measurement as it is recorded; the callback fires
// asynchronously after each performance.measure() call.
const measureObserver = new PerformanceObserver((list) => {
  for (const entry of list.getEntries()) {
    logger.info(`${entry.name}: ${entry.duration.toFixed(2)}ms`);
  }
});
measureObserver.observe({ entryTypes: ['measure'] });

// Bracket the code under test with named marks, then record the span
performance.mark('start');
// ... code to profile ...
performance.mark('end');
performance.measure('operation', 'start', 'end');

// Method 4: Clinic.js
// npx clinic doctor -- node app.js
// npx clinic flame -- node app.js
Why it matters: High CPU usage blocks the event loop and degrades response times for all concurrent users; profiling pinpoints the exact function(s) responsible so you optimize the right code.
Real applications: A report generation endpoint with slow JSON serialization appears as a hot flame in clinic flame; moving the operation to a worker thread or optimizing the serialization resolves the bottleneck.
Common mistakes: Profiling in development with a small dataset where the bottleneck doesn't manifest; always profile with production-representative data volumes to expose real performance issues.
const { performance, PerformanceObserver } = require('perf_hooks');
// Measure function execution time with a mark/measure pair
function timedOperation() {
  performance.mark('op-start');
  // ... expensive operation ...
  performance.mark('op-end');
  performance.measure('Operation', 'op-start', 'op-end');
}
// Observe measurements — the callback fires asynchronously as entries arrive
const obs = new PerformanceObserver((list) => {
  for (const entry of list.getEntries()) {
    console.log(`${entry.name}: ${entry.duration.toFixed(2)}ms`);
  }
});
// 'function' entries are produced by timerify-wrapped functions below
obs.observe({ entryTypes: ['measure', 'function'] });
// Wrap a function to auto-measure
// (myExpensiveFunction is assumed to be defined elsewhere in the app)
const wrapped = performance.timerify(myExpensiveFunction);
wrapped(); // Automatically recorded
// Monitor event loop delay — histogram values are nanoseconds; / 1e6 -> ms
const { monitorEventLoopDelay } = require('perf_hooks');
const h = monitorEventLoopDelay({ resolution: 20 });
h.enable();
setTimeout(() => {
  console.log('Event loop delay (ms):', {
    min: h.min / 1e6,
    max: h.max / 1e6,
    mean: h.mean / 1e6,
    p99: h.percentile(99) / 1e6
  });
}, 5000);
Why it matters: perf_hooks provides nanosecond-precision timing using monotonic clocks, unlike Date.now() which is susceptible to system clock adjustments; it is the correct way to measure code performance.
Real applications: Database query middleware uses marks/measures to log query durations; p95 and p99 latency buckets are fed into Prometheus histograms for SLA monitoring dashboards.
Common mistakes: Using Date.now() for sub-millisecond timing — it has only 1ms resolution and can drift; use performance.now() or process.hrtime.bigint() for accurate high-resolution timings.
const Sentry = require('@sentry/node');

// Initialize as early as possible so Sentry can instrument subsequent requires.
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  tracesSampleRate: 0.1 // sample 10% of transactions for performance monitoring
});
// Express integration
const express = require('express');
const app = express();
// Sentry request handler (must be the first middleware).
// NOTE(review): Handlers.requestHandler/errorHandler is the v7 SDK API;
// @sentry/node v8 replaces it with Sentry.setupExpressErrorHandler(app) —
// confirm against the installed SDK version.
app.use(Sentry.Handlers.requestHandler());
app.get('/api/data', async (req, res) => {
  // Errors are auto-captured by the error handler below.
  // NOTE(review): in Express 4, a rejection in an async handler reaches the
  // error handler only if forwarded with next(err) or an async wrapper.
  const data = await riskyOperation();
  res.json(data);
});
// Sentry error handler (after the routes, before any other error handlers)
app.use(Sentry.Handlers.errorHandler());
// Manual error capture.
// FIX: the original used `await` at the top level of a CommonJS file, which
// is a SyntaxError — `await` must live inside an async function.
async function chargeOrder(order) {
  try {
    await processPayment(order);
  } catch (err) {
    // Capture the exception itself plus a searchable message with context
    Sentry.captureException(err);
    Sentry.captureMessage('Payment failed', { extra: { orderId: order.id } });
  }
}
Why it matters: Unhandled production errors are invisible without a tracking service; Sentry's error grouping, frequency tracking, and release associations let you prioritize and ship fixes with confidence.
Real applications: SaaS products use Sentry to notify engineering within seconds of a new error affecting users; release tracking instantly shows whether a new deployment introduced regressions.
Common mistakes: Not setting a tracesSampleRate in high-traffic applications — capturing 100% of transactions creates excessive overhead and compute cost; use 1-10% sampling in production.
const { randomUUID } = require('crypto');
const { AsyncLocalStorage } = require('async_hooks');

// Async context carrying request-scoped data (the correlation ID) across
// every await point in the request's call tree.
const asyncLocalStorage = new AsyncLocalStorage();

// Middleware: reuse the caller's correlation ID when present so the ID stays
// stable across services; only mint a new one at the edge.
function correlationMiddleware(req, res, next) {
  const correlationId = req.headers['x-correlation-id'] || randomUUID();
  req.correlationId = correlationId;
  res.set('x-correlation-id', correlationId);
  // Store in async context for access anywhere downstream of this request
  asyncLocalStorage.run({ correlationId }, () => next());
}
app.use(correlationMiddleware);

// Logger that auto-includes the correlation ID.
// FIX: the original omitted `timestamp` from error-level entries, giving the
// two levels inconsistent schemas — the exact cross-service correlation
// problem structured logging is meant to prevent. Both levels now share one
// emit path with an identical field set.
function createLogger() {
  // level: log level string; message: human-readable text; meta: extra fields
  const emit = (level, message, meta) => {
    const store = asyncLocalStorage.getStore();
    console.log(JSON.stringify({
      level,
      message,
      correlationId: store?.correlationId, // undefined outside a request context
      timestamp: new Date().toISOString(),
      ...meta
    }));
  };
  return {
    info: (message, meta = {}) => emit('info', message, meta),
    error: (message, meta = {}) => emit('error', message, meta)
  };
}

// Pass the SAME correlation ID to downstream services so their logs join ours
async function callDownstreamService(url) {
  const store = asyncLocalStorage.getStore();
  return fetch(url, {
    headers: { 'x-correlation-id': store?.correlationId }
  });
}
Why it matters: Without correlation IDs, tracing a failing request across 5 microservices requires manually correlating timestamps and IP addresses; with IDs you search once and see the full request journey instantly.
Real applications: API gateways (Kong, AWS API Gateway) inject trace IDs into incoming requests; every downstream service includes this ID in its logs, enabling end-to-end request tracing in Kibana or Datadog.
Common mistakes: Generating a new correlation ID per service instead of propagating the original one from the upstream caller; always check for an existing x-correlation-id header first and only generate a new one if absent.
The prom-client package provides Node.js instrumentation with built-in collectors for default metrics and support for custom counters, gauges, and histograms. Expose a /metrics endpoint that Prometheus scrapes at regular intervals to collect application metrics.
const express = require('express');
const client = require('prom-client');
const app = express();
// Collect default metrics (CPU, memory, event loop, GC) under a node_ prefix
client.collectDefaultMetrics({ prefix: 'node_' });
// Custom counter — monotonically increasing total of HTTP requests
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'] // keep labels low-cardinality
});
// Custom histogram — request duration, bucketed for percentile queries
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5] // seconds; tune to your SLOs
});
// Custom gauge — a value that moves both ways (currently open requests)
const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections'
});
// Middleware to record metrics for every request
app.use((req, res, next) => {
  const end = httpRequestDuration.startTimer(); // returns a stop function
  activeConnections.inc();
  res.on('finish', () => {
    // Prefer the matched route pattern (e.g. /users/:id) over the raw URL
    // path so label cardinality stays bounded
    const route = req.route?.path || req.path;
    httpRequestsTotal.inc({ method: req.method, route, status: res.statusCode });
    end({ method: req.method, route, status: res.statusCode });
    activeConnections.dec();
  });
  next();
});
// Expose metrics endpoint for Prometheus scraping
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', client.register.contentType);
  res.end(await client.register.metrics());
});
Why it matters: Prometheus metrics power SRE practices like SLOs and error budgets; p99 latency histograms, request counters, and active connection gauges provide the data needed to detect and diagnose production issues.
Real applications: Express APIs expose /metrics for Prometheus to scrape every 15 seconds; Grafana dashboards show p95/p99 request latency, error rates, and heap-memory plots per service and route.
Common mistakes: Using high-cardinality labels (e.g., userId, orderId) on a metric — each unique label combination creates a new time series, causing Prometheus memory exhaustion; only use low-cardinality labels like route and method.
const winston = require('winston');
// Side-effect import: registers the DailyRotateFile transport on winston
require('winston-daily-rotate-file');
// Rotate by date and size — whichever limit is hit first
const transport = new winston.transports.DailyRotateFile({
  filename: 'logs/app-%DATE%.log', // %DATE% is substituted using datePattern
  datePattern: 'YYYY-MM-DD',
  maxSize: '20m', // Rotate when file exceeds 20MB
  maxFiles: '14d', // Keep logs for 14 days
  zippedArchive: true, // Compress rotated files
  level: 'info'
});
// Error-specific rotation — retained longer (30d) for incident forensics
const errorTransport = new winston.transports.DailyRotateFile({
  filename: 'logs/error-%DATE%.log',
  datePattern: 'YYYY-MM-DD',
  maxSize: '10m',
  maxFiles: '30d',
  zippedArchive: true,
  level: 'error'
});
// Structured JSON logger writing through both rotating transports
const logger = winston.createLogger({
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.json()
  ),
  transports: [transport, errorTransport]
});
// Listen for rotation events (useful for shipping archives off the host)
transport.on('rotate', (oldFilename, newFilename) => {
  logger.info('Log rotated', { oldFilename, newFilename });
});
// Alternative: OS-level rotation with logrotate (Linux)
// /etc/logrotate.d/nodeapp:
// /var/log/nodeapp/*.log {
//   daily
//   rotate 14
//   compress
//   delaycompress
//   missingok
//   notifempty
//   copytruncate
// }
Why it matters: A server without log rotation that writes 50MB/day will fill a 30GB disk in less than two years; disk exhaustion prevents new log writes and can cause the entire application to crash on write errors.
Real applications: Production Node.js APIs configure daily rotation with 14-day retention and gzip compression; ops teams monitor disk usage and set alerts when log volume spikes due to error bursts.
Common mistakes: Writing log files in containerized environments instead of stdout/stderr; containers are ephemeral and their filesystems are lost on restart — always log to stdout and let the container platform (Docker, K8s) handle log collection.
const pino = require('pino');
// Basic setup: pretty-print in development only; raw JSON in production
// (pino-pretty disables pino's performance optimizations — dev-only).
const logger = pino({
  level: process.env.LOG_LEVEL || 'info',
  transport: process.env.NODE_ENV !== 'production'
    ? { target: 'pino-pretty', options: { colorize: true } }
    : undefined
});
// Usage — same idea as Winston, but the context object comes FIRST in pino
logger.info('Server started');
logger.info({ port: 3000, env: 'production' }, 'Server started');
// err and userId are assumed to exist in the surrounding handler scope
logger.error({ err, userId }, 'Payment failed');
// Child loggers — bind context once; every entry inherits it
// (`req` comes from the enclosing route handler)
const requestLogger = logger.child({ requestId: req.id });
requestLogger.info('Processing request'); // Includes requestId
// Express integration with pino-http
const pinoHttp = require('pino-http');
app.use(pinoHttp({ logger }));
// Benchmark comparison (logs per second):
// pino: ~150,000 ops/sec
// winston: ~ 8,000 ops/sec
// bunyan: ~ 6,000 ops/sec
Why it matters: At 10,000 requests/second, slow synchronous logging in the hot path can add measurable latency; Pino's worker-thread approach keeps logging overhead below 1% of total request time.
Real applications: Fastify, the high-performance Node.js framework, uses Pino as its built-in logger; high-traffic APIs processing millions of requests per day use Pino to avoid logging becoming a performance bottleneck.
Common mistakes: Using pino-pretty in production — it is a development-only formatter that disables Pino's performance optimizations; in production, pipe raw JSON output to your log aggregator directly.
const { monitorEventLoopDelay } = require('perf_hooks');
// Sample event loop delay with 20ms resolution; histogram values are in ns.
const histogram = monitorEventLoopDelay({ resolution: 20 });
histogram.enable();
// Periodic check — report metrics every 30 seconds
setInterval(() => {
  const stats = {
    min: (histogram.min / 1e6).toFixed(2),
    max: (histogram.max / 1e6).toFixed(2),
    mean: (histogram.mean / 1e6).toFixed(2),
    p50: (histogram.percentile(50) / 1e6).toFixed(2),
    p99: (histogram.percentile(99) / 1e6).toFixed(2),
    // FIX: histogram.exceeds is a COUNT of samples that fell outside the
    // histogram's range, not a duration — the original divided it by 1e6
    // and labeled it milliseconds, which is meaningless. Report the raw count.
    exceeds: histogram.exceeds
  };
  logger.info('Event loop stats (ms)', stats);
  // Alert on high lag
  if (parseFloat(stats.p99) > 100) {
    logger.warn('High event loop lag detected', stats);
  }
  histogram.reset(); // start a fresh window for the next interval
}, 30000);
// Simple lag detection: schedule a 1ms timer and measure how much later
// than 1ms it actually fires — the overshoot is the event loop lag.
function checkEventLoopLag() {
  const start = process.hrtime.bigint();
  setTimeout(() => {
    const lag = Number(process.hrtime.bigint() - start) / 1e6 - 1;
    if (lag > 50) {
      logger.warn(`Event loop lag: ${lag.toFixed(2)}ms`);
    }
    checkEventLoopLag();
  }, 1);
}
checkEventLoopLag();
// Expose as a Prometheus gauge (mean delay, converted ns -> seconds)
const eventLoopLag = new client.Gauge({
  name: 'nodejs_eventloop_lag_seconds',
  help: 'Event loop lag in seconds'
});
setInterval(() => {
  eventLoopLag.set(histogram.mean / 1e9);
}, 10000);
Why it matters: Event loop lag is the single most important health signal for a Node.js service — it measures whether the process can serve requests promptly; everything else (memory, CPU) is secondary.
Real applications: Prometheus gauges export nodejs_eventloop_lag_seconds per pod; Grafana alerts fire when p99 exceeds 100ms, triggering auto-scaling or investigation of a blocking CPU task.
Common mistakes: Not monitoring event loop lag and only watching CPU% — the event loop can be severely blocked by a single synchronous operation at low average CPU, making CPU metrics alone insufficient for Node.js health monitoring.