// Monolith — single Express app handles everything
app.use('/users', usersRouter);
app.use('/orders', ordersRouter);
app.use('/products', productsRouter);
app.use('/payments', paymentsRouter);
// Microservices — separate services
// user-service → localhost:3001
// order-service → localhost:3002
// product-service → localhost:3003
// payment-service → localhost:3004
Why it matters: Most teams start with a monolith and move to microservices as team size and scaling requirements grow; understanding the tradeoffs prevents premature decomposition, one of the most common architectural mistakes.
Real applications: Netflix, Uber, and Amazon evolved from monoliths to microservices as their engineering teams grew to hundreds; startups with fewer than 10 engineers almost always benefit from starting with a modular monolith instead.
Common mistakes: Decomposing a monolith too early, before clear service boundaries emerge, or too finely ("nano-services"); each service needs its own schema, deployment, and monitoring, and this overhead is only justified when the benefits outweigh the complexity.
const express = require('express');
const { createProxyMiddleware } = require('http-proxy-middleware');
const app = express();
// Route requests to microservices
app.use('/api/users', createProxyMiddleware({
  target: 'http://user-service:3001',
  changeOrigin: true
}));
app.use('/api/orders', createProxyMiddleware({
  target: 'http://order-service:3002',
  changeOrigin: true
}));
app.use('/api/products', createProxyMiddleware({
  target: 'http://product-service:3003',
  changeOrigin: true
}));
app.listen(3000); // Gateway listens on a single port
Why it matters: Without a gateway, every client must know the address of every microservice; adding a cross-cutting concern like auth or rate limiting requires changes to all services individually instead of once at the gateway.
Real applications: AWS API Gateway and Kong route requests to Lambda functions and internal services; the gateway enforces JWT auth, applies rate limits per plan tier, and logs all incoming requests without any service needing to implement these themselves.
Common mistakes: Putting too much business logic in the gateway — it should route, authenticate, and rate-limit, not aggregate data or make business decisions; fat gateways become a bottleneck and a single point of failure.
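To make the cross-cutting point concrete, here is a minimal sketch of gateway-level JWT enforcement; verifyToken is a placeholder for your JWT library of choice (e.g. jsonwebtoken's jwt.verify), and the middleware must be registered before the proxy routes so it runs first:
// Hypothetical sketch: auth enforced once at the gateway
app.use('/api', (req, res, next) => {
  const token = (req.headers.authorization || '').replace('Bearer ', '');
  if (!token) return res.status(401).json({ error: 'Missing token' });
  try {
    req.user = verifyToken(token); // Placeholder: throws if invalid or expired
    next(); // Authenticated, fall through to the proxy routes below
  } catch (err) {
    res.status(401).json({ error: 'Invalid token' });
  }
});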
// Simple registry pattern
class ServiceRegistry {
  constructor() { this.services = new Map(); }
  register(name, host, port) {
    if (!this.services.has(name)) this.services.set(name, []);
    const instances = this.services.get(name);
    const existing = instances.find(i => i.host === host && i.port === port);
    if (existing) existing.timestamp = Date.now(); // Refresh, don't duplicate
    else instances.push({ host, port, timestamp: Date.now() });
  }
  discover(name) {
    const instances = this.services.get(name) || [];
    // Random selection; a round-robin variant would rotate an index instead
    return instances[Math.floor(Math.random() * instances.length)];
  }
  heartbeat(name, host, port) {
    this.register(name, host, port); // Refreshes the instance's timestamp
  }
}
// Each service registers itself on startup
// GET /discover/:service returns available instances
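A minimal sketch of that HTTP surface, assuming an Express app wrapping the ServiceRegistry above (the endpoint shapes and port 4000 are illustrative, not a standard):
const express = require('express');
const registryApp = express();
const registry = new ServiceRegistry();
registryApp.use(express.json());
// Services POST here on startup and on every heartbeat
registryApp.post('/register', (req, res) => {
  const { name, host, port } = req.body;
  registry.register(name, host, port);
  res.status(204).end();
});
// Clients ask for one available instance of a service
registryApp.get('/discover/:service', (req, res) => {
  const instance = registry.discover(req.params.service);
  if (!instance) return res.status(404).json({ error: 'No instances available' });
  res.json(instance);
});
registryApp.listen(4000); // Illustrative port for the registry itself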
Why it matters: In containerized environments, service IPs change every deployment; hardcoded addresses cause flaky communication — service discovery makes inter-service communication resilient to scaling and restarts.
Real applications: Kubernetes Services provide built-in DNS-based discovery — http://order-service resolves to the current pod IP(s) automatically; Consul provides service discovery with health checks for non-Kubernetes environments.
Common mistakes: Caching discovered service addresses too aggressively without TTL — stale addresses cause connection failures after pod restarts; always respect the TTL from the registry and refresh on connection errors.
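One way to follow that advice on the caller side is a small TTL-bounded cache around discovery; this sketch assumes the /discover endpoint above, Node 18+ global fetch, and an illustrative 10-second TTL:
const cache = new Map(); // service name → { instance, expiresAt }
const TTL_MS = 10000; // Illustrative: re-resolve addresses every 10 seconds
async function lookupService(name) {
  const hit = cache.get(name);
  if (hit && hit.expiresAt > Date.now()) return hit.instance;
  const res = await fetch(`http://registry:4000/discover/${name}`); // Assumed registry host
  const instance = await res.json();
  cache.set(name, { instance, expiresAt: Date.now() + TTL_MS });
  return instance;
}
async function callService(name, path) {
  const { host, port } = await lookupService(name);
  try {
    return await fetch(`http://${host}:${port}${path}`);
  } catch (err) {
    cache.delete(name); // Connection error: evict and rediscover on the next call
    throw err;
  }
}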
A message broker such as RabbitMQ decouples services through durable queues; the amqplib package provides the Node.js client.
const amqp = require('amqplib');
// Producer — send order event
async function sendOrder(order) {
  // Demo setup: in production, reuse one connection/channel per process
  // rather than connecting for every message
  const conn = await amqp.connect('amqp://localhost');
  const channel = await conn.createChannel();
  const queue = 'order_queue';
  await channel.assertQueue(queue, { durable: true });
  channel.sendToQueue(queue, Buffer.from(JSON.stringify(order)), {
    persistent: true
  });
  console.log('Order sent:', order.id);
}
// Consumer — process orders
async function processOrders() {
  const conn = await amqp.connect('amqp://localhost');
  const channel = await conn.createChannel();
  const queue = 'order_queue';
  await channel.assertQueue(queue, { durable: true });
  channel.prefetch(1); // Process one message at a time
  channel.consume(queue, (msg) => {
    if (!msg) return; // The broker cancelled this consumer
    const order = JSON.parse(msg.content.toString());
    console.log('Processing order:', order.id);
    channel.ack(msg); // Acknowledge completion
  });
}
Why it matters: Message queues decouple producer and consumer lifecycles; if the email service is down when an order is placed, the message persists and is processed when the service recovers, preventing data loss.
Real applications: E-commerce order processing sends order events to RabbitMQ queues consumed by separate inventory, email, and analytics services; each consumer scales independently based on its own processing rate.
Common mistakes: Not acknowledging messages properly — if a consumer crashes before calling channel.ack(msg), the message is requeued; always acknowledge only after successful processing to guarantee at-least-once delivery.
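A sketch of that discipline applied to the consumer above: acknowledge only after the work succeeds, and nack on failure (handleOrder is a placeholder; whether to requeue or route to a dead-letter exchange is a design choice):
channel.consume(queue, async (msg) => {
  if (!msg) return; // The broker cancelled this consumer
  try {
    const order = JSON.parse(msg.content.toString());
    await handleOrder(order); // Placeholder for the real processing step
    channel.ack(msg); // Acknowledge only after success
  } catch (err) {
    // requeue: true retries the message; pair with a dead-letter
    // exchange in production to avoid infinite redelivery loops
    channel.nack(msg, false, true);
  }
});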
class CircuitBreaker {
  constructor(fn, { threshold = 5, timeout = 30000 } = {}) {
    this.fn = fn;
    this.state = 'CLOSED'; // CLOSED → OPEN → HALF_OPEN
    this.failures = 0;
    this.threshold = threshold;
    this.timeout = timeout;
  }
  async call(...args) {
    if (this.state === 'OPEN') {
      throw new Error('Circuit is OPEN — service unavailable');
    }
    try {
      const result = await this.fn(...args);
      this.onSuccess();
      return result;
    } catch (err) {
      this.onFailure();
      throw err;
    }
  }
  onSuccess() { this.failures = 0; this.state = 'CLOSED'; }
  onFailure() {
    this.failures++;
    if (this.failures >= this.threshold) {
      this.state = 'OPEN';
      setTimeout(() => { this.state = 'HALF_OPEN'; }, this.timeout);
    }
  }
}
const breaker = new CircuitBreaker(fetchUserService);
const user = await breaker.call(userId);
Why it matters: Without circuit breakers, a slow downstream service causes all callers to block on timeouts, quickly exhausting the thread/connection pool and taking down the entire application — the circuit breaker limits this blast radius.
Real applications: Payment service calls wrapped in a circuit breaker return a "payment temporarily unavailable" error immediately when the payment provider is down, keeping the rest of the checkout flow responsive.
Common mistakes: Setting the failure threshold too low (e.g., 1 failure opens the circuit) causes the circuit to trip on transient errors; a threshold of 5 failures within 60 seconds is more typical for production resilience.
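A sketch of that time-windowed variant: track failure timestamps instead of a bare counter and trip only when the threshold is hit within 60 seconds (numbers from the paragraph above; the rest of the breaker is unchanged):
// Drop-in replacement for onFailure with a sliding 60-second window
onFailure() {
  const now = Date.now();
  this.failureTimes = (this.failureTimes || []).filter(t => now - t < 60000);
  this.failureTimes.push(now);
  if (this.failureTimes.length >= this.threshold) { // e.g. 5 within 60s
    this.state = 'OPEN';
    setTimeout(() => { this.state = 'HALF_OPEN'; }, this.timeout);
  }
}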
const EventEmitter = require('events');
// Event bus (in production, use Redis Pub/Sub or Kafka)
class EventBus extends EventEmitter {}
const eventBus = new EventBus();
// Order service — publishes event
async function createOrder(order) {
  await saveToDatabase(order); // Persist first, then announce
  eventBus.emit('order.created', {
    orderId: order.id,
    userId: order.userId,
    total: order.total,
    timestamp: new Date().toISOString()
  });
}
// Email service — subscribes to event
eventBus.on('order.created', (event) => {
  sendConfirmationEmail(event.userId, event.orderId);
});
// Inventory service — subscribes to the same event
eventBus.on('order.created', (event) => {
  reserveInventory(event.orderId);
});
Why it matters: Event-driven architecture eliminates temporal coupling — the order service doesn't need the email service; it just publishes an event and any number of additional consumers (analytics, inventory, loyalty) can be added without touching the order service.
Real applications: An order placement event triggers simultaneous processing in inventory reservation, email confirmation, fraud detection, and analytics services — all subscribing independently to the same event stream.
Common mistakes: Using in-process EventEmitter for inter-service communication in production — it doesn't survive process restarts, lacks persistence, and doesn't work across multiple service instances; use RabbitMQ, Kafka, or Redis Pub/Sub instead.
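For comparison, roughly the same flow over Redis Pub/Sub with the node-redis (v4) client; note that plain Pub/Sub is fire-and-forget, so reach for Redis Streams, RabbitMQ, or Kafka when you need persistence (a sketch, assuming an async context):
const { createClient } = require('redis');
const publisher = createClient({ url: 'redis://localhost:6379' });
const subscriber = publisher.duplicate(); // Subscribing requires a dedicated connection
await publisher.connect();
await subscriber.connect();
// Email service process: subscribes across process boundaries
await subscriber.subscribe('order.created', (message) => {
  const event = JSON.parse(message);
  sendConfirmationEmail(event.userId, event.orderId);
});
// Order service process: publishes after saving
async function publishOrderCreated(order) {
  await publisher.publish('order.created', JSON.stringify({
    orderId: order.id,
    userId: order.userId
  }));
}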
A production-ready Node.js image copies package*.json first for layer caching, uses npm ci for deterministic installs, and runs as a non-root user.
# Dockerfile
FROM node:20-alpine
WORKDIR /app
# Install dependencies first (caching layer)
COPY package*.json ./
RUN npm ci --omit=dev
# Copy application code
COPY . .
EXPOSE 3000
# Run as non-root user
USER node
CMD ["node", "server.js"]
# docker-compose.yml
version: '3.8'
services:
  user-service:
    build: ./user-service
    ports: ["3001:3000"]
    environment:
      - DATABASE_URL=mongodb://mongo:27017/users
  order-service:
    build: ./order-service
    ports: ["3002:3000"]
  mongo:
    image: mongo:7
    volumes:
      - mongo-data:/data/db
volumes:
  mongo-data:
Why it matters: Docker eliminates "works on my machine" issues; every developer, CI pipeline, and production host runs the exact same image built from the same Dockerfile with the same package-lock.json.
Real applications: Node.js microservices each have their own Dockerfile; docker-compose spins up all services with their databases and message brokers locally; Kubernetes runs the same images in production.
Common mistakes: Running the container as root (the default) — if the process is compromised, an attacker gains root access to the host; always add USER node to run as a non-privileged user.
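A related detail worth pairing with COPY . .: a .dockerignore keeps local node_modules, logs, secrets, and VCS history out of the build context (a typical minimal example):
# .dockerignore
node_modules
npm-debug.log
.git
.env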
gRPC defines service contracts in .proto files from which both client and server code are auto-generated, ensuring type safety across service boundaries. gRPC also supports four communication patterns: unary, server streaming, client streaming, and bidirectional streaming.
// user.proto — define the service contract
syntax = "proto3";
service UserService {
  rpc GetUser (UserRequest) returns (UserResponse);
}
message UserRequest { string id = 1; }
message UserResponse { string id = 1; string name = 2; string email = 3; }
// Server
const grpc = require('@grpc/grpc-js');
const protoLoader = require('@grpc/proto-loader');
const packageDef = protoLoader.loadSync('user.proto');
const proto = grpc.loadPackageDefinition(packageDef);
const server = new grpc.Server();
server.addService(proto.UserService.service, {
  GetUser: (call, callback) => {
    callback(null, { id: call.request.id, name: 'Alice', email: 'alice@example.com' });
  }
});
server.bindAsync('0.0.0.0:50051', grpc.ServerCredentials.createInsecure(), () => {
  server.start();
});
Why it matters: gRPC's binary serialization and HTTP/2 multiplexing can be 5-10x faster than REST for high-frequency inter-service calls; the auto-generated strongly-typed clients also eliminate a class of runtime errors caused by API contract drift.
Real applications: Google, Netflix, and Square use gRPC for internal service-to-service communication where low latency and high throughput are critical; it's especially common in ML inference pipelines and real-time data pipelines.
Common mistakes: Using gRPC for public-facing APIs without a REST/gRPC transcoding layer; browsers don't support native gRPC — it is best used for internal service-to-service communication, with REST or GraphQL at the public edge.
// Order saga — orchestrator pattern
class OrderSaga {
  async execute(orderData) {
    let order; // Declared outside try so the catch block can reference it
    try {
      // Step 1: Create order
      order = await orderService.create(orderData);
      // Step 2: Reserve inventory
      await inventoryService.reserve(order.items);
      // Step 3: Process payment
      await paymentService.charge(order.userId, order.total);
      // Step 4: Confirm order
      await orderService.confirm(order.id);
    } catch (error) {
      if (!order) throw error; // Step 1 failed: nothing to roll back
      // Compensating transactions (rollback)
      await this.compensate(order, error);
    }
  }
  async compensate(order, error) {
    // Each compensation must be idempotent and safe to retry
    await paymentService.refund(order.id).catch(() => {});
    await inventoryService.release(order.items).catch(() => {});
    await orderService.cancel(order.id).catch(() => {});
    throw error;
  }
}
Why it matters: Distributed transactions (2PC) across microservices cause tight coupling, performance issues, and availability problems; sagas achieve business correctness through compensating actions without distributed locking.
Real applications: An e-commerce order placement saga coordinates inventory reservation, payment processing, and order confirmation; if payment fails, it compensates by releasing the inventory reservation automatically.
Common mistakes: Not making compensating actions idempotent — if a compensation step fails and is retried, running it twice must produce the same result; a double refund or double inventory release would be worse than the original failure.
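A sketch of making one compensation idempotent: record completed refunds keyed by order id so a retry becomes a no-op (the refunds collection and its unique index are illustrative assumptions):
// Idempotent refund: running it twice refunds once
async function refundOnce(orderId) {
  const existing = await db.refunds.findOne({ orderId });
  if (existing) return existing; // Already refunded, retry is a no-op
  const refund = await paymentService.refund(orderId);
  // A unique index on orderId guards against concurrent retries
  await db.refunds.insert({ orderId, refundId: refund.id });
  return refund;
}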
app.get('/health', (req, res) => {
  res.status(200).json({ status: 'ok', uptime: process.uptime() });
});
// Detailed readiness check
app.get('/health/ready', async (req, res) => {
  const checks = {};
  try {
    // Check database connection
    await db.query('SELECT 1');
    checks.database = 'ok';
  } catch (e) {
    checks.database = 'fail';
  }
  try {
    // Check Redis connection
    await redisClient.ping();
    checks.redis = 'ok';
  } catch (e) {
    checks.redis = 'fail';
  }
  const healthy = Object.values(checks).every(s => s === 'ok');
  res.status(healthy ? 200 : 503).json({
    status: healthy ? 'ok' : 'degraded',
    checks,
    timestamp: new Date().toISOString()
  });
});
Why it matters: Without health checks, Kubernetes has no way to know a pod is broken; it continues routing traffic to a pod that returns 500 errors or hangs indefinitely, silently degrading user experience.
Real applications: Kubernetes readiness probes prevent traffic from reaching a pod during its startup database connection phase; liveness probes restart pods that have entered a deadlock and stopped processing requests.
Common mistakes: Including external dependency checks in the liveness probe — if the database is slow, all pods fail their liveness check simultaneously, get restarted, and cause an outage worse than the original DB issue.
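The matching Kubernetes probe configuration might look like this sketch; the paths line up with the endpoints above, and the liveness probe deliberately hits the dependency-free /health (intervals are illustrative):
# Pod spec excerpt
livenessProbe:
  httpGet:
    path: /health          # No external dependencies: only restarts truly dead pods
    port: 3000
  initialDelaySeconds: 10
  periodSeconds: 15
readinessProbe:
  httpGet:
    path: /health/ready    # Checks DB/Redis: gates traffic, never restarts
    port: 3000
  periodSeconds: 5
  failureThreshold: 3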
const express = require('express');
const { createProxyMiddleware } = require('http-proxy-middleware');
const app = express();
// NEW: Migrated user service (microservice)
app.use('/api/users', createProxyMiddleware({
  target: 'http://user-service:3001',
  changeOrigin: true
}));
// NEW: Migrated product service (microservice)
app.use('/api/products', createProxyMiddleware({
  target: 'http://product-service:3002',
  changeOrigin: true
}));
// LEGACY: Everything else still goes to the monolith
app.use('/', createProxyMiddleware({
  target: 'http://monolith:3000',
  changeOrigin: true
}));
// Migration steps:
// 1. Identify bounded contexts in the monolith
// 2. Build new microservice for one context
// 3. Route traffic to new service via facade
// 4. Verify correctness (shadow traffic, canary)
// 5. Remove old code from monolith
// 6. Repeat for next context
Why it matters: Big-bang rewrites have a high failure rate; the strangler fig pattern de-risks migration by shipping incremental changes that can be validated and rolled back independently, keeping the business running throughout.
Real applications: Amazon migrated from a monolith to microservices over several years using this pattern — one team at a time extracted their bounded context behind a gateway while the monolith handled everything else.
Common mistakes: Not establishing a proper data migration strategy alongside code migration — if the new service shares the monolith's database, you haven't truly decoupled it; each extracted service needs its own data ownership.
const express = require('express');
// COMMAND side — handles writes
const commandRouter = express.Router();
commandRouter.post('/orders', async (req, res) => {
  const order = await db.orders.create(req.body);
  // Publish event for read model sync
  await eventBus.publish('order.created', {
    id: order.id,
    userId: order.userId,
    items: order.items,
    total: order.total,
    createdAt: new Date()
  });
  res.status(201).json({ id: order.id });
});
// QUERY side — handles reads (separate database/model)
const queryRouter = express.Router();
queryRouter.get('/orders/user/:userId', async (req, res) => {
  // Read from denormalized read model (optimized for this query)
  const orders = await readDb.userOrders.find({
    userId: req.params.userId
  });
  res.json(orders);
});
// Event handler — syncs read model
eventBus.subscribe('order.created', async (event) => {
  // Build denormalized view for fast reads
  await readDb.userOrders.upsert({
    orderId: event.id,
    userId: event.userId,
    summary: `Order #${event.id} - ${event.total}`,
    itemCount: event.items.length,
    createdAt: event.createdAt
  });
});
Why it matters: CQRS allows read and write sides to scale independently — a system with 100:1 read/write ratio can scale the query side (ElasticSearch, Redis) without scaling the expensive write database.
Real applications: Reporting dashboards use denormalized read models precomputed from event streams; e-commerce order history pages query an Elasticsearch index rebuilt from order events rather than a normalized relational schema.
Common mistakes: Applying CQRS to simple CRUD applications where the complexity isn't justified; the pattern adds significant overhead (event handlers, read model synchronization, eventual consistency) — only use it for read-heavy domains with complex query requirements.
// tracing.js — initialize tracing (run before app code)
const { NodeTracerProvider } = require('@opentelemetry/sdk-trace-node');
const { SimpleSpanProcessor } = require('@opentelemetry/sdk-trace-base');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http');
const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express');
const { registerInstrumentations } = require('@opentelemetry/instrumentation');
const provider = new NodeTracerProvider();
provider.addSpanProcessor(new SimpleSpanProcessor(
  new JaegerExporter({ endpoint: 'http://jaeger:14268/api/traces' })
));
provider.register();
registerInstrumentations({
  instrumentations: [
    new HttpInstrumentation(),
    new ExpressInstrumentation(),
  ],
});
// Custom span in business logic
const { trace } = require('@opentelemetry/api');
const tracer = trace.getTracer('order-service');
async function processOrder(order) {
  const span = tracer.startSpan('process-order');
  span.setAttribute('order.id', order.id);
  span.setAttribute('order.total', order.total);
  try {
    await validateOrder(order);
    span.addEvent('order validated');
    await chargePayment(order);
    span.addEvent('payment charged');
    span.setStatus({ code: 1 }); // SpanStatusCode.OK
  } catch (err) {
    span.setStatus({ code: 2, message: err.message }); // SpanStatusCode.ERROR
    throw err;
  } finally {
    span.end();
  }
}
Why it matters: When a request touches 8 services before returning an error, logs from individual services are insufficient to pinpoint the problem; distributed tracing shows exactly which service, at which timestamp, caused the failure.
Real applications: OpenTelemetry auto-instruments Express and HTTP clients, propagating trace context headers automatically; Jaeger UI visualizes the waterfall of spans showing latency and errors for every service in a request's path.
Common mistakes: Not adding custom span attributes like order.id or user.id to spans — without domain context, a trace shows timing data but doesn't help identify which specific entity caused the issue.
// Bulkhead with connection pools
class Bulkhead {
  constructor(name, maxConcurrent = 10) {
    this.name = name;
    this.maxConcurrent = maxConcurrent;
    this.active = 0;
    this.queue = [];
  }
  async execute(fn) {
    if (this.active >= this.maxConcurrent) {
      // Queue the request; reject if it waits too long
      return new Promise((resolve, reject) => {
        const entry = { fn, resolve, reject, timedOut: false };
        entry.timer = setTimeout(() => {
          entry.timedOut = true; // Mark so processQueue skips it
          reject(new Error(`Bulkhead ${this.name}: queue timeout`));
        }, 5000);
        this.queue.push(entry);
      });
    }
    this.active++;
    try {
      return await fn();
    } finally {
      this.active--;
      this.processQueue();
    }
  }
  processQueue() {
    while (this.queue.length > 0 && this.active < this.maxConcurrent) {
      const entry = this.queue.shift();
      if (entry.timedOut) continue; // Already rejected; don't waste a slot
      clearTimeout(entry.timer); // Dispatched: the wait timeout no longer applies
      this.execute(entry.fn).then(entry.resolve, entry.reject);
      break; // Dispatch one; the finally block above triggers the next
    }
  }
}
// Separate bulkheads per downstream service
const userBulkhead = new Bulkhead('user-service', 20);
const paymentBulkhead = new Bulkhead('payment-service', 10);
const emailBulkhead = new Bulkhead('email-service', 5);
// If payment-service is slow, only its 10 slots are consumed
// user-service and email-service continue with their own pools
// Checkout uses POST since it reads a request body and has side effects
app.post('/api/checkout', async (req, res) => {
  const user = await userBulkhead.execute(() => fetchUser(req.userId));
  const payment = await paymentBulkhead.execute(() => processPayment(req.body));
  await emailBulkhead.execute(() => sendReceipt(user.email));
  res.json({ success: true });
});
Why it matters: Without bulkheads, a payment service slowdown can exhaust all connections, making the user service and email service unavailable too — a partial failure cascades into a total outage.
Real applications: A checkout endpoint uses separate bulkheads for user, inventory, and payment calls; if payments are slow and fill their 10-slot pool, user and inventory calls continue unaffected with their own dedicated pools.
Common mistakes: Setting all bulkhead sizes to the same value regardless of the downstream service's throughput; a fast low-latency service needs fewer slots than a slow high-latency one — size each bulkhead based on measured latency and expected concurrency.
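A rule of thumb for that sizing comes from Little's law: needed concurrency ≈ arrival rate × average latency. For example, 50 requests/second against a 200 ms downstream occupies about 50 × 0.2 = 10 slots on average, so a bulkhead of 10 to 15 leaves headroom; the same traffic against a 20 ms service needs only 1 to 2.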
// SYNCHRONOUS — HTTP request-response
const axios = require('axios');
async function getOrderWithUser(orderId) {
  const order = await axios.get(`http://order-service:3002/orders/${orderId}`);
  // Synchronous call — blocks until response
  const user = await axios.get(`http://user-service:3001/users/${order.data.userId}`);
  return { ...order.data, user: user.data };
}
// ASYNCHRONOUS — event-driven via message queue
const amqp = require('amqplib');
// Order service publishes event (fire and forget)
async function createOrder(orderData) {
  const order = await db.orders.create(orderData);
  // Publish event — don't wait for consumers
  const channel = await getChannel();
  channel.publish('events', 'order.created',
    Buffer.from(JSON.stringify(order)),
    { persistent: true }
  );
  return order; // Return immediately
}
// Notification service consumes event independently
async function startNotificationConsumer() {
  const channel = await getChannel();
  await channel.assertQueue('notifications');
  await channel.bindQueue('notifications', 'events', 'order.created');
  channel.consume('notifications', async (msg) => {
    const order = JSON.parse(msg.content.toString());
    await sendOrderConfirmation(order);
    channel.ack(msg);
  });
}
// HYBRID — sync for queries, async for commands
// GET /orders/:id → synchronous (need immediate response)
// POST /orders → async (publish event, return 202 Accepted)
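A sketch of that hybrid write path: persist, publish, and return 202 Accepted with a pointer the client can poll (route shape and Location header are illustrative):
app.post('/orders', async (req, res) => {
  const order = await db.orders.create({ ...req.body, status: 'pending' });
  const channel = await getChannel();
  channel.publish('events', 'order.created',
    Buffer.from(JSON.stringify(order)),
    { persistent: true }
  );
  // 202: accepted for processing, not yet complete; client polls GET /orders/:id
  res.status(202).location(`/orders/${order.id}`).json({ id: order.id });
});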
Why it matters: Choosing the wrong communication pattern is a common design mistake — using synchronous HTTP for a long-running background operation blocks the client thread, while async events for a user-facing read query complicate the data access flow unnecessarily.
Real applications: A product search uses synchronous HTTP (user needs immediate results); an order placement uses asynchronous events (email, inventory, analytics can process independently after the order is created and the client receives 202 Accepted).
Common mistakes: Using synchronous HTTP for all inter-service communication in a complex workflow — a chain of 5 synchronous calls multiplies latency and creates a fault dependency chain where any one failure brings down the entire flow.