WebSockets in Production: Handling Disconnects and Scaling

TL;DR

WebSockets enable real-time bidirectional communication. Handle reconnection with exponential backoff. Use heartbeats to detect dead connections. Scale with Redis pub/sub. Store connection state. Close connections properly.

Our chat app worked perfectly in development. In production, connections dropped randomly. Users saw "disconnected" every few minutes. Mobile connections were worse. I added automatic reconnection with exponential backoff and heartbeats. Connection stability went from 60% to 99%.

Here's how to build WebSockets that work in production, not just on localhost.

What Are WebSockets?

WebSockets provide real-time, bidirectional communication:

// HTTP (request/response)
const data = await fetch('/api/messages');  // Client requests
// Server responds
// Connection closes

// WebSocket (persistent connection)
const ws = new WebSocket('ws://localhost:3000');
ws.onmessage = (event) => {
    console.log('Server sent:', event.data);  // Server pushes data anytime
};
// send() throws while the socket is still CONNECTING, so wait for the open event
ws.onopen = () => {
    ws.send('Hello');  // Client sends anytime once the connection is open
};
// Connection stays open

Use cases:

  • Chat applications
  • Live notifications
  • Real-time dashboards
  • Multiplayer games
  • Collaborative editing
  • Stock tickers
  • Live sports scores

Basic Server (Node.js)

const WebSocket = require('ws');

// Echo server: every frame a client sends is returned to that same client.
const wss = new WebSocket.Server({ port: 3000 });

/** Attaches the message/close/error listeners to a newly accepted socket. */
function handleConnection(socket) {
    console.log('Client connected');

    socket.on('message', (payload) => {
        console.log('Received:', payload);
        socket.send(`Echo: ${payload}`);
    });
    socket.on('close', () => console.log('Client disconnected'));
    socket.on('error', (err) => console.error('WebSocket error:', err));
}

wss.on('connection', handleConnection);

console.log('WebSocket server running on ws://localhost:3000');

Basic Client

const ws = new WebSocket('ws://localhost:3000');

// Install all four lifecycle callbacks in one step.
Object.assign(ws, {
    onopen() {
        console.log('Connected');
        ws.send('Hello Server!');
    },
    onmessage({ data }) {
        console.log('Received:', data);
    },
    onclose() {
        console.log('Disconnected');
    },
    onerror(err) {
        console.error('WebSocket error:', err);
    },
});

The Disconnection Problem

WebSocket connections drop frequently:

// Connections drop because:
- User's WiFi disconnects
- Mobile switches from WiFi to cellular
- Server restarts
- Load balancer timeout
- Network proxy closes idle connections
- Browser tab goes to sleep
- Phone screen turns off

Without reconnection logic, users stay disconnected.

Automatic Reconnection

/**
 * WebSocket wrapper that re-opens the connection automatically, waiting
 * 1s, 2s, 4s, ... (capped at 30s) between attempts.
 *
 * Assign `onMessage` to receive incoming payloads. Calling `close()` shuts the
 * connection down for good: no further reconnect attempts are made until
 * `connect()` is called again.
 */
class ReconnectingWebSocket {
    /** @param {string} url - WebSocket endpoint, e.g. 'ws://localhost:3000'. */
    constructor(url) {
        this.url = url;
        this.ws = null;
        this.reconnectDelay = 1000;  // Start at 1 second
        this.maxReconnectDelay = 30000;  // Max 30 seconds
        this.reconnectAttempts = 0;
        this.reconnectTimer = null;  // Handle of the pending reconnect, if any
        this.closedByUser = false;   // Set by close() to suppress auto-reconnect

        this.connect();
    }

    /** Opens a fresh socket and wires its lifecycle callbacks. */
    connect() {
        this.closedByUser = false;
        const socket = new WebSocket(this.url);
        this.ws = socket;

        socket.onopen = () => {
            console.log('Connected');
            this.reconnectDelay = 1000;  // Reset delay
            this.reconnectAttempts = 0;
        };

        socket.onmessage = (event) => {
            this.onMessage(event.data);
        };

        socket.onclose = () => {
            // A socket that has already been replaced must not trigger a reconnect
            if (this.ws !== socket) {
                return;
            }
            if (this.closedByUser) {
                console.log('Disconnected');
                return;
            }
            console.log('Disconnected, reconnecting...');
            this.reconnect();
        };

        socket.onerror = (error) => {
            console.error('WebSocket error:', error);
        };
    }

    /** Schedules one reconnect attempt and doubles the delay for the next one. */
    reconnect() {
        // Never stack several timers for the same outage
        if (this.reconnectTimer !== null) {
            return;
        }

        this.reconnectAttempts++;

        this.reconnectTimer = setTimeout(() => {
            this.reconnectTimer = null;
            console.log(`Reconnect attempt ${this.reconnectAttempts}`);
            this.connect();
        }, this.reconnectDelay);

        // Exponential backoff
        this.reconnectDelay = Math.min(
            this.reconnectDelay * 2,
            this.maxReconnectDelay
        );
    }

    /**
     * Sends data if the socket is open; otherwise logs an error and drops it.
     * @param {*} data - Payload handed to WebSocket#send unchanged.
     */
    send(data) {
        if (this.ws !== null && this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(data);
        } else {
            console.error('WebSocket not open');
        }
    }

    /** Incoming-message hook; replace it to handle messages. */
    onMessage(data) {
        // Override this method
        console.log('Received:', data);
    }

    /** Closes the connection permanently and cancels any pending reconnect. */
    close() {
        this.closedByUser = true;
        if (this.reconnectTimer !== null) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
        if (this.ws !== null) {
            this.ws.close();
        }
    }
}

// Usage: create the client, then swap in an application-specific message hook
const ws = new ReconnectingWebSocket('ws://localhost:3000');
ws.onMessage = function handleIncoming(data) {
    console.log('Message:', data);
};
// NOTE: the connection is normally still being established at this point,
// in which case send() takes its "WebSocket not open" branch
ws.send('Hello');

Reconnection delays:

Attempt 1: 1 second
Attempt 2: 2 seconds
Attempt 3: 4 seconds
Attempt 4: 8 seconds
Attempt 5: 16 seconds
Attempt 6: 30 seconds (capped)

Heartbeats (Ping/Pong)

Detect dead connections before they time out:

Server-Side

const WebSocket = require('ws');

const HEARTBEAT_INTERVAL_MS = 30000;
const wss = new WebSocket.Server({ port: 3000 });

/** 'pong' listener: `this` is the socket that answered, so mark it alive. */
function heartbeat() {
    this.isAlive = true;
}

wss.on('connection', (ws) => {
    ws.isAlive = true;
    ws.on('pong', heartbeat);
    ws.on('message', (message) => console.log('Received:', message));
});

// Every 30 seconds: drop sockets that never answered the previous ping,
// then clear the flag and ping the rest again
const interval = setInterval(() => {
    for (const ws of wss.clients) {
        if (ws.isAlive === false) {
            console.log('Terminating dead connection');
            ws.terminate();
            continue;
        }

        ws.isAlive = false;
        ws.ping();
    }
}, HEARTBEAT_INTERVAL_MS);

wss.on('close', () => clearInterval(interval));

Client-Side

/**
 * WebSocket client that sends an application-level 'ping' text frame every
 * 30 seconds while connected and reconnects one second after a disconnect.
 * 'pong' replies are consumed internally; everything else goes to `onMessage`.
 */
class WebSocketWithHeartbeat {
    /** @param {string} url - WebSocket endpoint to connect to. */
    constructor(url) {
        this.url = url;
        this.ws = null;
        this.pingInterval = null;
        this.connect();
    }

    /** Opens a new socket and wires its lifecycle callbacks. */
    connect() {
        this.ws = new WebSocket(this.url);

        this.ws.onopen = () => {
            console.log('Connected');
            this.startHeartbeat();
        };

        this.ws.onmessage = (event) => {
            if (event.data === 'pong') {
                // Received pong response
                return;
            }
            this.onMessage(event.data);
        };

        this.ws.onclose = () => {
            console.log('Disconnected');
            this.stopHeartbeat();
            this.reconnect();
        };

        this.ws.onerror = (error) => {
            console.error('WebSocket error:', error);
        };
    }

    /** Schedules a new connection attempt one second from now. */
    reconnect() {
        setTimeout(() => this.connect(), 1000);
    }

    /** Starts the 30-second ping timer, replacing any timer already running. */
    startHeartbeat() {
        // Guard against two intervals running for the same client
        this.stopHeartbeat();
        this.pingInterval = setInterval(() => {
            if (this.ws.readyState === WebSocket.OPEN) {
                this.ws.send('ping');
            }
        }, 30000);  // Every 30 seconds
    }

    /** Stops the ping timer if one is running. */
    stopHeartbeat() {
        if (this.pingInterval !== null) {
            clearInterval(this.pingInterval);
            this.pingInterval = null;
        }
    }

    /**
     * Sends data when the socket is open; silently drops it otherwise.
     * @param {*} data - Payload handed to WebSocket#send unchanged.
     */
    send(data) {
        if (this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(data);
        }
    }

    /** Incoming-message hook; replace it to handle messages. */
    onMessage(data) {
        console.log('Received:', data);
    }
}

Message Queue for Offline Messages

/**
 * WebSocket client that queues outgoing messages while disconnected and
 * delivers them, in order, once the connection is open again. Reconnects one
 * second after every disconnect.
 */
class RobustWebSocket {
    /** @param {string} url - WebSocket endpoint to connect to. */
    constructor(url) {
        this.url = url;
        this.ws = null;
        this.messageQueue = [];
        this.connect();
    }

    /** Opens a new socket; queued messages are flushed as soon as it opens. */
    connect() {
        this.ws = new WebSocket(this.url);

        this.ws.onopen = () => {
            console.log('Connected');
            this.flushQueue();
        };

        this.ws.onmessage = (event) => {
            this.onMessage(event.data);
        };

        this.ws.onclose = () => {
            console.log('Disconnected');
            this.reconnect();
        };
    }

    /**
     * Sends data immediately when connected, otherwise queues it for later.
     * @param {*} data - Payload handed to WebSocket#send unchanged.
     */
    send(data) {
        if (this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(data);
        } else {
            console.log('Queuing message');
            this.messageQueue.push(data);
        }
    }

    /**
     * Sends queued messages oldest-first. Stops as soon as the socket is no
     * longer open so the remaining messages stay queued for the next
     * connection instead of being lost.
     */
    flushQueue() {
        console.log(`Flushing ${this.messageQueue.length} queued messages`);
        while (this.messageQueue.length > 0 && this.ws.readyState === WebSocket.OPEN) {
            // Dequeue only after send() returned, so a throwing send keeps the message
            this.ws.send(this.messageQueue[0]);
            this.messageQueue.shift();
        }
    }

    /** Schedules a new connection attempt one second from now. */
    reconnect() {
        setTimeout(() => this.connect(), 1000);
    }

    /** Incoming-message hook; replace it to handle messages. */
    onMessage(data) {
        console.log('Received:', data);
    }
}

Authentication

// Server
const WebSocket = require('ws');
const jwt = require('jsonwebtoken');

const wss = new WebSocket.Server({ port: 3000 });

wss.on('connection', (ws, req) => {
    // The JWT arrives as the `token` query parameter of the upgrade request
    const { searchParams } = new URL(req.url, 'ws://localhost');
    const token = searchParams.get('token');

    try {
        const { id } = jwt.verify(token, process.env.JWT_SECRET);
        ws.userId = id;
        console.log(`User ${id} connected`);
    } catch (err) {
        // 1008 = policy violation; reject the socket before any listener is attached
        ws.close(1008, 'Invalid token');
        return;
    }

    ws.on('message', (message) => console.log(`User ${ws.userId} sent:`, message));
});

// Client
const token = 'your-jwt-token';
// Build the URL with the URL API so the token is percent-encoded correctly
const url = new URL('ws://localhost:3000');
url.searchParams.set('token', token);
const ws = new WebSocket(url.href);

Scaling with Redis Pub/Sub

Run multiple server instances that share messages through Redis pub/sub:

// server.js
const WebSocket = require('ws');
const Redis = require('ioredis');

const wss = new WebSocket.Server({ port: 3000 });
// Two Redis clients: one publishes, the other is dedicated to the subscription
const redis = new Redis();
const redisSub = new Redis();

// Store all connections (connectionId -> socket) held by THIS server instance
const connections = new Map();

// Subscribe to Redis channel
redisSub.subscribe('chat', (err) => {
    if (err) console.error('Subscribe error:', err);
});

// When message received from Redis, broadcast to all local connections
redisSub.on('message', (channel, message) => {
    let payload;
    try {
        // Round-trip through JSON so only well-formed messages reach clients
        payload = JSON.stringify(JSON.parse(message));
    } catch (err) {
        console.error('Dropping malformed Redis message:', err);
        return;
    }

    connections.forEach((ws) => {
        if (ws.readyState === WebSocket.OPEN) {
            ws.send(payload);
        }
    });
});

wss.on('connection', (ws, req) => {
    const connectionId = Math.random().toString(36);
    connections.set(connectionId, ws);

    ws.on('message', (message) => {
        // Client input is untrusted: a malformed frame must not crash the server
        try {
            const data = JSON.parse(message);

            // Publish to Redis (all servers will receive)
            redis.publish('chat', JSON.stringify({
                user: ws.userId,
                message: data.message,
                timestamp: Date.now()
            })).catch((err) => {
                console.error('Publish error:', err);
            });
        } catch (err) {
            console.error('Message handling error:', err);
        }
    });

    ws.on('close', () => {
        connections.delete(connectionId);
    });
});

console.log('WebSocket server running on port 3000');
// Run multiple instances behind load balancer
// All instances share messages via Redis

Room-Based Messaging

const WebSocket = require('ws');
const wss = new WebSocket.Server({ port: 3000 });

// roomId -> Set of sockets currently in that room
const rooms = new Map();

wss.on('connection', (ws) => {
    // Rooms this socket has joined, so they can all be left on disconnect
    ws.rooms = new Set();

    ws.on('message', (data) => {
        // Client input is untrusted: a malformed frame must not crash the server
        let message;
        try {
            message = JSON.parse(data);
        } catch (err) {
            console.error('Ignoring malformed message:', err);
            return;
        }
        if (message === null || typeof message !== 'object') {
            return;
        }

        if (message.type === 'join') {
            joinRoom(ws, message.room);
        } else if (message.type === 'leave') {
            leaveRoom(ws, message.room);
        } else if (message.type === 'message') {
            broadcastToRoom(message.room, message.data, ws);
        }
    });

    ws.on('close', () => {
        ws.rooms.forEach((room) => leaveRoom(ws, room));
    });
});

/**
 * Adds a socket to a room, creating the room on first use.
 * @param {object} ws - Client socket; its `rooms` Set records the membership.
 * @param {*} roomId - Key of the room in the `rooms` map.
 */
function joinRoom(ws, roomId) {
    let members = rooms.get(roomId);
    if (members === undefined) {
        members = new Set();
        rooms.set(roomId, members);
    }

    members.add(ws);
    ws.rooms.add(roomId);

    console.log(`Client joined room ${roomId}`);
}

/**
 * Removes a socket from a room and discards the room once it is empty.
 * @param {object} ws - Client socket; `roomId` is also removed from its `rooms` Set.
 * @param {*} roomId - Key of the room in the `rooms` map.
 */
function leaveRoom(ws, roomId) {
    const members = rooms.get(roomId);
    if (members !== undefined) {
        members.delete(ws);

        // Empty rooms are dropped so the map does not grow without bound
        if (members.size === 0) {
            rooms.delete(roomId);
        }
    }

    ws.rooms.delete(roomId);
    console.log(`Client left room ${roomId}`);
}

/**
 * Sends `data` (JSON-encoded) to every open socket in a room except the sender.
 * Does nothing when the room does not exist.
 */
function broadcastToRoom(roomId, data, sender) {
    const members = rooms.get(roomId);
    if (!members) return;

    for (const client of members) {
        if (client === sender || client.readyState !== WebSocket.OPEN) {
            continue;
        }
        client.send(JSON.stringify(data));
    }
}

Real-World Example: Chat Application

Server

const WebSocket = require('ws');
const jwt = require('jsonwebtoken');

const wss = new WebSocket.Server({ port: 3000 });
const connections = new Map();
const rooms = new Map();

// Maps each client message type to the function that services it;
// unknown types have no entry and are ignored.
const messageHandlers = new Map([
    ['join_room', (ws, message) => joinRoom(ws, message.roomId)],
    ['leave_room', (ws, message) => leaveRoom(ws, message.roomId)],
    ['chat_message', (ws, message) => handleChatMessage(ws, message)],
    ['ping', (ws) => ws.send(JSON.stringify({ type: 'pong' }))],
]);

wss.on('connection', async (ws, req) => {
    // Authenticate: the JWT is the `token` query parameter of the upgrade request
    const { searchParams } = new URL(req.url, 'ws://localhost');
    const token = searchParams.get('token');

    try {
        const { id, username } = jwt.verify(token, process.env.JWT_SECRET);
        ws.userId = id;
        ws.username = username;
    } catch (err) {
        ws.close(1008, 'Authentication failed');
        return;
    }

    const connectionId = Math.random().toString(36);
    connections.set(connectionId, ws);
    ws.rooms = new Set();

    // Tell the client who it is authenticated as
    const { userId, username } = ws;
    ws.send(JSON.stringify({ type: 'connected', userId, username }));

    ws.on('message', (data) => {
        try {
            const message = JSON.parse(data);
            const handle = messageHandlers.get(message.type);
            if (handle) {
                handle(ws, message);
            }
        } catch (err) {
            console.error('Message handling error:', err);
        }
    });

    ws.on('close', () => {
        // Leave every room first so the remaining members are notified
        ws.rooms.forEach((roomId) => leaveRoom(ws, roomId));
        connections.delete(connectionId);
        console.log(`User ${ws.username} disconnected`);
    });

    ws.on('error', (error) => {
        console.error(`WebSocket error for user ${ws.username}:`, error);
    });
});

/**
 * Adds the socket to a room (creating it on first use) and tells the other
 * members that the user joined.
 * @param {object} ws - Authenticated socket carrying `userId`, `username` and `rooms`.
 * @param {*} roomId - Key of the room in the `rooms` map.
 */
function joinRoom(ws, roomId) {
    let members = rooms.get(roomId);
    if (!members) {
        members = new Set();
        rooms.set(roomId, members);
    }

    members.add(ws);
    ws.rooms.add(roomId);

    // Notify room (the joining socket itself is excluded)
    const { userId, username } = ws;
    broadcastToRoom(roomId, { type: 'user_joined', userId, username, roomId }, ws);

    console.log(`${username} joined room ${roomId}`);
}

/**
 * Removes the socket from a room, notifies the remaining members and discards
 * the room once it is empty. Nothing is broadcast when the socket was not a
 * member, so a client cannot fake "user_left" notices for rooms it never joined.
 * @param {object} ws - Authenticated socket carrying `userId`, `username` and `rooms`.
 * @param {*} roomId - Key of the room in the `rooms` map.
 */
function leaveRoom(ws, roomId) {
    const members = rooms.get(roomId);

    // Set#delete returns true only if ws really was in the room
    if (members !== undefined && members.delete(ws)) {
        // Notify room
        broadcastToRoom(roomId, {
            type: 'user_left',
            userId: ws.userId,
            username: ws.username,
            roomId: roomId
        });

        if (members.size === 0) {
            rooms.delete(roomId);
        }
    }

    ws.rooms.delete(roomId);
}

/**
 * Stamps an incoming chat message with the sender's identity and the current
 * time, then broadcasts it to the target room (sender included).
 * Messages aimed at a room the sender has not joined are rejected.
 * @param {object} ws - Authenticated socket carrying `userId`, `username` and `rooms`.
 * @param {{roomId: *, text: *}} message - Parsed client message.
 */
function handleChatMessage(ws, message) {
    const { roomId, text } = message;

    // Only members may post: otherwise any user could write into any room
    if (!ws.rooms.has(roomId)) {
        console.warn(`User ${ws.userId} tried to post to room ${roomId} without joining it`);
        return;
    }

    const chatMessage = {
        type: 'chat_message',
        userId: ws.userId,
        username: ws.username,
        roomId: roomId,
        text: text,
        timestamp: Date.now()
    };

    // Save to database
    // await saveMessage(chatMessage);

    // Broadcast to room
    broadcastToRoom(roomId, chatMessage);
}

/**
 * Sends `data` to every open socket in a room, serializing it to JSON once.
 * @param {*} roomId - Key of the room in the `rooms` map; unknown rooms are ignored.
 * @param {*} data - Value to JSON-encode and send.
 * @param {?object} excludeWs - Socket to skip (typically the originator), or null.
 */
function broadcastToRoom(roomId, data, excludeWs = null) {
    const members = rooms.get(roomId);
    if (!members) return;

    const payload = JSON.stringify(data);

    for (const client of members) {
        const isRecipient = client !== excludeWs && client.readyState === WebSocket.OPEN;
        if (isRecipient) {
            client.send(payload);
        }
    }
}

console.log('WebSocket server running on ws://localhost:3000');

Client

/**
 * Chat client on top of WebSocket: authenticates with a token, keeps the
 * connection alive with JSON `ping` messages, reconnects with exponential
 * backoff (1s, 2s, 4s, ... capped at 30s) and dispatches incoming messages to
 * handlers registered per message type via `on()`.
 *
 * `disconnect()` is final: it does not trigger a reconnect. The client also
 * stops retrying when the server closes with code 1008 (authentication failed),
 * because retrying with the same token cannot succeed.
 */
class ChatClient {
    /**
     * @param {string} url - WebSocket endpoint without query string.
     * @param {string} token - Auth token, sent as the `token` query parameter.
     */
    constructor(url, token) {
        this.url = url;
        this.token = token;
        this.ws = null;
        this.currentRoom = null;
        this.messageHandlers = new Map();
        this.heartbeatInterval = null;
        this.reconnectTimer = null;
        this.reconnectDelay = 1000;
        this.maxReconnectDelay = 30000;
        this.shouldReconnect = true;
        this.connect();
    }

    /** Opens a new socket and wires its lifecycle callbacks. */
    connect() {
        this.shouldReconnect = true;
        this.ws = new WebSocket(`${this.url}?token=${encodeURIComponent(this.token)}`);

        this.ws.onopen = () => {
            console.log('Connected to chat');
            this.reconnectDelay = 1000;  // Healthy again: restart the backoff
            this.startHeartbeat();
        };

        this.ws.onmessage = (event) => {
            let message;
            try {
                message = JSON.parse(event.data);
            } catch (err) {
                console.error('Ignoring malformed message:', err);
                return;
            }
            this.handleMessage(message);
        };

        this.ws.onclose = (event) => {
            console.log('Disconnected from chat');
            this.stopHeartbeat();

            // 1008 is what the server sends for a rejected token
            const authRejected = Boolean(event) && event.code === 1008;
            if (this.shouldReconnect && !authRejected) {
                this.reconnect();
            }
        };

        this.ws.onerror = (error) => {
            console.error('WebSocket error:', error);
        };
    }

    /** Schedules one reconnect attempt and doubles the delay for the next one. */
    reconnect() {
        if (this.reconnectTimer !== null) {
            return;
        }

        const delay = this.reconnectDelay;
        this.reconnectDelay = Math.min(delay * 2, this.maxReconnectDelay);

        this.reconnectTimer = setTimeout(() => {
            this.reconnectTimer = null;
            console.log('Reconnecting...');
            this.connect();
        }, delay);
    }

    /** Starts the 30-second ping timer, replacing any timer already running. */
    startHeartbeat() {
        this.stopHeartbeat();
        this.heartbeatInterval = setInterval(() => {
            this.send({ type: 'ping' });
        }, 30000);
    }

    /** Stops the ping timer if one is running. */
    stopHeartbeat() {
        if (this.heartbeatInterval !== null) {
            clearInterval(this.heartbeatInterval);
            this.heartbeatInterval = null;
        }
    }

    /**
     * JSON-encodes and sends `data`; logs an error when not connected.
     * @param {object} data - Message object to send.
     */
    send(data) {
        if (this.ws.readyState === WebSocket.OPEN) {
            this.ws.send(JSON.stringify(data));
        } else {
            console.error('WebSocket not connected');
        }
    }

    /** Joins a room and makes it the target of `sendMessage()`. */
    joinRoom(roomId) {
        this.currentRoom = roomId;
        this.send({
            type: 'join_room',
            roomId: roomId
        });
    }

    /** Leaves a room and clears the current room. */
    leaveRoom(roomId) {
        this.send({
            type: 'leave_room',
            roomId: roomId
        });
        this.currentRoom = null;
    }

    /** Sends a chat message to the current room; logs an error if there is none. */
    sendMessage(text) {
        if (!this.currentRoom) {
            console.error('Not in a room');
            return;
        }

        this.send({
            type: 'chat_message',
            roomId: this.currentRoom,
            text: text
        });
    }

    /** Registers the handler for a message type (replacing any previous one). */
    on(messageType, handler) {
        this.messageHandlers.set(messageType, handler);
    }

    /** Dispatches a parsed message to its registered handler, if any. */
    handleMessage(message) {
        const handler = this.messageHandlers.get(message.type);
        if (handler) {
            handler(message);
        }
    }

    /** Closes the connection permanently and cancels any pending reconnect. */
    disconnect() {
        this.shouldReconnect = false;
        if (this.reconnectTimer !== null) {
            clearTimeout(this.reconnectTimer);
            this.reconnectTimer = null;
        }
        this.stopHeartbeat();
        this.ws.close();
    }
}

// Usage
const chat = new ChatClient('ws://localhost:3000', 'your-jwt-token');

// Greet only once, even though 'connected' fires again after every reconnect
let hasGreeted = false;

chat.on('connected', (data) => {
    console.log('Connected as:', data.username);
    chat.joinRoom('room1');

    // Send message
    // This has to happen after joining: right after construction the socket is
    // still connecting and no room is set, so sendMessage() would only log
    // 'Not in a room'.
    if (!hasGreeted) {
        chat.sendMessage('Hello everyone!');
        hasGreeted = true;
    }
});

chat.on('chat_message', (data) => {
    console.log(`${data.username}: ${data.text}`);
});

chat.on('user_joined', (data) => {
    console.log(`${data.username} joined the room`);
});

chat.on('user_left', (data) => {
    console.log(`${data.username} left the room`);
});

Load Balancing

# nginx.conf
# Pool of three WebSocket servers that nginx balances connections across.
upstream websocket_backend {
    # Use IP hash to keep connections on same server
    ip_hash;

    server backend1:3000;
    server backend2:3000;
    server backend3:3000;
}

server {
    listen 80;

    # Requests under /ws are proxied to the pool above.
    location /ws {
        proxy_pass http://websocket_backend;
        # The WebSocket handshake is an HTTP/1.1 Upgrade request; the Upgrade and
        # Connection headers are set explicitly so they are passed to the backend.
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        # Pass the original host and client address through to the backend.
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

        # Increase timeout for long-lived connections
        # (3600s = 1 hour; the 30-second heartbeats stay well inside this window)
        proxy_read_timeout 3600s;
        proxy_send_timeout 3600s;
    }
}

Monitoring

const WebSocket = require('ws');

const wss = new WebSocket.Server({ port: 3000 });

// Room registry (roomId -> Set of sockets). Declared here so the /metrics
// handler below can report its size without a ReferenceError; room-handling
// code in the same server is expected to populate it.
const rooms = new Map();

// Process-wide counters exposed through /metrics
const stats = {
    totalConnections: 0,
    activeConnections: 0,
    messagesReceived: 0,
    messagesSent: 0
};

wss.on('connection', (ws) => {
    stats.totalConnections++;
    stats.activeConnections++;

    // Count outgoing messages by wrapping send(); without this, messagesSent
    // would stay at 0 forever
    const rawSend = ws.send.bind(ws);
    ws.send = (...args) => {
        stats.messagesSent++;
        return rawSend(...args);
    };

    ws.on('message', () => {
        stats.messagesReceived++;
    });

    ws.on('close', () => {
        stats.activeConnections--;
    });
});

// Expose metrics endpoint
const express = require('express');
const app = express();

app.get('/metrics', (req, res) => {
    res.json({
        ...stats,
        rooms: rooms.size,
        uptime: process.uptime()
    });
});

app.listen(3001);

Common Mistakes

Mistake 1: No Reconnection Logic

// BAD - Stays disconnected forever
const ws = new WebSocket('ws://localhost:3000');

// GOOD - Auto-reconnects
// Wrap the socket instead of extending WebSocket: a closed WebSocket cannot be
// reopened, so reconnecting means creating a new instance behind the wrapper
// (as ReconnectingWebSocket does earlier in this article).
class ReconnectingWS {
    // ... reconnection logic
}

Mistake 2: No Heartbeats

// BAD - Dead connections stay open
// No way to detect disconnection

// GOOD - Regular heartbeats (one ping every 30 seconds)
const HEARTBEAT_MS = 30000;
setInterval(() => {
    ws.ping();
}, HEARTBEAT_MS);

Mistake 3: Not Checking readyState

// BAD - Throws error if not connected
ws.send('message');

// GOOD - Check before sending
const isOpen = ws.readyState === WebSocket.OPEN;
if (isOpen) {
    ws.send('message');
}

Mistake 4: Memory Leaks

// BAD - Connections never removed
const connections = [];
wss.on('connection', (ws) => {
    connections.push(ws);
    // Never removed on close!
});

// GOOD - Clean up on close
const connections = new Set();
wss.on('connection', (ws) => {
    connections.add(ws);
    ws.on('close', () => connections.delete(ws));
});

The Bottom Line

WebSockets enable real-time communication. But connections drop constantly in production.

Implement automatic reconnection with exponential backoff. Don't leave users disconnected.

Use heartbeats to detect dead connections. Close them before they time out.

Scale with Redis pub/sub to share messages across multiple servers.

Queue messages when disconnected. Send them when reconnected.

Our chat worked in dev but dropped connections every few minutes in production. Added reconnection and heartbeats. Connection stability: 60% → 99%.

Build WebSockets properly from day one. Handle disconnections. Your users expect real-time to stay connected.