From Zero to Production: Deploying LLM Applications with Node.js
A comprehensive guide to deploying, scaling, and monitoring Node.js-based LLM applications in production environments with real-world best practices.
Building an LLM application is one thing; deploying it to production, where it must handle real user traffic, scale efficiently, and remain reliable, is an entirely different challenge. In this guide, we'll walk through everything you need to know to take your Node.js LLM application from development to a robust production deployment.
Production-Ready Application Architecture
Before diving into deployment, let's establish a solid foundation with a production-ready application structure:
// src/app.ts
import express from 'express';
import helmet from 'helmet';
import cors from 'cors';
import rateLimit from 'express-rate-limit';
import { createProxyMiddleware } from 'http-proxy-middleware';
import { AIService } from './services/ai-service';
import { MetricsService } from './services/metrics-service';
import { Logger } from './utils/logger';
class ProductionApp {
private app: express.Application;
private aiService: AIService;
private metricsService: MetricsService;
private logger: Logger;
constructor() {
this.app = express();
this.aiService = new AIService();
this.metricsService = new MetricsService();
this.logger = new Logger();
this.setupMiddleware();
this.setupRoutes();
this.setupErrorHandling();
}
private setupMiddleware() {
// Security
this.app.use(helmet({
contentSecurityPolicy: {
directives: {
defaultSrc: ["'self'"],
scriptSrc: ["'self'", "'unsafe-inline'"],
styleSrc: ["'self'", "'unsafe-inline'"],
imgSrc: ["'self'", "data:", "https:"],
},
},
}));
// CORS
this.app.use(cors({
origin: process.env.ALLOWED_ORIGINS?.split(',') || ['http://localhost:3000'],
credentials: true,
}));
// Rate limiting
const limiter = rateLimit({
windowMs: 15 * 60 * 1000, // 15 minutes
max: 100, // limit each IP to 100 requests per windowMs
message: {
error: 'Too many requests, please try again later.',
},
standardHeaders: true,
legacyHeaders: false,
});
this.app.use('/api/', limiter);
// Request parsing
this.app.use(express.json({ limit: '10mb' }));
this.app.use(express.urlencoded({ extended: true, limit: '10mb' }));
// Request logging
this.app.use((req, res, next) => {
this.logger.info(`${req.method} ${req.path}`, {
ip: req.ip,
userAgent: req.get('User-Agent'),
requestId: req.headers['x-request-id'],
});
next();
});
// Metrics collection
this.app.use((req, res, next) => {
const start = Date.now();
res.on('finish', () => {
const duration = Date.now() - start;
this.metricsService.recordRequest(req.method, req.path, res.statusCode, duration);
});
next();
});
}
private setupRoutes() {
// Health check
this.app.get('/health', (req, res) => {
res.json({
status: 'healthy',
timestamp: new Date().toISOString(),
version: process.env.APP_VERSION || '1.0.0',
uptime: process.uptime(),
});
});
// Metrics endpoint
this.app.get('/metrics', async (req, res) => {
const metrics = await this.metricsService.getMetrics();
res.json(metrics);
});
// AI endpoints
this.app.post('/api/chat', this.handleChat.bind(this));
this.app.post('/api/chat/stream', this.handleStreamChat.bind(this));
this.app.post('/api/embeddings', this.handleEmbeddings.bind(this));
}
private async handleChat(req: express.Request, res: express.Response) {
try {
const { message, context, model = 'gpt-3.5-turbo' } = req.body;
if (!message) {
return res.status(400).json({ error: 'Message is required' });
}
const startTime = Date.now();
const response = await this.aiService.chat(message, { context, model });
const duration = Date.now() - startTime;
this.metricsService.recordAIRequest('chat', model, duration, true);
res.json({
response: response.content,
metadata: {
model,
tokens: response.usage,
duration,
},
});
} catch (error) {
this.handleError(error as Error, req, res);
}
}
private async handleStreamChat(req: express.Request, res: express.Response) {
try {
const { message, context, model = 'gpt-3.5-turbo' } = req.body;
if (!message) {
return res.status(400).json({ error: 'Message is required' });
}
res.setHeader('Content-Type', 'text/event-stream');
res.setHeader('Cache-Control', 'no-cache');
res.setHeader('Connection', 'keep-alive');
const stream = this.aiService.chatStream(message, { context, model });
for await (const chunk of stream) {
res.write(`data: ${JSON.stringify({ content: chunk })}\n\n`);
}
res.write('data: [DONE]\n\n');
res.end();
} catch (error) {
res.write(`data: ${JSON.stringify({ error: (error as Error).message })}\n\n`);
res.end();
}
}
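// setupRoutes above registers this.handleEmbeddings, but the handler itself was not shown.
// A minimal sketch follows; it assumes AIService exposes an embed(input, options) method,
// which this guide does not define, so adapt it to your own service.
private async handleEmbeddings(req: express.Request, res: express.Response) {
  try {
    const { input, model = 'text-embedding-3-small' } = req.body;
    if (!input) {
      return res.status(400).json({ error: 'Input is required' });
    }
    const startTime = Date.now();
    const embeddings = await this.aiService.embed(input, { model });
    this.metricsService.recordAIRequest('embeddings', model, Date.now() - startTime, true);
    res.json({ embeddings, metadata: { model } });
  } catch (error) {
    this.handleError(error as Error, req, res);
  }
}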
private setupErrorHandling() {
// 404 handler
this.app.use((req, res) => {
res.status(404).json({
error: 'Not Found',
message: 'The requested resource was not found',
});
});
// Global error handler
this.app.use((error: Error, req: express.Request, res: express.Response, next: express.NextFunction) => {
this.handleError(error, req, res);
});
}
private handleError(error: Error, req: express.Request, res: express.Response) {
this.logger.error('Request error', {
error: error.message,
stack: error.stack,
path: req.path,
method: req.method,
});
// Don't leak error details in production
const isDevelopment = process.env.NODE_ENV === 'development';
res.status(500).json({
error: 'Internal Server Error',
message: isDevelopment ? error.message : 'Something went wrong',
...(isDevelopment && { stack: error.stack }),
});
}
public start(port: number = 3000) {
this.app.listen(port, () => {
this.logger.info(`Server running on port ${port}`);
});
}
}
export default ProductionApp;
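The class above only exports the app, so you still need an entry point that instantiates and starts it. Here is a minimal sketch; the process-level handlers are an illustrative addition rather than part of ProductionApp itself:
// src/server.ts
import ProductionApp from './app';

const app = new ProductionApp();
app.start(parseInt(process.env.PORT || '3000', 10));

// Crash fast on unrecoverable errors so Docker/Kubernetes can restart the container
process.on('unhandledRejection', (reason) => {
  console.error('Unhandled promise rejection:', reason);
  process.exit(1);
});

// Exit cleanly when the orchestrator sends SIGTERM
process.on('SIGTERM', () => {
  console.log('SIGTERM received, shutting down');
  process.exit(0);
});
If you split the entry point out like this, point the Dockerfile's CMD below at dist/server.js instead of dist/app.js.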
Containerization with Docker
Create a production-ready Docker setup:
# Dockerfile
FROM node:18-alpine AS base
# Install dependencies only when needed
FROM base AS deps
RUN apk add --no-cache libc6-compat
WORKDIR /app
# Install dependencies based on the preferred package manager
COPY package.json package-lock.json* ./
# Full install here: the TypeScript build in the builder stage needs devDependencies
RUN npm ci
# Rebuild the source code only when needed
FROM base AS builder
WORKDIR /app
COPY --from=deps /app/node_modules ./node_modules
COPY . .
# Build the application, then drop devDependencies so only runtime packages ship to the final image
RUN npm run build
RUN npm prune --omit=dev
# Production image, copy all the files and run the app
FROM base AS runner
WORKDIR /app
ENV NODE_ENV=production
RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 nodeuser
# Copy built application
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./package.json
USER nodeuser
EXPOSE 3000
ENV PORT=3000
ENV HOSTNAME="0.0.0.0"
CMD ["node", "dist/app.js"]
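A .dockerignore next to the Dockerfile keeps local artifacts and secrets out of the build context; something like this is usually enough (adjust to your repository layout):
# .dockerignore
node_modules
dist
.git
.env
*.log
docker-compose.yml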
# docker-compose.yml
version: '3.8'
services:
app:
build: .
ports:
- "3000:3000"
environment:
- NODE_ENV=production
- REDIS_URL=redis://redis:6379
- DATABASE_URL=postgresql://user:password@postgres:5432/llm_app
depends_on:
- redis
- postgres
restart: unless-stopped
redis:
image: redis:7-alpine
ports:
- "6379:6379"
volumes:
- redis_data:/data
restart: unless-stopped
postgres:
image: postgres:15-alpine
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: password
POSTGRES_DB: llm_app
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "5432:5432"
restart: unless-stopped
nginx:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf
- ./ssl:/etc/nginx/ssl
depends_on:
- app
restart: unless-stopped
volumes:
redis_data:
postgres_data:
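The compose file mounts an nginx.conf that isn't shown above. A minimal reverse-proxy sketch that also works with the streaming endpoint might look like this (TLS configuration is omitted; the upstream points at the app service defined in the compose file):
# nginx.conf
events {}

http {
  upstream llm_app {
    server app:3000;
  }

  server {
    listen 80;

    location / {
      proxy_pass http://llm_app;
      proxy_set_header Host $host;
      proxy_set_header X-Real-IP $remote_addr;
    }

    # SSE streaming: disable buffering so tokens reach the client as they are generated
    location /api/chat/stream {
      proxy_pass http://llm_app;
      proxy_http_version 1.1;
      proxy_set_header Connection '';
      proxy_buffering off;
      proxy_cache off;
      proxy_read_timeout 300s;
    }
  }
}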
Kubernetes Deployment
For enterprise-scale deployments, use Kubernetes:
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: llm-app
labels:
app: llm-app
spec:
replicas: 3
selector:
matchLabels:
app: llm-app
template:
metadata:
labels:
app: llm-app
spec:
containers:
- name: llm-app
image: your-registry/llm-app:latest
ports:
- containerPort: 3000
env:
- name: NODE_ENV
value: "production"
- name: REDIS_URL
valueFrom:
secretKeyRef:
name: app-secrets
key: redis-url
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: app-secrets
key: database-url
- name: OPENAI_API_KEY
valueFrom:
secretKeyRef:
name: app-secrets
key: openai-api-key
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
livenessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 5
periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
name: llm-app-service
spec:
selector:
app: llm-app
ports:
- protocol: TCP
port: 80
targetPort: 3000
type: LoadBalancer
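The deployment references an app-secrets Secret that has to exist before the pods can start. You can create it with kubectl or declaratively like this (the values are placeholders; in practice prefer a secret manager over committing credentials):
# k8s/secrets.yaml
apiVersion: v1
kind: Secret
metadata:
  name: app-secrets
type: Opaque
stringData:
  redis-url: redis://redis:6379
  database-url: postgresql://user:password@postgres:5432/llm_app
  openai-api-key: sk-REPLACE_ME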
Advanced Monitoring and Observability
🚨 Critical: Implement comprehensive monitoring from day one. LLM applications have unique failure modes and performance characteristics that require specialized monitoring.
Comprehensive Metrics Collection
// src/services/metrics-service.ts
import { StatsD } from 'node-statsd';
interface AIMetrics {
requests: number;
errors: number;
latency: number[];
tokenUsage: number;
costs: number;
}
export class MetricsService {
private statsd: StatsD;
private metrics: Map<string, AIMetrics> = new Map();
private requestCounts: Map<string, number> = new Map();
constructor() {
this.statsd = new StatsD({
host: process.env.STATSD_HOST || 'localhost',
port: parseInt(process.env.STATSD_PORT || '8125'),
});
}
recordRequest(method: string, path: string, statusCode: number, duration: number) {
const key = `${method}:${path}`;
this.requestCounts.set(key, (this.requestCounts.get(key) || 0) + 1);
this.statsd.increment('http.requests.total', 1, [`method:${method}`, `status:${statusCode}`]);
this.statsd.histogram('http.request.duration', duration, [`method:${method}`]);
}
recordAIRequest(
type: string,
model: string,
duration: number,
success: boolean,
tokens?: { prompt: number; completion: number; total: number },
cost?: number
) {
const key = `${type}:${model}`;
if (!this.metrics.has(key)) {
this.metrics.set(key, {
requests: 0,
errors: 0,
latency: [],
tokenUsage: 0,
costs: 0,
});
}
const metrics = this.metrics.get(key)!;
metrics.requests++;
metrics.latency.push(duration);
if (!success) {
metrics.errors++;
}
if (tokens) {
metrics.tokenUsage += tokens.total;
}
if (cost) {
metrics.costs += cost;
}
// Send to StatsD
this.statsd.increment('ai.requests.total', 1, [`type:${type}`, `model:${model}`]);
this.statsd.histogram('ai.request.duration', duration, [`type:${type}`, `model:${model}`]);
if (tokens) {
this.statsd.histogram('ai.tokens.used', tokens.total, [`type:${type}`, `model:${model}`]);
}
if (cost) {
this.statsd.histogram('ai.cost', cost, [`type:${type}`, `model:${model}`]);
}
if (!success) {
this.statsd.increment('ai.requests.errors', 1, [`type:${type}`, `model:${model}`]);
}
}
async getMetrics() {
const systemMetrics = {
memory: process.memoryUsage(),
uptime: process.uptime(),
cpuUsage: process.cpuUsage(),
};
const aiMetrics = Array.from(this.metrics.entries()).map(([key, metrics]) => {
const avgLatency = metrics.latency.reduce((a, b) => a + b, 0) / metrics.latency.length;
const errorRate = metrics.errors / metrics.requests;
return {
key,
requests: metrics.requests,
errors: metrics.errors,
errorRate,
avgLatency,
totalTokens: metrics.tokenUsage,
totalCost: metrics.costs,
};
});
return {
timestamp: new Date().toISOString(),
system: systemMetrics,
ai: aiMetrics,
requests: Object.fromEntries(this.requestCounts),
};
}
}
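recordAIRequest accepts token counts and a cost, but handleChat above only passes the duration. A small helper like the following can fill that gap; it is illustrative, and the per-1K-token prices are placeholders you should replace with your provider's current pricing:
// src/services/record-chat-metrics.ts (illustrative helper)
import { MetricsService } from './metrics-service';

// Placeholder prices per 1K total tokens; replace with real pricing
const PRICE_PER_1K_TOKENS: Record<string, number> = {
  'gpt-3.5-turbo': 0.002,
  'gpt-4': 0.06,
};

export function recordChatMetrics(
  metrics: MetricsService,
  model: string,
  duration: number,
  usage: { prompt_tokens: number; completion_tokens: number; total_tokens: number }
) {
  const estimatedCost = (usage.total_tokens / 1000) * (PRICE_PER_1K_TOKENS[model] ?? 0);
  metrics.recordAIRequest('chat', model, duration, true, {
    prompt: usage.prompt_tokens,
    completion: usage.completion_tokens,
    total: usage.total_tokens,
  }, estimatedCost);
}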
Application Performance Monitoring (APM)
// src/utils/apm.ts
import apm from 'elastic-apm-node';
// Initialize APM. elastic-apm-node has to start before other modules load in order
// to instrument them, so import this file first in your entry point.
if (process.env.NODE_ENV === 'production') {
apm.start({
serviceName: 'llm-app',
secretToken: process.env.ELASTIC_APM_SECRET_TOKEN,
serverUrl: process.env.ELASTIC_APM_SERVER_URL,
environment: process.env.NODE_ENV,
captureBody: 'all',
captureHeaders: true,
});
}
export class APMService {
static createSpan(name: string, type: string = 'custom') {
return apm.startSpan(name, type);
}
static recordError(error: Error, context?: any) {
apm.captureError(error, context);
}
static addLabels(labels: Record<string, string>) {
apm.addLabels(labels);
}
static setUserContext(user: { id: string; email?: string; username?: string }) {
apm.setUserContext(user);
}
static setCustomContext(context: Record<string, any>) {
apm.setCustomContext(context);
}
}
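To get useful traces, wrap the actual LLM calls in spans. A sketch of how that might look (it assumes an aiService with a chat method, which this guide does not define):
import { APMService } from './utils/apm';

export async function tracedChat(
  aiService: { chat: (message: string) => Promise<string> },
  message: string
) {
  // Spans only exist inside an active transaction; startSpan returns null otherwise
  const span = APMService.createSpan('llm.chat', 'external');
  try {
    return await aiService.chat(message);
  } catch (error) {
    APMService.recordError(error as Error, { custom: { messageLength: message.length } });
    throw error;
  } finally {
    span?.end();
  }
}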
Auto-Scaling and Load Balancing
Horizontal Pod Autoscaler (HPA)
# k8s/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: llm-app-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: llm-app
minReplicas: 3
maxReplicas: 20
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
- type: Pods
pods:
metric:
name: ai_requests_per_second
target:
type: AverageValue
averageValue: "10"
behavior:
scaleUp:
stabilizationWindowSeconds: 60
policies:
- type: Percent
value: 100
periodSeconds: 15
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 10
periodSeconds: 60
Application-Level Load Balancing
// src/services/load-balancer.ts
interface ServiceEndpoint {
url: string;
weight: number;
healthy: boolean;
responseTime: number;
}
export class LoadBalancer {
private endpoints: ServiceEndpoint[] = [];
addEndpoint(url: string, weight: number = 1) {
this.endpoints.push({
url,
weight,
healthy: true,
responseTime: 0,
});
}
async selectEndpoint(): Promise<string> {
const healthyEndpoints = this.endpoints.filter(ep => ep.healthy);
if (healthyEndpoints.length === 0) {
throw new Error('No healthy endpoints available');
}
// Weighted random selection across healthy endpoints
const totalWeight = healthyEndpoints.reduce((sum, ep) => sum + ep.weight, 0);
let randomWeight = Math.random() * totalWeight;
for (const endpoint of healthyEndpoints) {
randomWeight -= endpoint.weight;
if (randomWeight <= 0) {
return endpoint.url;
}
}
return healthyEndpoints[0].url;
}
async healthCheck() {
const checks = this.endpoints.map(async (endpoint) => {
try {
const start = Date.now();
const response = await fetch(`${endpoint.url}/health`, {
method: 'GET',
// fetch has no timeout option; abort the health check after 5 seconds instead
signal: AbortSignal.timeout(5000),
});
const responseTime = Date.now() - start;
endpoint.healthy = response.ok;
endpoint.responseTime = responseTime;
} catch (error) {
endpoint.healthy = false;
endpoint.responseTime = Infinity;
}
});
await Promise.all(checks);
}
startHealthChecking(intervalMs: number = 30000) {
setInterval(() => {
this.healthCheck();
}, intervalMs);
}
}
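Wiring the balancer up might look like this; the endpoint URLs are placeholders for your own inference services:
import { LoadBalancer } from './services/load-balancer';

const balancer = new LoadBalancer();
balancer.addEndpoint('http://inference-a.internal:3000', 3); // gets roughly 3x the traffic
balancer.addEndpoint('http://inference-b.internal:3000', 1);
balancer.startHealthChecking(15000);

export async function forwardChat(body: unknown) {
  const baseUrl = await balancer.selectEndpoint(); // throws if nothing is healthy
  const response = await fetch(`${baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
  if (!response.ok) {
    throw new Error(`Upstream returned ${response.status}`);
  }
  return response.json();
}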
Cost Optimization Strategies
Token Usage Optimization
// src/services/cost-optimizer.ts
import { createHash } from 'crypto';
export class CostOptimizer {
private tokenLimits = new Map<string, number>();
private dailySpend = new Map<string, number>();
constructor() {
// Daily token budgets per tier, used below to pick a model and cap output length
this.tokenLimits.set('free-tier', 10000);
this.tokenLimits.set('pro-tier', 100000);
this.tokenLimits.set('enterprise', 1000000);
}
async optimizeRequest(
request: any,
userTier: string,
currentUsage: number
): Promise<{ model: string; maxTokens: number; temperature: number }> {
const tokenBudget = this.tokenLimits.get(userTier) ?? this.tokenLimits.get('free-tier')!;
const remainingTokens = tokenBudget - currentUsage;
// Select appropriate model based on remaining budget
let model = 'gpt-3.5-turbo';
let maxTokens = 150;
let temperature = 0.7;
if (remainingTokens < 1000) {
// Use cheaper, faster model for low budget
model = 'gpt-3.5-turbo';
maxTokens = 100;
temperature = 0.3; // More deterministic to reduce retries
} else if (remainingTokens > 10000) {
// Can afford premium model
model = 'gpt-4';
maxTokens = 300;
}
// Cache frequent requests
const cacheKey = this.generateCacheKey(request);
const cached = await this.getCachedResponse(cacheKey);
if (cached) {
return cached;
}
return { model, maxTokens, temperature };
}
private generateCacheKey(request: any): string {
// Hash the request payload so identical prompts map to the same cache entry
return createHash('sha256').update(JSON.stringify(request)).digest('hex');
}
private async getCachedResponse(key: string): Promise<any | null> {
// Implement Redis-based caching
return null;
}
}
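getCachedResponse above is left as a stub. A minimal Redis-backed implementation could look like this; it uses ioredis, though any Redis client works, and the key prefix and one-hour TTL are arbitrary choices:
// src/services/response-cache.ts (illustrative)
import Redis from 'ioredis';

const redis = new Redis(process.env.REDIS_URL || 'redis://localhost:6379');

export async function getCachedResponse(key: string): Promise<unknown | null> {
  const cached = await redis.get(`ai-cache:${key}`);
  return cached ? JSON.parse(cached) : null;
}

export async function setCachedResponse(key: string, value: unknown, ttlSeconds = 3600): Promise<void> {
  await redis.set(`ai-cache:${key}`, JSON.stringify(value), 'EX', ttlSeconds);
}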
Security Best Practices
API Security
// src/middleware/security.ts
import express from 'express';
import rateLimit from 'express-rate-limit';
export const authenticateAPIKey = (req: express.Request, res: express.Response, next: express.NextFunction) => {
const apiKey = req.headers['x-api-key'];
if (!apiKey) {
return res.status(401).json({ error: 'API key required' });
}
// Validate the API key; isValidAPIKey and getUserFromAPIKey are placeholders for your own key store
if (!isValidAPIKey(apiKey as string)) {
return res.status(401).json({ error: 'Invalid API key' });
}
// Attach user info to the request (the Request type augmentation shown at the end of this section makes this type-safe)
req.user = getUserFromAPIKey(apiKey as string);
next();
};
export const sensitiveDataFilter = (req: express.Request, res: express.Response, next: express.NextFunction) => {
// Remove sensitive data from requests
if (req.body.message) {
req.body.message = filterSensitiveData(req.body.message);
}
next();
};
export const aiSpecificRateLimit = rateLimit({
windowMs: 60 * 1000, // 1 minute
max: (req) => {
// Different limits based on user tier
const user = req.user as any;
switch (user?.tier) {
case 'enterprise': return 1000;
case 'pro': return 100;
default: return 10;
}
},
message: 'AI request rate limit exceeded',
});
function filterSensitiveData(text: string): string {
// Remove credit card numbers, SSNs, emails, etc.
return text
.replace(/\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, '[REDACTED-CC]')
.replace(/\b\d{3}-\d{2}-\d{4}\b/g, '[REDACTED-SSN]')
.replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g, '[REDACTED-EMAIL]');
}
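The middleware above reads and writes req.user, which Express's Request type does not declare by default. A type augmentation like this keeps TypeScript happy; the tier names mirror the ones used in the rate limiter:
// src/types/express.d.ts (illustrative)
declare global {
  namespace Express {
    interface Request {
      user?: {
        id: string;
        tier: 'free' | 'pro' | 'enterprise';
      };
    }
  }
}

export {};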
Conclusion
Deploying LLM applications to production requires careful attention to scalability, monitoring, security, and cost optimization. The examples in this guide provide a solid foundation, but remember that production deployments are iterative processes.
Key takeaways for successful LLM deployments:
- Start with solid foundations: Proper error handling, logging, and monitoring
- Plan for scale: Use containerization and orchestration tools
- Monitor everything: AI applications have unique failure modes
- Optimize costs: Implement intelligent model selection and caching
- Secure by design: Filter sensitive data and implement proper authentication
- Iterate and improve: Use metrics to continuously optimize performance
The LLM landscape is evolving rapidly, but these foundational practices will serve you well regardless of which models or services you're using.
Ready to deploy your LLM application? Check out our production deployment checklist to ensure you haven't missed any critical steps.