From Zero to Production: Deploying LLM Applications with Node.js

15 min read

A comprehensive guide to deploying, scaling, and monitoring Node.js-based LLM applications in production environments with real-world best practices.

Node.js · LLM · Deployment · Production · DevOps · Scaling

Building an LLM application is one thing, but deploying it to production where it can handle real user traffic, scale efficiently, and remain reliable is an entirely different challenge. In this comprehensive guide, we'll walk through everything you need to know to take your Node.js LLM application from development to a robust production deployment.

Production-Ready Application Architecture

Before diving into deployment, let's establish a solid foundation with a production-ready application structure:

// src/app.ts
import express from 'express';
import helmet from 'helmet';
import cors from 'cors';
import rateLimit from 'express-rate-limit';
import { AIService } from './services/ai-service';
import { MetricsService } from './services/metrics-service';
import { Logger } from './utils/logger';

class ProductionApp {
  private app: express.Application;
  private aiService: AIService;
  private metricsService: MetricsService;
  private logger: Logger;

  constructor() {
    this.app = express();
    this.aiService = new AIService();
    this.metricsService = new MetricsService();
    this.logger = new Logger();
    
    this.setupMiddleware();
    this.setupRoutes();
    this.setupErrorHandling();
  }

  private setupMiddleware() {
    // Security
    this.app.use(helmet({
      contentSecurityPolicy: {
        directives: {
          defaultSrc: ["'self'"],
          scriptSrc: ["'self'", "'unsafe-inline'"],
          styleSrc: ["'self'", "'unsafe-inline'"],
          imgSrc: ["'self'", "data:", "https:"],
        },
      },
    }));

    // CORS
    this.app.use(cors({
      origin: process.env.ALLOWED_ORIGINS?.split(',') || ['http://localhost:3000'],
      credentials: true,
    }));

    // Rate limiting
    const limiter = rateLimit({
      windowMs: 15 * 60 * 1000, // 15 minutes
      max: 100, // limit each IP to 100 requests per windowMs
      message: {
        error: 'Too many requests, please try again later.',
      },
      standardHeaders: true,
      legacyHeaders: false,
    });
    this.app.use('/api/', limiter);

    // Request parsing
    this.app.use(express.json({ limit: '10mb' }));
    this.app.use(express.urlencoded({ extended: true, limit: '10mb' }));

    // Request logging
    this.app.use((req, res, next) => {
      this.logger.info(`${req.method} ${req.path}`, {
        ip: req.ip,
        userAgent: req.get('User-Agent'),
        requestId: req.headers['x-request-id'],
      });
      next();
    });

    // Metrics collection
    this.app.use((req, res, next) => {
      const start = Date.now();
      res.on('finish', () => {
        const duration = Date.now() - start;
        this.metricsService.recordRequest(req.method, req.path, res.statusCode, duration);
      });
      next();
    });
  }

  private setupRoutes() {
    // Health check
    this.app.get('/health', (req, res) => {
      res.json({
        status: 'healthy',
        timestamp: new Date().toISOString(),
        version: process.env.APP_VERSION || '1.0.0',
        uptime: process.uptime(),
      });
    });

    // Metrics endpoint
    this.app.get('/metrics', async (req, res) => {
      const metrics = await this.metricsService.getMetrics();
      res.json(metrics);
    });

    // AI endpoints
    this.app.post('/api/chat', this.handleChat.bind(this));
    this.app.post('/api/chat/stream', this.handleStreamChat.bind(this));
    this.app.post('/api/embeddings', this.handleEmbeddings.bind(this));
  }

  private async handleChat(req: express.Request, res: express.Response) {
    try {
      const { message, context, model = 'gpt-3.5-turbo' } = req.body;
      
      if (!message) {
        return res.status(400).json({ error: 'Message is required' });
      }

      const startTime = Date.now();
      const response = await this.aiService.chat(message, { context, model });
      const duration = Date.now() - startTime;

      this.metricsService.recordAIRequest('chat', model, duration, true);

      res.json({
        response: response.content,
        metadata: {
          model,
          tokens: response.usage,
          duration,
        },
      });
    } catch (error) {
      this.handleError(error as Error, req, res);
    }
  }

  private async handleStreamChat(req: express.Request, res: express.Response) {
    try {
      const { message, context, model = 'gpt-3.5-turbo' } = req.body;
      
      res.setHeader('Content-Type', 'text/event-stream');
      res.setHeader('Cache-Control', 'no-cache');
      res.setHeader('Connection', 'keep-alive');

      const stream = this.aiService.chatStream(message, { context, model });
      
      for await (const chunk of stream) {
        res.write(`data: ${JSON.stringify({ content: chunk })}\n\n`);
      }
      
      res.write('data: [DONE]\n\n');
      res.end();
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Streaming failed';
      res.write(`data: ${JSON.stringify({ error: message })}\n\n`);
      res.end();
    }
  }

  private async handleEmbeddings(req: express.Request, res: express.Response) {
    try {
      const { input, model = 'text-embedding-ada-002' } = req.body;

      if (!input) {
        return res.status(400).json({ error: 'Input is required' });
      }

      // Assumes AIService exposes an embed() method alongside chat() and chatStream()
      const startTime = Date.now();
      const embeddings = await this.aiService.embed(input, { model });
      this.metricsService.recordAIRequest('embeddings', model, Date.now() - startTime, true);

      res.json({ embeddings, metadata: { model } });
    } catch (error) {
      this.handleError(error as Error, req, res);
    }
  }

  private setupErrorHandling() {
    // 404 handler
    this.app.use((req, res) => {
      res.status(404).json({
        error: 'Not Found',
        message: 'The requested resource was not found',
      });
    });

    // Global error handler
    this.app.use((error: Error, req: express.Request, res: express.Response, next: express.NextFunction) => {
      this.handleError(error, req, res);
    });
  }

  private handleError(error: Error, req: express.Request, res: express.Response) {
    this.logger.error('Request error', {
      error: error.message,
      stack: error.stack,
      path: req.path,
      method: req.method,
    });

    // Don't leak error details in production
    const isDevelopment = process.env.NODE_ENV === 'development';
    
    res.status(500).json({
      error: 'Internal Server Error',
      message: isDevelopment ? error.message : 'Something went wrong',
      ...(isDevelopment && { stack: error.stack }),
    });
  }

  public start(port: number = 3000) {
    this.app.listen(port, () => {
      this.logger.info(`Server running on port ${port}`);
    });
  }
}

export default ProductionApp;

// Start the server when this file is run directly (matches the Dockerfile's
// `node dist/app.js` command; assumes a CommonJS build)
if (require.main === module) {
  new ProductionApp().start(parseInt(process.env.PORT || '3000', 10));
}

Containerization with Docker

Create a production-ready Docker setup:

# Dockerfile
FROM node:18-alpine AS base

# Install dependencies only when needed
FROM base AS deps
RUN apk add --no-cache libc6-compat
WORKDIR /app

# Install all dependencies (dev dependencies are needed for the TypeScript build)
COPY package.json package-lock.json* ./
RUN npm ci && npm cache clean --force

# Rebuild the source code only when needed
FROM base AS builder
WORKDIR /app
COPY --from=deps /app/node_modules ./node_modules
COPY . .

# Build the application, then strip dev dependencies from node_modules
RUN npm run build && npm prune --omit=dev

# Production image, copy all the files and run the app
FROM base AS runner
WORKDIR /app

ENV NODE_ENV=production

RUN addgroup --system --gid 1001 nodejs
RUN adduser --system --uid 1001 nodeuser

# Copy built application
COPY --from=builder /app/dist ./dist
COPY --from=builder /app/node_modules ./node_modules
COPY --from=builder /app/package.json ./package.json

USER nodeuser

EXPOSE 3000

ENV PORT=3000
ENV HOSTNAME="0.0.0.0"

CMD ["node", "dist/app.js"]

# docker-compose.yml
version: '3.8'

services:
  app:
    build: .
    ports:
      - "3000:3000"
    environment:
      - NODE_ENV=production
      - REDIS_URL=redis://redis:6379
      - DATABASE_URL=postgresql://user:password@postgres:5432/llm_app
    depends_on:
      - redis
      - postgres
    restart: unless-stopped
    
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    restart: unless-stopped
    
  postgres:
    image: postgres:15-alpine
    environment:
      POSTGRES_USER: user
      POSTGRES_PASSWORD: password
      POSTGRES_DB: llm_app
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    restart: unless-stopped

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
      - ./ssl:/etc/nginx/ssl
    depends_on:
      - app
    restart: unless-stopped

volumes:
  redis_data:
  postgres_data:

Kubernetes Deployment

For enterprise-scale deployments, use Kubernetes:

# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-app
  labels:
    app: llm-app
spec:
  replicas: 3
  selector:
    matchLabels:
      app: llm-app
  template:
    metadata:
      labels:
        app: llm-app
    spec:
      containers:
      - name: llm-app
        image: your-registry/llm-app:latest
        ports:
        - containerPort: 3000
        env:
        - name: NODE_ENV
          value: "production"
        - name: REDIS_URL
          valueFrom:
            secretKeyRef:
              name: app-secrets
              key: redis-url
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: app-secrets
              key: database-url
        - name: OPENAI_API_KEY
          valueFrom:
            secretKeyRef:
              name: app-secrets
              key: openai-api-key
        resources:
          requests:
            memory: "512Mi"
            cpu: "250m"
          limits:
            memory: "1Gi"
            cpu: "500m"
        livenessProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health
            port: 3000
          initialDelaySeconds: 5
          periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: llm-app-service
spec:
  selector:
    app: llm-app
  ports:
    - protocol: TCP
      port: 80
      targetPort: 3000
  type: LoadBalancer

Advanced Monitoring and Observability

🚨 Critical: Implement comprehensive monitoring from day one. LLM applications have unique failure modes and performance characteristics that require specialized monitoring.

Comprehensive Metrics Collection

// src/services/metrics-service.ts
// hot-shots is the maintained fork of node-statsd and supports the DataDog-style tags used below
import { StatsD } from 'hot-shots';

interface AIMetrics {
  requests: number;
  errors: number;
  latency: number[];
  tokenUsage: number;
  costs: number;
}

export class MetricsService {
  private statsd: StatsD;
  private metrics: Map<string, AIMetrics> = new Map();
  private requestCounts: Map<string, number> = new Map();

  constructor() {
    this.statsd = new StatsD({
      host: process.env.STATSD_HOST || 'localhost',
      port: parseInt(process.env.STATSD_PORT || '8125'),
    });
  }

  recordRequest(method: string, path: string, statusCode: number, duration: number) {
    const key = `${method}:${path}`;
    this.requestCounts.set(key, (this.requestCounts.get(key) || 0) + 1);
    
    this.statsd.increment('http.requests.total', 1, [`method:${method}`, `status:${statusCode}`]);
    this.statsd.histogram('http.request.duration', duration, [`method:${method}`]);
  }

  recordAIRequest(
    type: string,
    model: string,
    duration: number,
    success: boolean,
    tokens?: { prompt: number; completion: number; total: number },
    cost?: number
  ) {
    const key = `${type}:${model}`;
    
    if (!this.metrics.has(key)) {
      this.metrics.set(key, {
        requests: 0,
        errors: 0,
        latency: [],
        tokenUsage: 0,
        costs: 0,
      });
    }

    const metrics = this.metrics.get(key)!;
    metrics.requests++;
    metrics.latency.push(duration);
    
    if (!success) {
      metrics.errors++;
    }
    
    if (tokens) {
      metrics.tokenUsage += tokens.total;
    }
    
    if (cost) {
      metrics.costs += cost;
    }

    // Send to StatsD
    this.statsd.increment('ai.requests.total', 1, [`type:${type}`, `model:${model}`]);
    this.statsd.histogram('ai.request.duration', duration, [`type:${type}`, `model:${model}`]);
    
    if (tokens) {
      this.statsd.histogram('ai.tokens.used', tokens.total, [`type:${type}`, `model:${model}`]);
    }
    
    if (cost) {
      this.statsd.histogram('ai.cost', cost, [`type:${type}`, `model:${model}`]);
    }
    
    if (!success) {
      this.statsd.increment('ai.requests.errors', 1, [`type:${type}`, `model:${model}`]);
    }
  }

  async getMetrics() {
    const systemMetrics = {
      memory: process.memoryUsage(),
      uptime: process.uptime(),
      cpuUsage: process.cpuUsage(),
    };

    const aiMetrics = Array.from(this.metrics.entries()).map(([key, metrics]) => {
      const avgLatency = metrics.latency.reduce((a, b) => a + b, 0) / metrics.latency.length;
      const errorRate = metrics.errors / metrics.requests;
      
      return {
        key,
        requests: metrics.requests,
        errors: metrics.errors,
        errorRate,
        avgLatency,
        totalTokens: metrics.tokenUsage,
        totalCost: metrics.costs,
      };
    });

    return {
      timestamp: new Date().toISOString(),
      system: systemMetrics,
      ai: aiMetrics,
      requests: Object.fromEntries(this.requestCounts),
    };
  }
}

Application Performance Monitoring (APM)

// src/utils/apm.ts
import apm from 'elastic-apm-node';

// Initialize APM; the agent must be started before other modules are loaded
// so it can instrument them, so keep this import first in your entry point
if (process.env.NODE_ENV === 'production') {
  apm.start({
    serviceName: 'llm-app',
    secretToken: process.env.ELASTIC_APM_SECRET_TOKEN,
    serverUrl: process.env.ELASTIC_APM_SERVER_URL,
    environment: process.env.NODE_ENV,
    captureBody: 'all', // records request bodies (i.e. user prompts); use 'errors' if that is too sensitive
    captureHeaders: true,
  });
}

export class APMService {
  static createSpan(name: string, type: string = 'custom') {
    return apm.startSpan(name, type);
  }

  static recordError(error: Error, context?: any) {
    apm.captureError(error, context);
  }

  static addLabels(labels: Record<string, string>) {
    apm.addLabels(labels);
  }

  static setUserContext(user: { id: string; email?: string; username?: string }) {
    apm.setUserContext(user);
  }

  static setCustomContext(context: Record<string, any>) {
    apm.setCustomContext(context);
  }
}
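
A common pattern is to wrap each model call in a span so that slow LLM requests stand out in traces. Here is a minimal sketch that uses the APMService above; the aiService parameter stands in for the AIService from earlier, and the import path is an assumption:

// Example: tracing an LLM call with APMService (sketch)
import { APMService } from './utils/apm';

export async function tracedChat(
  aiService: { chat: (message: string, options: { model: string }) => Promise<{ content: string }> },
  message: string,
  model: string
) {
  const span = APMService.createSpan(`ai.chat.${model}`, 'external');
  try {
    return await aiService.chat(message, { model });
  } catch (error) {
    APMService.recordError(error as Error, { custom: { model } });
    throw error;
  } finally {
    span?.end(); // startSpan returns null when no transaction is active
  }
}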

Auto-Scaling and Load Balancing

Horizontal Pod Autoscaler (HPA)

# k8s/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: llm-app-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llm-app
  minReplicas: 3
  maxReplicas: 20
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  - type: Pods
    pods:
      metric:
        name: ai_requests_per_second
      target:
        type: AverageValue
        averageValue: "10"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
      - type: Percent
        value: 100
        periodSeconds: 15
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      - type: Percent
        value: 10
        periodSeconds: 60
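
The ai_requests_per_second entry above is a custom Pods metric, so Kubernetes can only see it through a custom metrics adapter (for example, the Prometheus adapter) that derives a per-second rate from a counter your pods expose. Below is a minimal sketch of exposing such a counter with prom-client; this is an assumed addition (the MetricsService earlier ships metrics via StatsD instead), and the /metrics/prometheus path is chosen so it does not clash with the JSON /metrics route:

// Example: exposing a counter an adapter can turn into ai_requests_per_second (sketch)
import express from 'express';
import { Counter, register } from 'prom-client';

const aiRequestsTotal = new Counter({
  name: 'ai_requests_total',
  help: 'Total number of AI requests handled by this pod',
  labelNames: ['model'],
});

// Call this from your chat/embeddings handlers
export function countAIRequest(model: string) {
  aiRequestsTotal.inc({ model });
}

// Prometheus scrape endpoint; the adapter computes the per-second rate from this counter
export function mountPrometheusMetrics(app: express.Application) {
  app.get('/metrics/prometheus', async (_req, res) => {
    res.set('Content-Type', register.contentType);
    res.end(await register.metrics());
  });
}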

Application-Level Load Balancing

// src/services/load-balancer.ts
interface ServiceEndpoint {
  url: string;
  weight: number;
  healthy: boolean;
  responseTime: number;
}

export class LoadBalancer {
  private endpoints: ServiceEndpoint[] = [];

  addEndpoint(url: string, weight: number = 1) {
    this.endpoints.push({
      url,
      weight,
      healthy: true,
      responseTime: 0,
    });
  }

  async selectEndpoint(): Promise<string> {
    const healthyEndpoints = this.endpoints.filter(ep => ep.healthy);
    
    if (healthyEndpoints.length === 0) {
      throw new Error('No healthy endpoints available');
    }

    // Weighted random selection
    const totalWeight = healthyEndpoints.reduce((sum, ep) => sum + ep.weight, 0);
    let randomWeight = Math.random() * totalWeight;
    
    for (const endpoint of healthyEndpoints) {
      randomWeight -= endpoint.weight;
      if (randomWeight <= 0) {
        return endpoint.url;
      }
    }

    return healthyEndpoints[0].url;
  }

  async healthCheck() {
    const checks = this.endpoints.map(async (endpoint) => {
      try {
        const start = Date.now();
        const response = await fetch(`${endpoint.url}/health`, {
          method: 'GET',
          // fetch has no `timeout` option; abort the request after 5 seconds instead
          signal: AbortSignal.timeout(5000),
        });
        const responseTime = Date.now() - start;
        
        endpoint.healthy = response.ok;
        endpoint.responseTime = responseTime;
      } catch (error) {
        endpoint.healthy = false;
        endpoint.responseTime = Infinity;
      }
    });

    await Promise.all(checks);
  }

  startHealthChecking(intervalMs: number = 30000) {
    setInterval(() => {
      this.healthCheck();
    }, intervalMs);
  }
}
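
Wiring the balancer into your app might look like the following; the endpoint URLs are placeholders for your own model-serving backends:

// Example usage of LoadBalancer (endpoint URLs are placeholders)
import { LoadBalancer } from './services/load-balancer';

const balancer = new LoadBalancer();
balancer.addEndpoint('http://llm-backend-a:3000', 3); // higher weight receives more traffic
balancer.addEndpoint('http://llm-backend-b:3000', 1);
balancer.startHealthChecking(30000);

export async function proxyChat(body: unknown) {
  const baseUrl = await balancer.selectEndpoint();
  const response = await fetch(`${baseUrl}/api/chat`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(body),
  });
  return response.json();
}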

Cost Optimization Strategies

Token Usage Optimization

// src/services/cost-optimizer.ts
import { createHash } from 'crypto';

export class CostOptimizer {
  private tokenLimits = new Map<string, number>();
  private dailySpend = new Map<string, number>();

  constructor() {
    // Daily token limits per subscription tier
    this.tokenLimits.set('free-tier', 10000);
    this.tokenLimits.set('pro-tier', 100000);
    this.tokenLimits.set('enterprise', 1000000);
  }

  async optimizeRequest(
    request: any,
    userTier: string,
    currentUsage: number
  ): Promise<{ model: string; maxTokens: number; temperature: number }> {
    const tierLimit = this.tokenLimits.get(userTier) ?? this.tokenLimits.get('free-tier')!;
    const remainingTokens = tierLimit - currentUsage;
    
    // Select appropriate model based on remaining budget
    let model = 'gpt-3.5-turbo';
    let maxTokens = 150;
    let temperature = 0.7;

    if (remainingTokens < 1000) {
      // Use cheaper, faster model for low budget
      model = 'gpt-3.5-turbo';
      maxTokens = 100;
      temperature = 0.3; // More deterministic to reduce retries
    } else if (remainingTokens > 10000) {
      // Can afford premium model
      model = 'gpt-4';
      maxTokens = 300;
    }

    // Cache frequent requests
    const cacheKey = this.generateCacheKey(request);
    const cached = await this.getCachedResponse(cacheKey);
    
    if (cached) {
      return cached;
    }

    return { model, maxTokens, temperature };
  }

  private generateCacheKey(request: any): string {
    // Hash the request payload so identical prompts map to the same cache entry
    return createHash('sha256').update(JSON.stringify(request)).digest('hex');
  }

  private async getCachedResponse(key: string): Promise<any | null> {
    // Implement Redis-backed caching here (see the ResponseCache sketch below)
    return null;
  }
}
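
The getCachedResponse stub above leaves the Redis part open. A minimal sketch using ioredis (an assumed dependency; REDIS_URL matches the docker-compose file) could look like this:

// src/services/response-cache.ts (sketch, assumes ioredis)
import Redis from 'ioredis';

export class ResponseCache {
  private redis = new Redis(process.env.REDIS_URL || 'redis://localhost:6379');

  async get<T>(key: string): Promise<T | null> {
    const cached = await this.redis.get(key);
    return cached ? (JSON.parse(cached) as T) : null;
  }

  async set(key: string, value: unknown, ttlSeconds = 3600): Promise<void> {
    // EX sets a TTL so stale LLM responses eventually expire
    await this.redis.set(key, JSON.stringify(value), 'EX', ttlSeconds);
  }
}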

Security Best Practices

API Security

// src/middleware/security.ts
import express from 'express';
import rateLimit from 'express-rate-limit';

export const authenticateAPIKey = (req: express.Request, res: express.Response, next: express.NextFunction) => {
  const apiKey = req.headers['x-api-key'];
  
  if (!apiKey) {
    return res.status(401).json({ error: 'API key required' });
  }

  // Validate API key (implement your validation logic)
  if (!isValidAPIKey(apiKey as string)) {
    return res.status(401).json({ error: 'Invalid API key' });
  }

  // Attach user info to the request (augment Express.Request via declaration merging, or cast)
  (req as any).user = getUserFromAPIKey(apiKey as string);
  next();
};

export const sensitiveDataFilter = (req: express.Request, res: express.Response, next: express.NextFunction) => {
  // Remove sensitive data from requests
  if (req.body.message) {
    req.body.message = filterSensitiveData(req.body.message);
  }
  
  next();
};

export const aiSpecificRateLimit = rateLimit({
  windowMs: 60 * 1000, // 1 minute
  max: (req) => {
    // Different limits based on user tier
    const user = (req as any).user;
    switch (user?.tier) {
      case 'enterprise': return 1000;
      case 'pro': return 100;
      default: return 10;
    }
  },
  message: 'AI request rate limit exceeded',
});

// Placeholder implementations: swap in your real API key store and user lookup
function isValidAPIKey(apiKey: string): boolean {
  return apiKey.length > 0;
}

function getUserFromAPIKey(apiKey: string): { id: string; tier: string } {
  return { id: 'anonymous', tier: 'free' };
}

function filterSensitiveData(text: string): string {
  // Remove credit card numbers, SSNs, emails, etc.
  return text
    .replace(/\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g, '[REDACTED-CC]')
    .replace(/\b\d{3}-\d{2}-\d{4}\b/g, '[REDACTED-SSN]')
    .replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g, '[REDACTED-EMAIL]');
}
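
Defining these middlewares is only half the job; they still need to be mounted in front of the AI routes. In the ProductionApp from earlier, that wiring might look like this (the applyAISecurity helper is purely illustrative):

// Example: mounting the security middleware in front of the AI routes (sketch)
import express from 'express';
import { authenticateAPIKey, sensitiveDataFilter, aiSpecificRateLimit } from './middleware/security';

export function applyAISecurity(app: express.Application) {
  // Order matters: authenticate first, then scrub sensitive data, then rate limit per tier
  app.use('/api/', authenticateAPIKey);
  app.use('/api/', sensitiveDataFilter);
  app.use('/api/', aiSpecificRateLimit);
}

Calling applyAISecurity(this.app) inside setupMiddleware(), before the routes are registered, keeps the whole security chain in one place.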

Conclusion

Deploying LLM applications to production requires careful attention to scalability, monitoring, security, and cost optimization. The examples in this guide provide a solid foundation, but remember that production deployments are iterative processes.

Key takeaways for successful LLM deployments:

  1. Start with solid foundations: Proper error handling, logging, and monitoring
  2. Plan for scale: Use containerization and orchestration tools
  3. Monitor everything: AI applications have unique failure modes
  4. Optimize costs: Implement intelligent model selection and caching
  5. Secure by design: Filter sensitive data and implement proper authentication
  6. Iterate and improve: Use metrics to continuously optimize performance

The LLM landscape is evolving rapidly, but these foundational practices will serve you well regardless of which models or services you're using.


Ready to deploy your LLM application? Check out our production deployment checklist to ensure you haven't missed any critical steps.