Distributed Tracing

The system implements OpenTelemetry-based distributed tracing to provide end-to-end visibility across all services and operations. Traces help debug performance issues, understand request flows, and monitor system health.

Tracing Architecture

Core Components

OpenTelemetry SDK: Industry-standard tracing framework
Zipkin Exporter: Sends finished spans to Zipkin for storage and visualization
Auto-Instrumentation: Automatic tracing for common libraries
Custom Spans: Application-specific trace instrumentation

Tracing Stack

Application Code → OpenTelemetry API → SDK → Zipkin Exporter → Zipkin UI
                                   ↓
                           Auto-Instrumentation

Trace Configuration

SDK Initialization

// Initialize before NestJS bootstrap
// NOTE: initializeTracing() is declared async, so tracingSdk holds a
// Promise<NodeSDK | null> here rather than the SDK itself; await it before use.
const tracingSdk = initializeTracing();

/**
 * Builds the OpenTelemetry Node SDK used for distributed tracing.
 *
 * Configuration is read from the environment:
 * - OTEL_TRACE_ENABLED: tracing is skipped only when this is exactly "false".
 * - OTEL_TRACE_SAMPLING: trace sampling ratio in [0, 1]; defaults to 1.0.
 * - OTEL_EXPORTER_ZIPKIN_ENDPOINT: Zipkin span endpoint; defaults to localhost.
 *
 * This function only constructs the SDK; it does not call start() on it.
 *
 * @returns the configured SDK, or null when tracing is disabled.
 */
async function initializeTracing(): Promise<NodeSDK | null> {
  if (process.env.OTEL_TRACE_ENABLED === "false") {
    return null;
  }

  // A malformed OTEL_TRACE_SAMPLING parses to NaN; fall back to sampling
  // everything instead of handing NaN to the sampler. Finite values are
  // clamped into the documented 0.0–1.0 range.
  const parsedRate = Number.parseFloat(process.env.OTEL_TRACE_SAMPLING || "1.0");
  const samplingRate = Number.isFinite(parsedRate)
    ? Math.min(1, Math.max(0, parsedRate))
    : 1.0;

  const zipkinEndpoint = process.env.OTEL_EXPORTER_ZIPKIN_ENDPOINT ||
    "http://localhost:9411/api/v2/spans";

  const zipkinExporter = new ZipkinExporter({ url: zipkinEndpoint });
  const sampler = new TraceIdRatioBasedSampler(samplingRate);

  return new NodeSDK({
    resource: createResource(),
    traceExporter: zipkinExporter,
    spanProcessor: new BatchSpanProcessor(zipkinExporter),
    sampler,
    // Auto-instrumentation for HTTP, PostgreSQL, ioredis and NestJS.
    instrumentations: [getNodeAutoInstrumentations({
      "@opentelemetry/instrumentation-http": { enabled: true },
      "@opentelemetry/instrumentation-pg": { enabled: true },
      "@opentelemetry/instrumentation-ioredis": { enabled: true },
      "@opentelemetry/instrumentation-nestjs-core": { enabled: true },
    })],
  });
}

Environment Configuration

# Tracing control
# Tracing is switched off only when OTEL_TRACE_ENABLED is exactly "false".
OTEL_TRACE_ENABLED=true
OTEL_TRACE_SAMPLING=1.0  # 0.0 to 1.0 (0% to 100%); 1.0 is used when unset

# Zipkin configuration
# The SDK initialization falls back to this same URL when the variable is unset.
OTEL_EXPORTER_ZIPKIN_ENDPOINT=http://localhost:9411/api/v2/spans

# Service metadata
# The service name falls back to "vcecom-backend" when unset.
OTEL_SERVICE_NAME=vcecom-backend
OTEL_SERVICE_VERSION=1.0.0

Automatic Instrumentation

HTTP Requests

// Automatic tracing for all HTTP requests
// This pass-through middleware is illustrative only: it contains no tracing
// code and simply hands the request on to the next handler.
app.use((req, res, next) => {
  // OpenTelemetry automatically creates spans for HTTP requests
  // Spans include: method, url, status code, duration
  next();
});

Database Operations

// Automatic tracing for PostgreSQL queries
// (presumably via the "@opentelemetry/instrumentation-pg" entry enabled at SDK setup)
await db.select().from(products); // Automatically traced

// Automatic tracing for Redis operations
// (presumably via the "@opentelemetry/instrumentation-ioredis" entry enabled at SDK setup)
await redis.get('key'); // Automatically traced

Framework Operations

// NestJS controller methods automatically traced
// NOTE: the constructor that injects `productsService` is omitted from this
// example; the class as shown does not declare that member.
@Controller('products')
export class ProductsController {
  // Handles GET /products by delegating to the products service.
  @Get()
  async findAll() { // Automatically traced
    return this.productsService.findAll();
  }
}

Custom Span Creation

Manual Tracing

import { context, SpanStatusCode, trace } from "@opentelemetry/api";

const tracer = trace.getTracer("vcecom-backend");

/**
 * Creates an order inside a "createOrder" span.
 *
 * The span is opened with startActiveSpan so that it is the active span while
 * the callback runs; code called from here that looks up the active span
 * (e.g. via trace.getActiveSpan()) will find this span and can parent its own
 * spans on it. A span opened with startSpan alone is not made active.
 *
 * @param orderData - order payload; customerId, items and total are recorded
 *   as span attributes.
 * @returns the order produced by processOrder.
 * @throws rethrows any error from processOrder after recording it on the span.
 */
async function createOrder(orderData: CreateOrderDto): Promise<Order> {
  return tracer.startActiveSpan(
    "createOrder",
    {
      attributes: {
        "order.customer_id": orderData.customerId,
        "order.item_count": orderData.items.length,
        "order.total_amount": orderData.total,
      },
    },
    // Arrow function so `this` stays bound to the enclosing function's `this`.
    async (span) => {
      try {
        // Business logic here
        const order = await this.processOrder(orderData);

        span.setAttributes({
          "order.id": order.id,
          "order.status": order.status,
        });

        span.setStatus({ code: SpanStatusCode.OK });
        return order;
      } catch (error: unknown) {
        // Catch variables are `unknown` under strict mode; narrow before use.
        const failure = error instanceof Error ? error : new Error(String(error));
        span.setStatus({
          code: SpanStatusCode.ERROR,
          message: failure.message,
        });
        span.recordException(failure);
        throw error;
      } finally {
        // Always close the span, on success and on failure.
        span.end();
      }
    },
  );
}

Child Spans

/**
 * Runs the inventory check and the payment step of an order, each inside its
 * own child span.
 *
 * Both child spans are parented on the span that is active when this method
 * is entered. A failing step is recorded on its span (exception plus ERROR
 * status) and the original error is rethrown.
 *
 * @param orderData - order payload; `items`, `total` and `payment` are read here.
 */
async processOrder(orderData: CreateOrderDto): Promise<Order> {
  const parentSpan = trace.getActiveSpan();
  // When there is no active span, no explicit parent context is passed.
  const parentContext = parentSpan
    ? trace.setSpan(context.active(), parentSpan)
    : undefined;

  // Create child span for inventory check
  const inventorySpan = tracer.startSpan("checkInventory", {
    attributes: { "operation.type": "validation" }
  }, parentContext);

  try {
    await this.checkInventory(orderData.items);
    inventorySpan.setStatus({ code: SpanStatusCode.OK });
  } catch (error: unknown) {
    const failure = error instanceof Error ? error : new Error(String(error));
    inventorySpan.recordException(failure);
    inventorySpan.setStatus({ code: SpanStatusCode.ERROR, message: failure.message });
    throw error;
  } finally {
    inventorySpan.end();
  }

  // Create child span for payment processing
  const paymentSpan = tracer.startSpan("processPayment", {
    attributes: { "payment.amount": orderData.total }
  }, parentContext);

  try {
    await this.processPayment(orderData.payment);
    paymentSpan.setStatus({ code: SpanStatusCode.OK });
  } catch (error: unknown) {
    const failure = error instanceof Error ? error : new Error(String(error));
    paymentSpan.recordException(failure);
    paymentSpan.setStatus({ code: SpanStatusCode.ERROR, message: failure.message });
    throw error;
  } finally {
    paymentSpan.end();
  }

  // Continue with order creation...
}

Trace Context Propagation

HTTP Headers

// Automatic trace context injection in HTTP requests
// Illustrative only: this shows the shape of the trace headers that end up on
// an outgoing request. Do not hard-code them in real code — a fixed
// `traceparent` value would tie every request to the same trace ID.
const axiosConfig = {
  headers: {
    // Dash-separated fields; NOTE(review): presumably version, trace ID,
    // parent span ID and flags per W3C Trace Context — confirm.
    'traceparent': '00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01',
    // Vendor-specific key=value state carried alongside traceparent.
    'tracestate': 'vendorname=opaqueValue',
  }
};

Database Operations

// Trace context automatically propagated to database queries
// NOTE(review): "propagated" presumably means the query's span joins the
// currently active trace, not that trace headers are sent to the database — confirm.
await db.insert(orders).values(orderData); // Includes trace context

Redis Operations

// Trace context automatically propagated to Redis commands
// The order object is serialized explicitly: passing the object itself would
// not store its fields as a usable value.
await redis.set('order:123', JSON.stringify(orderData)); // Includes trace context

Sampling Strategies

Probability Sampling

// Probability sampling: keep 10% of traces in production and every trace
// elsewhere (e.g. development). A single declaration is used because two
// `const sampler` declarations cannot share a scope.
const sampler = new TraceIdRatioBasedSampler(
  process.env.NODE_ENV === "production" ? 0.1 : 1.0,
);

Custom Sampling Rules

/**
 * Span-name based sampler.
 *
 * - Spans whose name contains "error" are always recorded and sampled.
 * - Spans whose name contains "checkout" are sampled at 50%.
 * - Every other span is dropped.
 *
 * Matching is done on the span name only, which is all this method inspects.
 */
class CustomSampler implements Sampler {
  // Ratio sampler for checkout spans. It derives its decision from the trace
  // ID instead of Math.random(), so spans belonging to the same trace receive
  // the same decision and traces are not left partially sampled.
  private readonly checkoutSampler = new TraceIdRatioBasedSampler(0.5);

  shouldSample(context: Context, traceId: string, spanName: string): SamplingResult {
    // Sample all errors
    if (spanName.includes('error')) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLE };
    }

    // Sample 50% of checkout operations
    if (spanName.includes('checkout')) {
      return this.checkoutSampler.shouldSample(context, traceId);
    }

    // Anything else is not recorded.
    return { decision: SamplingDecision.NOT_RECORD };
  }

  /** Human-readable sampler description, required by the Sampler interface. */
  toString(): string {
    return "CustomSampler";
  }
}

Resource Detection

Service Metadata

/**
 * Builds the resource describing this service instance; it is attached to the
 * SDK and therefore to every exported span.
 *
 * Values come from the environment where available:
 * - service name: OTEL_SERVICE_NAME, else "vcecom-backend"
 * - service version: OTEL_SERVICE_VERSION, else BUILD_INFO.version, else "0.0.1"
 * - environment: NODE_ENV, else "development"
 * - region: DEPLOYMENT_REGION, else "local"
 *
 * @returns a Resource that also carries the host name and process id.
 */
function createResource(): Resource {
  return new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "vcecom-backend",
    // Honour the documented OTEL_SERVICE_VERSION variable before falling back
    // to the build metadata.
    [SemanticResourceAttributes.SERVICE_VERSION]:
      process.env.OTEL_SERVICE_VERSION || BUILD_INFO.version || "0.0.1",
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || "development",
    [SemanticResourceAttributes.CLOUD_REGION]: process.env.DEPLOYMENT_REGION || "local",
    [SemanticResourceAttributes.HOST_NAME]: os.hostname(),
    [SemanticResourceAttributes.PROCESS_PID]: process.pid,
  });
}

Custom Attributes

// Add custom resource attributes
// INSTANCE_ID and CLUSTER_NAME are read from the environment with no fallback,
// so the corresponding attribute is undefined when the variable is unset.
const resource = new Resource({
  "service.instance.id": process.env.INSTANCE_ID,
  "service.cluster": process.env.CLUSTER_NAME,
  "deployment.version": BUILD_INFO.commitHash, // commit hash from the build metadata
});

Zipkin Integration

Trace Visualization

// Traces exported to Zipkin for visualization
const zipkinExporter = new ZipkinExporter({
  // Same local endpoint the SDK initialization uses as its default.
  url: "http://localhost:9411/api/v2/spans",
  headers: {
    // "<token>" is a placeholder; supply the real credential at runtime
    // instead of committing it.
    // NOTE(review): a bearer token sent over plain http is unencrypted — use
    // https outside local development.
    "authorization": "Bearer <token>" // If authentication required
  }
});

Zipkin UI Features

Service Map: Visual representation of service dependencies
Trace Timeline: Detailed span timing and hierarchy
Error Tracking: Failed spans with error details
Performance Analysis: Slow operation identification

Performance Monitoring

Span Metrics

// Automatic span metrics collection
span_duration_seconds: histogram
span_count_total: counter
span_error_total: counter

Custom Metrics

// Business-specific span attributes
span.setAttributes({
  "order.type": "guest_checkout",
  "order.value": order.total,
  "order.items": order.items.length,
  "customer.type": customer.verified ? "verified" : "guest",
});

Error Tracking

Exception Recording

try {
  await riskyOperation();
} catch (error: unknown) {
  // Catch variables are `unknown` under strict mode (and anything can be
  // thrown), so normalize to an Error before reading `.message`.
  const failure = error instanceof Error ? error : new Error(String(error));
  span.recordException(failure);
  span.setStatus({
    code: SpanStatusCode.ERROR,
    message: failure.message,
  });
  // Rethrow the original value so callers see exactly what was thrown.
  throw error;
}

Error Context

// Attach error details to the span. Potentially long values are capped at
// 1000 characters to keep attribute payloads small.
span.setAttributes({
  "error.type": error.name,
  "error.message": error.message,
  "error.stack": error.stack?.substring(0, 1000), // Limit stack trace
  // JSON.stringify yields undefined for an undefined context, hence the `?.`.
  "operation.context": JSON.stringify(operationContext)?.substring(0, 1000), // Limit serialized context
});

Best Practices

Span Naming

Descriptive Names: Use clear, descriptive span names
Consistent Naming: Follow consistent naming conventions
Hierarchical Structure: Reflect operation hierarchy in span names
Avoid Dynamic Names: Don't include IDs in span names

Attribute Standards

Semantic Attributes: Use OpenTelemetry semantic conventions
Consistent Keys: Standardize attribute names across services
Appropriate Values: Include relevant context without sensitive data
Performance: Avoid large attribute values

Span Lifecycle

Proper Cleanup: Always call span.end() in finally blocks
Resource Management: Don't hold references to ended spans
Context Propagation: Ensure trace context flows through async operations
Error Handling: Record exceptions and set appropriate status codes

Sampling Decisions

Business Critical: Sample 100% of critical business operations
High Volume: Use lower sampling rates for high-volume operations
Error Cases: Always sample error scenarios
Development: Sample 100% in development environments

Performance Considerations

Overhead Awareness: Tracing adds a small performance overhead
Sampling Trade-offs: Balance observability with performance
Resource Limits: Configure appropriate span limits and timeouts
Storage Costs: Consider trace storage and retention costs

Security Considerations

Sensitive Data: Don't include PII in span attributes
Access Control: Restrict trace access to authorized personnel
Data Retention: Implement appropriate trace retention policies
Transport Security: Use secure connections for trace export

Troubleshooting

Common Issues

Missing Traces

// Check if tracing is enabled
// (the SDK initialization disables tracing only for the exact string "false";
// `undefined` here means tracing is enabled)
console.log("Tracing enabled:", process.env.OTEL_TRACE_ENABLED);

// Check sampling rate
// (`undefined` here means the default of 1.0 is used)
console.log("Sampling rate:", process.env.OTEL_TRACE_SAMPLING);

// Verify Zipkin connectivity
// Check Zipkin logs for connection errors

Broken Trace Context

// Ensure context propagation
// A falsy result from getActiveSpan() means no span is active at this point in
// the code, so spans created here would not attach to the incoming trace.
const currentSpan = trace.getActiveSpan();
if (!currentSpan) {
  console.warn("No active span found - trace context may be broken");
}

// Check async context preservation
// Ensure AsyncLocalStorage is properly configured

Performance Impact

// Monitor span creation overhead
// NOTE: the measured interval covers startSpan(), the operation itself and
// end(), so `overhead` is the total elapsed time in milliseconds, not the
// tracing cost alone. Compare against an untraced run to isolate the overhead.
const startTime = Date.now();
const span = tracer.startSpan("operation");
// ... operation ...
span.end();
const overhead = Date.now() - startTime;

// Implement sampling for high-frequency operations
// The branch is taken for roughly 1 in 100 calls.
if (Math.random() < 0.01) { // 1% sampling for frequent ops
  // Create span
}

Integration Examples

Database Query Tracing

// Automatic instrumentation traces all database operations;
// no span code is needed around the query builder call.
const orders = await db
  .select()
  .from(ordersTable)
  .where(eq(ordersTable.customerId, customerId));
// Span created automatically with query details
// NOTE(review): "query details" presumably includes the SQL text — confirm that
// bound values are not recorded if they can contain PII.

External API Calls

// Automatic HTTP instrumentation
const response = await axios.get('https://api.payment-gateway.com/charge');
// Span created with HTTP method, URL, status, duration
// Because the URL is recorded on the span, keep secrets and PII out of it.

Redis Operations

// Automatic Redis instrumentation
// The order object is serialized explicitly: passing the object itself would
// not store its fields as a usable value.
await redis.set('order:123', JSON.stringify(orderData));
// Span created with command, key, duration

Message Queue Operations

// Custom instrumentation for message queues
const span = tracer.startSpan("publishMessage", {
  attributes: {
    "messaging.system": "redis",
    "messaging.destination": "order-events",
    "messaging.operation": "publish",
  }
});

try {
  await redis.publish("order-events", JSON.stringify(event));
} catch (error: unknown) {
  // Record the failure on the span before letting it propagate.
  const failure = error instanceof Error ? error : new Error(String(error));
  span.recordException(failure);
  span.setStatus({ code: SpanStatusCode.ERROR, message: failure.message });
  throw error;
} finally {
  // End the span even when publishing throws; otherwise it would stay open.
  span.end();
}

Monitoring & Alerting

Trace-Based Alerts

# Alert on high error rate
# Fires when errored spans exceed 5% of all spans, both taken as 5-minute rates.
- alert: HighErrorRate
  expr: rate(span_error_total[5m]) / rate(span_count_total[5m]) > 0.05
  labels:
    severity: critical

# Alert on slow operations
# Fires when the 95th-percentile span duration over 5 minutes exceeds 5
# (seconds, going by the metric name).
- alert: SlowOperations
  expr: histogram_quantile(0.95, rate(span_duration_seconds_bucket[5m])) > 5
  labels:
    severity: warning

Dashboard Metrics

// Key tracing metrics
total_traces: counter
trace_duration_seconds: histogram
span_count_per_trace: histogram
error_spans_percentage: gauge
sampled_traces_percentage: gauge

Tracing Architecture

Core Components

Tracing Stack

Trace Configuration

SDK Initialization

Environment Configuration

Automatic Instrumentation

HTTP Requests

Database Operations

Framework Operations

Custom Span Creation

Manual Tracing

Child Spans

Trace Context Propagation

HTTP Headers

Database Operations

Redis Operations

Sampling Strategies

Probability Sampling

Custom Sampling Rules

Resource Detection

Service Metadata

Custom Attributes

Zipkin Integration

Trace Visualization

Zipkin UI Features

Performance Monitoring

Span Metrics

Custom Metrics

Error Tracking

Exception Recording

Error Context

Best Practices

Span Naming

Attribute Standards

Span Lifecycle

Sampling Decisions

Performance Considerations

Security Considerations

Troubleshooting

Common Issues

Missing Traces

Broken Trace Context

Performance Impact

Integration Examples

Database Query Tracing

External API Calls

Redis Operations

Message Queue Operations

Monitoring & Alerting

Trace-Based Alerts

Dashboard Metrics

Tracing Architecture

Core Components

Tracing Stack

Trace Configuration

SDK Initialization

Environment Configuration

Automatic Instrumentation

HTTP Requests

Database Operations

Framework Operations

Custom Span Creation

Manual Tracing

Child Spans

Trace Context Propagation

HTTP Headers

Database Operations

Redis Operations

Sampling Strategies

Probability Sampling

Custom Sampling Rules

Resource Detection

Service Metadata

Custom Attributes

Zipkin Integration

Trace Visualization

Zipkin UI Features

Performance Monitoring

Span Metrics

Custom Metrics

Error Tracking

Exception Recording

Error Context

Best Practices

Span Naming

Attribute Standards

Span Lifecycle

Sampling Decisions

Performance Considerations

Security Considerations

Troubleshooting

Common Issues

Missing Traces

Broken Trace Context

Performance Impact

Integration Examples

Database Query Tracing

External API Calls

Redis Operations

Message Queue Operations

Monitoring & Alerting

Trace-Based Alerts

Dashboard Metrics