Distributed Tracing
The system implements OpenTelemetry-based distributed tracing to provide end-to-end visibility across all services and operations. Traces help debug performance issues, understand request flows, and monitor system health.
Tracing Architecture
Core Components
- OpenTelemetry SDK: Industry-standard tracing framework
- Zipkin Exporter: Sends spans to Zipkin, which provides trace storage and visualization
- Auto-Instrumentation: Automatic tracing for common libraries
- Custom Spans: Application-specific trace instrumentation
Tracing Stack
Application Code → OpenTelemetry API → SDK → Zipkin Exporter → Zipkin UI
↓
Auto-Instrumentation
Trace Configuration
SDK Initialization
// Initialize before NestJS bootstrap
// NOTE: initializeTracing() is async, so `tracingSdk` holds a
// Promise<NodeSDK | null>; await it before calling any SDK method.
const tracingSdk = initializeTracing();

/**
 * Builds the OpenTelemetry Node SDK configured to export spans to Zipkin.
 *
 * Reads OTEL_TRACE_ENABLED, OTEL_TRACE_SAMPLING and
 * OTEL_EXPORTER_ZIPKIN_ENDPOINT from the environment.
 *
 * @returns the configured SDK instance (this function does not start it), or
 *   null when OTEL_TRACE_ENABLED is exactly "false".
 */
async function initializeTracing(): Promise<NodeSDK | null> {
  if (process.env.OTEL_TRACE_ENABLED === "false") {
    return null;
  }

  // A non-numeric OTEL_TRACE_SAMPLING parses to NaN; fall back to the
  // documented default (1.0) instead of handing NaN to the sampler, and clamp
  // numeric values to the documented 0.0–1.0 range.
  const parsedRate = Number.parseFloat(process.env.OTEL_TRACE_SAMPLING || "1.0");
  let samplingRate = 1.0;
  if (Number.isFinite(parsedRate)) {
    samplingRate = Math.min(1, Math.max(0, parsedRate));
  } else {
    console.warn(
      `Invalid OTEL_TRACE_SAMPLING value "${process.env.OTEL_TRACE_SAMPLING}"; using 1.0`,
    );
  }

  const zipkinEndpoint =
    process.env.OTEL_EXPORTER_ZIPKIN_ENDPOINT || "http://localhost:9411/api/v2/spans";
  const zipkinExporter = new ZipkinExporter({ url: zipkinEndpoint });
  const sampler = new TraceIdRatioBasedSampler(samplingRate);

  return new NodeSDK({
    resource: createResource(),
    // NOTE(review): both traceExporter and spanProcessor are supplied with the
    // same exporter; confirm whether one of them is redundant for the SDK
    // version in use.
    traceExporter: zipkinExporter,
    spanProcessor: new BatchSpanProcessor(zipkinExporter),
    sampler,
    instrumentations: [
      getNodeAutoInstrumentations({
        "@opentelemetry/instrumentation-http": { enabled: true },
        "@opentelemetry/instrumentation-pg": { enabled: true },
        "@opentelemetry/instrumentation-ioredis": { enabled: true },
        "@opentelemetry/instrumentation-nestjs-core": { enabled: true },
      }),
    ],
  });
}
Environment Configuration
# Tracing control
OTEL_TRACE_ENABLED=true
OTEL_TRACE_SAMPLING=1.0 # 0.0 to 1.0 (0% to 100%)
# Zipkin configuration
OTEL_EXPORTER_ZIPKIN_ENDPOINT=http://localhost:9411/api/v2/spans
# Service metadata
OTEL_SERVICE_NAME=vcecom-backend
OTEL_SERVICE_VERSION=1.0.0
Automatic Instrumentation
HTTP Requests
// Automatic tracing for all HTTP requests
// This middleware is illustrative only: it contains no tracing code and simply
// passes the request on.
app.use((req, res, next) => {
// OpenTelemetry automatically creates spans for HTTP requests
// Spans include: method, url, status code, duration
next();
});
Database Operations
// Automatic tracing for PostgreSQL queries
// NOTE(review): presumably relies on the pg instrumentation being enabled at
// SDK initialization — confirm `db` is backed by the pg driver.
await db.select().from(products); // Automatically traced
// Automatic tracing for Redis operations
// NOTE(review): presumably relies on the ioredis instrumentation — confirm
// `redis` is an ioredis client.
await redis.get('key'); // Automatically traced
Framework Operations
// NestJS controller methods automatically traced
// NOTE(review): `productsService` is not declared in this excerpt — presumably
// injected through the constructor; confirm against the real controller.
@Controller('products')
export class ProductsController {
@Get()
async findAll() { // Automatically traced
return this.productsService.findAll();
}
}
Custom Span Creation
Manual Tracing
import { SpanStatusCode, trace } from "@opentelemetry/api";

const tracer = trace.getTracer("vcecom-backend");

/**
 * Creates an order inside a "createOrder" span.
 *
 * The span is started with `startActiveSpan`, so it is the active span for the
 * duration of the callback; code called from here that uses
 * `trace.getActiveSpan()` or starts its own spans is parented to it.
 *
 * @param orderData - order payload; customer id, item count and total are
 *   recorded as span attributes.
 * @returns the order returned by `processOrder`.
 * @throws rethrows any error from `processOrder` after recording it on the span.
 */
async function createOrder(orderData: CreateOrderDto): Promise<Order> {
  return tracer.startActiveSpan(
    "createOrder",
    {
      attributes: {
        "order.customer_id": orderData.customerId,
        "order.item_count": orderData.items.length,
        "order.total_amount": orderData.total,
      },
    },
    // Arrow function keeps the surrounding `this`, as the original body used it.
    async (span) => {
      try {
        // Business logic here
        const order = await this.processOrder(orderData);
        span.setAttributes({
          "order.id": order.id,
          "order.status": order.status,
        });
        span.setStatus({ code: SpanStatusCode.OK });
        return order;
      } catch (error) {
        // The caught value is not guaranteed to be an Error; narrow before use.
        const message = error instanceof Error ? error.message : String(error);
        span.setStatus({ code: SpanStatusCode.ERROR, message });
        span.recordException(error instanceof Error ? error : message);
        throw error;
      } finally {
        // Always end the span, on success and on failure.
        span.end();
      }
    },
  );
}
Child Spans
/**
 * Runs the inventory check and payment steps of an order, each in its own
 * child span.
 *
 * No explicit parent context is passed to `startSpan`: when the context
 * argument is omitted the tracer uses the currently active context, so both
 * spans become children of whatever span is active when this method runs.
 *
 * Failures in either step are recorded on that step's span and rethrown.
 */
async processOrder(orderData: CreateOrderDto): Promise<Order> {
  // Create child span for inventory check
  const inventorySpan = tracer.startSpan("checkInventory", {
    attributes: { "operation.type": "validation" },
  });
  try {
    await this.checkInventory(orderData.items);
    inventorySpan.setStatus({ code: SpanStatusCode.OK });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    inventorySpan.setStatus({ code: SpanStatusCode.ERROR, message });
    inventorySpan.recordException(error instanceof Error ? error : message);
    throw error;
  } finally {
    inventorySpan.end();
  }

  // Create child span for payment processing
  const paymentSpan = tracer.startSpan("processPayment", {
    attributes: { "payment.amount": orderData.total },
  });
  try {
    await this.processPayment(orderData.payment);
    paymentSpan.setStatus({ code: SpanStatusCode.OK });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    paymentSpan.setStatus({ code: SpanStatusCode.ERROR, message });
    paymentSpan.recordException(error instanceof Error ? error : message);
    throw error;
  } finally {
    paymentSpan.end();
  }
  // Continue with order creation...
}
Trace Context Propagation
HTTP Headers
// Automatic trace context injection in HTTP requests
// The object below only illustrates the headers involved; the values are
// hard-coded examples.
// NOTE(review): with automatic injection these headers should not need to be
// set by hand — confirm this config is not meant to be copied verbatim.
const axiosConfig = {
headers: {
// NOTE(review): looks like W3C Trace Context format
// (version-traceId-parentSpanId-flags) — confirm.
'traceparent': '00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01',
'tracestate': 'vendorname=opaqueValue',
}
};
Database Operations
// Trace context automatically propagated to database queries
// NOTE(review): presumably this means the query span is created as a child of
// the active span, not that context is sent to the database — confirm.
await db.insert(orders).values(orderData); // Includes trace context
Redis Operations
// Trace context automatically propagated to Redis commands
// orderData is an object, so serialize it before storing; Redis values are
// strings, and the publish example in this document serializes the same way.
await redis.set('order:123', JSON.stringify(orderData)); // Includes trace context
Sampling Strategies
Probability Sampling
// The two alternatives are given distinct names so the snippet compiles as
// written (two `const sampler` declarations in one scope are a redeclaration
// error). Pass whichever one applies as the SDK's `sampler` option.

// Sample 10% of all traces
const tenPercentSampler = new TraceIdRatioBasedSampler(0.1);
// Sample all traces (development)
const fullSampler = new TraceIdRatioBasedSampler(1.0);
Custom Sampling Rules
/**
 * Name-based head sampler.
 *
 * - Spans whose name contains "error" are always sampled. The decision is made
 *   when the span starts, so it is driven by the span name only, not by
 *   whether the operation later fails.
 * - Spans whose name contains "checkout" are sampled for roughly half of all
 *   trace ids.
 * - Everything else is dropped.
 */
class CustomSampler implements Sampler {
  /**
   * @param context - parent context (unused).
   * @param traceId - hex trace id of the span being started.
   * @param spanName - name of the span being started.
   * @returns the sampling decision for the span.
   */
  shouldSample(context: Context, traceId: string, spanName: string): SamplingResult {
    // Always sample spans named as errors
    if (spanName.includes('error')) {
      return { decision: SamplingDecision.RECORD_AND_SAMPLE };
    }

    // Sample 50% of checkout operations.
    // The decision is derived from the trace id rather than Math.random() so
    // every checkout span in the same trace gets the same answer; a per-span
    // coin flip can keep some spans of a trace and drop others.
    if (spanName.includes('checkout')) {
      const bucket = Number.parseInt(traceId.slice(0, 8), 16);
      // The first 32 bits of the trace id fall below 2^31 about half the time.
      // A malformed id parses to NaN, which compares false and is not recorded.
      return bucket < 0x80000000
        ? { decision: SamplingDecision.RECORD_AND_SAMPLE }
        : { decision: SamplingDecision.NOT_RECORD };
    }

    // Default: do not record
    return { decision: SamplingDecision.NOT_RECORD };
  }

  /** Human-readable sampler description, required by the Sampler interface. */
  toString(): string {
    return 'CustomSampler';
  }
}
Resource Detection
Service Metadata
/**
 * Builds the OpenTelemetry resource describing this service instance.
 *
 * Service name and version come from OTEL_SERVICE_NAME / OTEL_SERVICE_VERSION
 * when set; the version otherwise falls back to BUILD_INFO.version and then to
 * "0.0.1". Environment and region come from NODE_ENV and DEPLOYMENT_REGION,
 * with "development" and "local" as defaults.
 *
 * @returns a Resource carrying service, deployment, host and process attributes.
 */
function createResource(): Resource {
  const serviceName = process.env.OTEL_SERVICE_NAME || "vcecom-backend";
  // Honor the documented OTEL_SERVICE_VERSION variable before the build
  // metadata; previously the variable was never read.
  const serviceVersion = process.env.OTEL_SERVICE_VERSION || BUILD_INFO.version || "0.0.1";

  return new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
    [SemanticResourceAttributes.SERVICE_VERSION]: serviceVersion,
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || "development",
    [SemanticResourceAttributes.CLOUD_REGION]: process.env.DEPLOYMENT_REGION || "local",
    [SemanticResourceAttributes.HOST_NAME]: os.hostname(),
    [SemanticResourceAttributes.PROCESS_PID]: process.pid,
  });
}
Custom Attributes
// Add custom resource attributes
// Each value is read as-is, with no fallback: INSTANCE_ID and CLUSTER_NAME are
// undefined when the environment variables are not set.
const resource = new Resource({
"service.instance.id": process.env.INSTANCE_ID,
"service.cluster": process.env.CLUSTER_NAME,
"deployment.version": BUILD_INFO.commitHash,
});
Zipkin Integration
Trace Visualization
// Traces exported to Zipkin for visualization
// The URL is hard-coded here for illustration; "<token>" is a placeholder and
// must be replaced with a real credential when authentication is required.
const zipkinExporter = new ZipkinExporter({
url: "http://localhost:9411/api/v2/spans",
headers: {
"authorization": "Bearer <token>" // If authentication required
}
});
Zipkin UI Features
- Service Map: Visual representation of service dependencies
- Trace Timeline: Detailed span timing and hierarchy
- Error Tracking: Failed spans with error details
- Performance Analysis: Slow operation identification
Performance Monitoring
Span Metrics
// Automatic span metrics collection
span_duration_seconds: histogram
span_count_total: counter
span_error_total: counter
Custom Metrics
// Business-specific span attributes
// `span`, `order` and `customer` are assumed to be in scope at the call site.
span.setAttributes({
"order.type": "guest_checkout",
"order.value": order.total,
"order.items": order.items.length,
// Derived from the customer's `verified` flag.
"customer.type": customer.verified ? "verified" : "guest",
});
Error Tracking
Exception Recording
try {
  await riskyOperation();
} catch (error) {
  // The caught value is not guaranteed to be an Error; narrow before reading
  // `.message` so this also works when a non-Error value is thrown.
  const message = error instanceof Error ? error.message : String(error);
  span.recordException(error instanceof Error ? error : message);
  span.setStatus({
    code: SpanStatusCode.ERROR,
    message,
  });
  // Rethrow so callers still see the failure.
  throw error;
}
Error Context
// Attach error details to the span.
// `error` is assumed to be an Error instance here (name/message/stack are read).
span.setAttributes({
"error.type": error.name,
"error.message": error.message,
"error.stack": error.stack?.substring(0, 1000), // Limit stack trace
// NOTE(review): the serialized context is not size-limited — confirm
// operationContext is small and contains no sensitive data.
"operation.context": JSON.stringify(operationContext),
});
Best Practices
Span Naming
- Descriptive Names: Use clear, descriptive span names
- Consistent Naming: Follow consistent naming conventions
- Hierarchical Structure: Reflect operation hierarchy in span names
- Avoid Dynamic Names: Don't include IDs in span names
Attribute Standards
- Semantic Attributes: Use OpenTelemetry semantic conventions
- Consistent Keys: Standardize attribute names across services
- Appropriate Values: Include relevant context without sensitive data
- Performance: Avoid large attribute values
Span Lifecycle
- Proper Cleanup: Always call `span.end()` in finally blocks
- Resource Management: Don't hold references to ended spans
- Context Propagation: Ensure trace context flows through async operations
- Error Handling: Record exceptions and set appropriate status codes
Sampling Decisions
- Business Critical: Sample 100% of critical business operations
- High Volume: Use lower sampling rates for high-volume operations
- Error Cases: Always sample error scenarios
- Development: Sample 100% in development environments
Performance Considerations
- Overhead Awareness: Tracing adds small performance overhead
- Sampling Trade-offs: Balance observability with performance
- Resource Limits: Configure appropriate span limits and timeouts
- Storage Costs: Consider trace storage and retention costs
Security Considerations
- Sensitive Data: Don't include PII in span attributes
- Access Control: Restrict trace access to authorized personnel
- Data Retention: Implement appropriate trace retention policies
- Transport Security: Use secure connections for trace export
Troubleshooting
Common Issues
Missing Traces
// Check if tracing is enabled
// (prints the raw environment value; undefined means the variable is unset)
console.log("Tracing enabled:", process.env.OTEL_TRACE_ENABLED);
// Check sampling rate
// (prints the raw environment value, before any parsing)
console.log("Sampling rate:", process.env.OTEL_TRACE_SAMPLING);
// Verify Zipkin connectivity
// Check Zipkin logs for connection errors
Broken Trace Context
// Ensure context propagation
// Warn when no span is active at this point in the code.
const currentSpan = trace.getActiveSpan();
if (!currentSpan) {
console.warn("No active span found - trace context may be broken");
}
// Check async context preservation
// Ensure AsyncLocalStorage is properly configured
Performance Impact
// Monitor span creation overhead
// Time only the tracing calls (startSpan and end), not the operation between
// them; timing across the whole block would measure the operation itself.
// process.hrtime.bigint() is used because span calls are far shorter than the
// 1 ms resolution of Date.now().
const startSpanBegin = process.hrtime.bigint();
const span = tracer.startSpan("operation");
const startSpanNs = process.hrtime.bigint() - startSpanBegin;
// ... operation ...
const endSpanBegin = process.hrtime.bigint();
span.end();
const endSpanNs = process.hrtime.bigint() - endSpanBegin;
// Total tracing overhead in milliseconds.
const overhead = Number(startSpanNs + endSpanNs) / 1e6;
// Implement sampling for high-frequency operations
if (Math.random() < 0.01) { // 1% sampling for frequent ops
  // Create span
}
Integration Examples
Database Query Tracing
// Automatic instrumentation traces all database operations
// Example: select the orders belonging to one customer.
const orders = await db
.select()
.from(ordersTable)
.where(eq(ordersTable.customerId, customerId));
// Span created automatically with query details
External API Calls
// Automatic HTTP instrumentation
// Example outbound request; no tracing code is written at the call site.
const response = await axios.get('https://api.payment-gateway.com/charge');
// Span created with HTTP method, URL, status, duration
Redis Operations
// Automatic Redis instrumentation
// orderData is an object, so serialize it before storing; Redis values are
// strings, and the publish example in this document serializes the same way.
await redis.set('order:123', JSON.stringify(orderData));
// Span created with command, key, duration
Message Queue Operations
// Custom instrumentation for message queues
const span = tracer.startSpan("publishMessage", {
  attributes: {
    "messaging.system": "redis",
    "messaging.destination": "order-events",
    "messaging.operation": "publish",
  },
});
try {
  await redis.publish("order-events", JSON.stringify(event));
} catch (error) {
  // Record the failure on the span before propagating it.
  const message = error instanceof Error ? error.message : String(error);
  span.recordException(error instanceof Error ? error : message);
  span.setStatus({ code: SpanStatusCode.ERROR, message });
  throw error;
} finally {
  // End the span even when publish throws, so it is never left open.
  span.end();
}
Monitoring & Alerting
Trace-Based Alerts
# Alert on high error rate
# Fires when error spans exceed 5% of all spans, comparing 5-minute rates.
- alert: HighErrorRate
  expr: rate(span_error_total[5m]) / rate(span_count_total[5m]) > 0.05
  labels:
    severity: critical
# Alert on slow operations
# Fires when the 95th-percentile span duration over 5 minutes exceeds 5 seconds.
- alert: SlowOperations
  expr: histogram_quantile(0.95, rate(span_duration_seconds_bucket[5m])) > 5
  labels:
    severity: warning
Dashboard Metrics
// Key tracing metrics
total_traces: counter
trace_duration_seconds: histogram
span_count_per_trace: histogram
error_spans_percentage: gauge
sampled_traces_percentage: gauge