Introduction

Observability is crucial for understanding and maintaining complex distributed systems. This post shares my experience implementing comprehensive observability solutions across large-scale distributed applications.

The Three Pillars of Observability

1. Metrics

Implementing custom metrics in Go:

// MetricsCollector groups the Prometheus collectors used to instrument the
// service. NewMetricsCollector fills in every field.
type MetricsCollector struct {
    // httpRequestDuration is a histogram labelled by handler, method and status.
    httpRequestDuration *prometheus.HistogramVec
    // errorCounter is a counter labelled by error type.
    errorCounter *prometheus.CounterVec
    // activeRequests is a gauge labelled by handler.
    activeRequests *prometheus.GaugeVec
}

// NewMetricsCollector builds the three metric vectors and returns them wrapped
// in a MetricsCollector.
//
// The vectors are only constructed here; this function does not register them
// with any Prometheus registry, so the caller is responsible for that.
func NewMetricsCollector() *MetricsCollector {
    // Request latency, partitioned by handler, HTTP method and response status.
    // Bucket upper bounds are in seconds and range from 5ms to 10s.
    durations := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration in seconds",
            Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10},
        },
        []string{"handler", "method", "status"},
    )

    // Error count, partitioned by error type.
    errs := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "error_total",
            Help: "Total number of errors by type",
        },
        []string{"type"},
    )

    // In-flight request count, partitioned by handler.
    inFlight := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "active_requests",
            Help: "Number of active requests",
        },
        []string{"handler"},
    )

    collector := &MetricsCollector{
        httpRequestDuration: durations,
        errorCounter:        errs,
        activeRequests:      inFlight,
    }
    return collector
}

2. Distributed Tracing

Implementing OpenTelemetry tracing:

// initTracer creates a Jaeger exporter, wraps it in a tracer provider and
// installs that provider as the global OpenTelemetry tracer provider.
//
// It returns the provider so the caller can keep a handle on it, or the
// exporter construction error unchanged.
//
// NOTE(review): "trace" here must be the package that provides
// NewTracerProvider (presumably the OpenTelemetry SDK trace package) — confirm
// the import alias.
func initTracer() (*trace.TracerProvider, error) {
    collector := jaeger.WithCollectorEndpoint(
        jaeger.WithEndpoint("http://jaeger:14268/api/traces"),
    )
    exporter, err := jaeger.New(collector)
    if err != nil {
        return nil, err
    }

    // Attributes attached to the provider's resource: service name and environment.
    res := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceNameKey.String("my-service"),
        attribute.String("environment", "production"),
    )

    provider := trace.NewTracerProvider(
        trace.WithSampler(trace.AlwaysSample()), // every trace is sampled
        trace.WithBatcher(exporter),
        trace.WithResource(res),
    )
    otel.SetTracerProvider(provider)

    return provider, nil
}

func tracingMiddleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        ctx := r.Context()
        tracer := otel.Tracer("")

        ctx, span := tracer.Start(ctx, r.URL.Path,
            trace.WithAttributes(
                semconv.HTTPMethodKey.String(r.Method),
                semconv.HTTPURLKey.String(r.URL.String()),
            ),
        )
        defer span.End()

        next.ServeHTTP(w, r.WithContext(ctx))
    })
}

3. Structured Logging

Implementing context-aware structured logging:

// LogEntry pairs a zap logger with a set of fields that are added to every
// message logged through it.
type LogEntry struct {
    // logger is the underlying zap logger that emits the entries.
    logger *zap.Logger
    // fields are accumulated by With and appended to each logged message.
    fields []zap.Field
}

// NewLogger builds a LogEntry on top of zap's production configuration, with
// the time field renamed to "timestamp" and encoded as ISO 8601.
//
// It returns the error from building the zap logger unchanged.
func NewLogger() (*LogEntry, error) {
    cfg := zap.NewProductionConfig()
    cfg.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
    cfg.EncoderConfig.TimeKey = "timestamp"

    // One extra caller frame is skipped, presumably so the reported call site is
    // the caller of LogEntry's methods rather than the wrapper itself.
    base, err := cfg.Build(zap.AddCallerSkip(1))
    if err != nil {
        return nil, err
    }

    entry := &LogEntry{logger: base}
    return entry, nil
}

// With returns a new LogEntry that shares the receiver's logger and carries the
// receiver's fields followed by the given fields. The receiver is not modified.
//
// The combined fields are copied into a freshly allocated slice. Appending
// directly to l.fields could write into spare capacity of the receiver's
// backing array, so two entries derived from the same parent would silently
// overwrite each other's fields.
func (l *LogEntry) With(fields ...zap.Field) *LogEntry {
    merged := make([]zap.Field, 0, len(l.fields)+len(fields))
    merged = append(merged, l.fields...)
    merged = append(merged, fields...)

    return &LogEntry{
        logger: l.logger,
        fields: merged,
    }
}

// Error logs msg at error level with the entry's fields plus err attached as a
// zap error field.
//
// The fields are copied into a per-call slice before err is added. Appending to
// l.fields directly could write into the shared backing array, which would be a
// data race when the same entry logs from several goroutines.
func (l *LogEntry) Error(msg string, err error) {
    fields := make([]zap.Field, 0, len(l.fields)+1)
    fields = append(fields, l.fields...)
    fields = append(fields, zap.Error(err))
    l.logger.Error(msg, fields...)
}

Correlation and Context Propagation

Request Context Management

// RequestContext bundles the string identifiers associated with a single
// request so they can be carried together.
type RequestContext struct {
    // Tracing identifiers.
    TraceID, SpanID string
    // Request, user and session identifiers.
    RequestID, UserID, SessionID string
}

// FromContext returns the *RequestContext stored in ctx under
// requestContextKey. If none is stored, or the stored pointer is nil, it
// returns a new empty RequestContext, so the result is never nil.
func FromContext(ctx context.Context) *RequestContext {
    // A nil *RequestContext stored in the context still satisfies the type
    // assertion, so the pointer itself is checked too; otherwise callers would
    // receive nil and panic on the first field access.
    if rc, ok := ctx.Value(requestContextKey).(*RequestContext); ok && rc != nil {
        return rc
    }
    return &RequestContext{}
}

// WithRequestContext returns a context derived from ctx that stores rc under
// requestContextKey, which is the key FromContext reads.
func WithRequestContext(ctx context.Context, rc *RequestContext) context.Context {
    derived := context.WithValue(ctx, requestContextKey, rc)
    return derived
}

Alert Management

Alert Configuration

alerts:
  # rate() yields errors per second for each error_total series, so the
  # threshold is an absolute rate of 0.1 errors/s, not a share of requests.
  - name: high_error_rate
    condition: rate(error_total[5m]) > 0.1
    severity: critical
    annotations:
      summary: High error rate detected
      description: Error rate exceeded 0.1 errors per second over the last 5 minutes

  # histogram_quantile must be fed per-bucket rates aggregated by the "le"
  # label; the bare metric name has no buckets to interpolate over.
  - name: high_latency
    condition: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 1
    severity: warning
    annotations:
      summary: High latency detected
      description: 95th percentile latency exceeded 1 second

Alert Routing

// AlertRouter dispatches alerts to handlers chosen by the alert's severity.
type AlertRouter struct {
    // routes maps a severity to the handlers invoked for alerts of that severity.
    routes map[string][]AlertHandler
}

// Route passes alert to every handler registered for its severity, in order.
//
// It returns an error only when no entry exists for that severity. A failure
// from an individual handler is logged and does not stop the remaining
// handlers or affect the return value.
func (r *AlertRouter) Route(alert *Alert) error {
    severity := alert.Severity

    targets, ok := r.routes[severity]
    if !ok {
        return fmt.Errorf("no handlers for severity: %s", severity)
    }

    for i := range targets {
        err := targets[i].Handle(alert)
        if err == nil {
            continue
        }
        // Best effort: record the failure and keep going.
        log.Printf("Error handling alert: %v", err)
    }
    return nil
}

Performance Analysis

Continuous Profiling

// startProfiler enables mutex and block profiling and starts an HTTP server on
// ":6060" in a background goroutine, serving whatever pprofHandler returns.
//
// It always returns nil. A server failure is only logged from the goroutine,
// and the goroutine runs until ListenAndServe returns.
func startProfiler() error {
    // Sample roughly one in five mutex contention events.
    runtime.SetMutexProfileFraction(5)
    // A rate of 1 records every blocking event.
    runtime.SetBlockProfileRate(1)

    serve := func() {
        srv := &http.Server{
            Addr:    ":6060",
            Handler: pprofHandler(),
        }

        err := srv.ListenAndServe()
        if err != nil {
            log.Printf("Profiler server error: %v", err)
        }
    }
    go serve()

    return nil
}

Dashboarding and Visualization

Custom Grafana Dashboards

{
  "dashboard": {
    "id": null,
    "title": "Service Overview",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "sum by (handler) (rate(http_request_duration_seconds_count[5m]))",
            "legendFormat": "{{handler}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "datasource": "Prometheus",
        "targets": [
          {
            "expr": "rate(error_total[5m])",
            "legendFormat": "{{type}}"
          }
        ]
      }
    ]
  }
}

Conclusion

Building truly observable systems requires a holistic approach combining metrics, traces, and logs. The examples provided here demonstrate practical implementations that can be adapted for various scales of distributed systems. Remember that observability is not just about collecting data—it’s about gaining actionable insights that help maintain and improve system reliability.