poseidon/pkg/monitoring/influxdb2_middleware.go

package monitoring

import (
	"bytes"
	"context"
	"github.com/gorilla/mux"
	influxdb2 "github.com/influxdata/influxdb-client-go/v2"
	influxdb2API "github.com/influxdata/influxdb-client-go/v2/api"
	http2 "github.com/influxdata/influxdb-client-go/v2/api/http"
	"github.com/influxdata/influxdb-client-go/v2/api/write"
	"github.com/openHPI/poseidon/internal/config"
	"github.com/openHPI/poseidon/pkg/dto"
	"github.com/openHPI/poseidon/pkg/logging"
	"io"
	"net/http"
	"strconv"
	"time"
)

const (
	// influxdbContextKey is a key (dto.ContextKey) to reference the InfluxDB data point in the request context.
	influxdbContextKey dto.ContextKey = "influxdb data point"
	// measurementPrefix allows easier filtering in InfluxDB.
	measurementPrefix           = "poseidon_"
	measurementPoolSize         = measurementPrefix + "poolsize"
	MeasurementNomadEvents      = measurementPrefix + "nomad_events"
	MeasurementNomadAllocations = measurementPrefix + "nomad_allocations"
	MeasurementIdleRunnerNomad  = measurementPrefix + "nomad_idle_runners"
	MeasurementExecutionsAWS    = measurementPrefix + "aws_executions"
	MeasurementExecutionsNomad  = measurementPrefix + "nomad_executions"
	MeasurementEnvironments     = measurementPrefix + "environments"
	MeasurementUsedRunner       = measurementPrefix + "used_runners"
	MeasurementFileDownload     = measurementPrefix + "file_download"

	// The keys for the monitored tags and fields.
	InfluxKeyRunnerID                      = dto.KeyRunnerID
	InfluxKeyEnvironmentID                 = dto.KeyEnvironmentID
	InfluxKeyJobID                         = "job_id"
	InfluxKeyClientStatus                  = "client_status"
	InfluxKeyNomadNode                     = "nomad_agent"
	InfluxKeyActualContentLength           = "actual_length"
	InfluxKeyExpectedContentLength         = "expected_length"
	InfluxKeyDuration                      = "duration"
	InfluxKeyStartupDuration               = "startup_" + InfluxKeyDuration
	influxKeyEnvironmentPrewarmingPoolSize = "prewarming_pool_size"
	influxKeyRequestSize                   = "request_size"
)

var (
	log = logging.GetLogger("monitoring")

	// influxClient is the write API used to send data points; it stays nil if InfluxDB is not configured.
	influxClient influxdb2API.WriteAPI
)
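
// InitializeInfluxDB creates the client for the InfluxDB write API if a URL is configured
// and returns a cancel function that flushes buffered data points and closes the client.
// A minimal sketch of the intended call site (the deferred cancel is an assumption, not
// prescribed by this package):
//
//	cancel := InitializeInfluxDB(&config.Config.InfluxDB)
//	defer cancel()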
func InitializeInfluxDB(db *config.InfluxDB) (cancel func()) {
	if db.URL == "" {
		return func() {}
	}

	// How often to retry to write data.
	const maxRetries = 50
	// How long to wait before retrying to write data.
	const retryInterval = 5 * time.Second
	// How old the data can be before we stop retrying to write it. Should be larger than maxRetries * retryInterval.
	const retryExpire = 10 * time.Minute
	// How many batches are buffered before dropping the oldest.
	const retryBufferLimit = 100_000

	// Set options for retrying with the influx client.
	options := influxdb2.DefaultOptions()
	options.SetRetryInterval(uint(retryInterval.Milliseconds()))
	options.SetMaxRetries(maxRetries)
	options.SetMaxRetryTime(uint(retryExpire.Milliseconds()))
	options.SetRetryBufferLimit(retryBufferLimit)

	// Create a new influx client.
	client := influxdb2.NewClientWithOptions(db.URL, db.Token, options)
	influxClient = client.WriteAPI(db.Organization, db.Bucket)
	influxClient.SetWriteFailedCallback(func(_ string, err http2.Error, retryAttempts uint) bool {
		log.WithError(&err).WithField("retryAttempts", retryAttempts).Trace("Retrying to write influx data...")
		// retryAttempts is the number of retries performed so far; it is 0 if the first write failed.
		if retryAttempts == options.MaxRetries() {
			log.WithError(&err).Warn("Could not write influx data.")
			return false // Disable retry; the data could not be written within the retry limit.
		}
		return true // Enable retry (default).
	})

	// Flush the influx client on shutdown.
	cancel = func() {
		influxClient.Flush()
		influxClient = nil
		client.Close()
	}

	return cancel
}

// InfluxDB2Middleware is a middleware to send events to an InfluxDB instance.
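// A typical registration with gorilla/mux might look like this (the router setup is
// an assumption, not part of this file):
//
//	router := mux.NewRouter()
//	router.Use(InfluxDB2Middleware)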
func InfluxDB2Middleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		route := mux.CurrentRoute(r).GetName()
		p := influxdb2.NewPointWithMeasurement(measurementPrefix + route)

		start := time.Now().UTC()
		p.SetTime(time.Now())

		// Attach the data point to the request context so that handlers can enrich it.
		ctx := context.WithValue(r.Context(), influxdbContextKey, p)
		requestWithPoint := r.WithContext(ctx)
		lrw := logging.NewLoggingResponseWriter(w)
		next.ServeHTTP(lrw, requestWithPoint)

		p.AddField(InfluxKeyDuration, time.Now().UTC().Sub(start).Nanoseconds())
		p.AddTag("status", strconv.Itoa(lrw.StatusCode))
		WriteInfluxPoint(p)
	})
}

// AddRunnerMonitoringData adds the data of the runner we want to monitor.
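// A hypothetical call site inside a runner request handler (the runner accessors are
// assumptions, not defined in this file):
//
//	AddRunnerMonitoringData(request, runner.ID(), runner.Environment())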
func AddRunnerMonitoringData(request *http.Request, runnerID string, environmentID dto.EnvironmentID) {
	addRunnerID(request, runnerID)
	addEnvironmentID(request, environmentID)
}

// addRunnerID adds the runner id to the influx data point for the current request.
func addRunnerID(r *http.Request, id string) {
	addInfluxDBTag(r, InfluxKeyRunnerID, id)
}

// addEnvironmentID adds the environment id to the influx data point for the current request.
func addEnvironmentID(r *http.Request, id dto.EnvironmentID) {
	addInfluxDBTag(r, InfluxKeyEnvironmentID, id.ToString())
}

// AddRequestSize adds the size of the request body to the influx data point for the current request.
func AddRequestSize(r *http.Request) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		log.WithContext(r.Context()).WithError(err).Debug("Failed to read request body")
		return
	}

	err = r.Body.Close()
	if err != nil {
		log.WithContext(r.Context()).WithError(err).Debug("Failed to close request body")
		return
	}

	// Restore the body so that subsequent handlers can still read it.
	r.Body = io.NopCloser(bytes.NewBuffer(body))
	addInfluxDBField(r, influxKeyRequestSize, len(body))
}
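
// ChangedPrewarmingPoolSize records the updated prewarming pool size of the environment.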
func ChangedPrewarmingPoolSize(id dto.EnvironmentID, count uint) {
	p := influxdb2.NewPointWithMeasurement(measurementPoolSize)
	p.AddTag(InfluxKeyEnvironmentID, id.ToString())
	p.AddField(influxKeyEnvironmentPrewarmingPoolSize, count)
	WriteInfluxPoint(p)
}

// WriteInfluxPoint schedules the influx data point to be sent.
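// If no InfluxDB client is configured, the point is logged at trace level instead,
// except for periodic events, which are dropped to avoid log spam.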
func WriteInfluxPoint(p *write.Point) {
	if influxClient != nil {
		p.AddTag("stage", config.Config.InfluxDB.Stage)
		// We identified that the influxClient is not truly asynchronous. See #541.
		go func() { influxClient.WritePoint(p) }()
	} else {
		entry := log.WithField("name", p.Name())
		for _, tag := range p.TagList() {
			if tag.Key == "event_type" && tag.Value == "periodically" {
				// Skip logging periodic events to avoid spamming the log.
				return
			}
			entry = entry.WithField(tag.Key, tag.Value)
		}
		for _, field := range p.FieldList() {
			entry = entry.WithField(field.Key, field.Value)
		}
		entry.Trace("Influx data point")
	}
}

// addInfluxDBTag adds a tag to the influxdb data point in the request.
func addInfluxDBTag(r *http.Request, key, value string) {
	dataPointFromRequest(r).AddTag(key, value)
}

// addInfluxDBField adds a field to the influxdb data point in the request.
func addInfluxDBField(r *http.Request, key string, value interface{}) {
	dataPointFromRequest(r).AddField(key, value)
}

// dataPointFromRequest returns the data point from the context of the passed request.
func dataPointFromRequest(r *http.Request) *write.Point {
	p, ok := r.Context().Value(influxdbContextKey).(*write.Point)
	if !ok {
		log.WithContext(r.Context()).Error("All HTTP requests must contain an InfluxDB data point!")
	}
	return p
}