Configure Systemd Watchdog
that monitors the reachability of Poseidon and automatically restarts Poseidon if required.
This commit is contained in:

committed by
Sebastian Serth

parent
2d34854450
commit
b48c7fe8b6
19
.github/workflows/ci.yml
vendored
19
.github/workflows/ci.yml
vendored
@ -208,6 +208,10 @@ jobs:
|
||||
./poseidon | tee poseidon.log &
|
||||
until curl -s --fail http://localhost:7200/api/v1/health ; do sleep 1; done
|
||||
make e2e-test
|
||||
- name: Write Environment Variables to file
|
||||
run: |
|
||||
echo "${{ vars }}"
|
||||
if: ${{ success() || failure() }}
|
||||
- name: Setup Poseidon Socket
|
||||
run: |
|
||||
killall poseidon
|
||||
@ -216,13 +220,22 @@ jobs:
|
||||
cat ./.github/workflows/resources/poseidon-minimal.service | envsubst > ~/.config/systemd/user/poseidon.service
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user start poseidon.socket
|
||||
if: ${{ success() || failure() }}
|
||||
- name: Print Poseidon Failure logs
|
||||
if: failure()
|
||||
run: journalctl -xen --no-pager
|
||||
run: journalctl --user -xen --no-pager
|
||||
- name: Run e2e recovery tests
|
||||
run: make e2e-test-recovery
|
||||
if: ${{ success() || failure() }}
|
||||
- name: Print Systemd Failure logs
|
||||
run: |
|
||||
tail -f /var/log/syslog &
|
||||
make e2e-test-recovery
|
||||
/usr/bin/systemctl --user show poseidon.service -p NRestarts
|
||||
journalctl --user -xe -u poseidon.service --no-pager
|
||||
if: failure()
|
||||
- name: Stop Poseidon to flush the coverage file
|
||||
run: |
|
||||
systemctl --user stop poseidon.service poseidon.socket
|
||||
ls -lah ${GOCOVERDIR}
|
||||
if: ${{ success() || failure() }}
|
||||
- name: Convert coverage reports
|
||||
run: make convert-run-coverage
|
||||
|
@ -8,5 +8,10 @@ Requires=poseidon.socket
|
||||
[Service]
|
||||
WorkingDirectory=${GITHUB_WORKSPACE}
|
||||
ExecStart=${GITHUB_WORKSPACE}/poseidon
|
||||
Restart=always
|
||||
Environment="POSEIDON_SERVER_SYSTEMDSOCKETACTIVATION=TRUE"
|
||||
|
||||
Restart=always
|
||||
StartLimitBurst=0
|
||||
|
||||
Type=notify
|
||||
WatchdogSec=5
|
||||
|
@ -2,11 +2,14 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/tls"
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/coreos/go-systemd/v22/activation"
|
||||
"github.com/coreos/go-systemd/v22/daemon"
|
||||
"github.com/getsentry/sentry-go"
|
||||
sentryhttp "github.com/getsentry/sentry-go/http"
|
||||
"github.com/gorilla/mux"
|
||||
"github.com/openHPI/poseidon/internal/api"
|
||||
"github.com/openHPI/poseidon/internal/config"
|
||||
"github.com/openHPI/poseidon/internal/environment"
|
||||
@ -155,11 +158,12 @@ func watchMemoryAndAlert(options config.Profiling) {
|
||||
}
|
||||
}
|
||||
|
||||
func runServer(server *http.Server, cancel context.CancelFunc) {
|
||||
func runServer(router *mux.Router, server *http.Server, cancel context.CancelFunc) {
|
||||
defer cancel()
|
||||
defer shutdownSentry() // shutdownSentry must be executed in the main goroutine.
|
||||
|
||||
httpListeners := getHTTPListeners(server)
|
||||
notifySystemd(router)
|
||||
serveHTTPListeners(server, httpListeners)
|
||||
}
|
||||
|
||||
@ -214,6 +218,67 @@ func serveHTTPListener(server *http.Server, l net.Listener) {
|
||||
}
|
||||
}
|
||||
|
||||
func notifySystemd(router *mux.Router) {
|
||||
notify, err := daemon.SdNotify(false, daemon.SdNotifyReady)
|
||||
switch {
|
||||
case err == nil && !notify:
|
||||
log.Debug("Systemd Readiness Notification not supported")
|
||||
case err != nil:
|
||||
log.WithError(err).WithField("notify", notify).Warn("Failed notifying Readiness to Systemd")
|
||||
default:
|
||||
log.Trace("Notified Readiness to Systemd")
|
||||
}
|
||||
|
||||
interval, err := daemon.SdWatchdogEnabled(false)
|
||||
if err != nil || interval == 0 {
|
||||
log.WithError(err).Error("Systemd Watchdog not supported")
|
||||
return
|
||||
}
|
||||
go notifyWatchdog(context.Background(), router, interval)
|
||||
}
|
||||
|
||||
func notifyWatchdog(ctx context.Context, router *mux.Router, interval time.Duration) {
|
||||
healthRoute, err := router.Get(api.HealthPath).URL()
|
||||
if err != nil {
|
||||
log.WithError(err).Error("Failed to parse Health route")
|
||||
return
|
||||
}
|
||||
// We do not verify the certificate as we (intend to) perform only requests to the local server.
|
||||
tlsConfig := &tls.Config{InsecureSkipVerify: true} // #nosec G402 The default min tls version is secure.
|
||||
client := &http.Client{Transport: &http.Transport{TLSClientConfig: tlsConfig}}
|
||||
|
||||
// notificationIntervalFactor defines how many more notifications we send than required.
|
||||
const notificationIntervalFactor = 2
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-time.After(interval / notificationIntervalFactor):
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, config.Config.Server.URL().String()+healthRoute.String(), http.NoBody)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
// We do not check for resp.StatusCode == 503 as Poseidon's error recovery will try to handle such errors
|
||||
// by itself. The Watchdog should just check that Poseidon handles http requests at all.
|
||||
continue
|
||||
}
|
||||
_ = resp.Body.Close()
|
||||
|
||||
notify, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog)
|
||||
switch {
|
||||
case err == nil && !notify:
|
||||
log.Debug("Systemd Watchdog Notification not supported")
|
||||
case err != nil:
|
||||
log.WithError(err).WithField("notify", notify).Warn("Failed notifying Systemd Watchdog")
|
||||
default:
|
||||
log.Trace("Notified Systemd Watchdog")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type managerCreator func(ctx context.Context) (
|
||||
runnerManager runner.Manager, environmentManager environment.ManagerHandler)
|
||||
|
||||
@ -279,15 +344,19 @@ func createAWSManager(ctx context.Context) (
|
||||
return runnerManager, environment.NewAWSEnvironmentManager(runnerManager)
|
||||
}
|
||||
|
||||
// initServer builds the http server and configures it with the chain of responsibility for multiple managers.
|
||||
func initServer(ctx context.Context) *http.Server {
|
||||
// initRouter builds a router that serves the API with the chain of responsibility for multiple managers.
|
||||
func initRouter(ctx context.Context) *mux.Router {
|
||||
runnerManager, environmentManager := createManagerHandler(createNomadManager, config.Config.Nomad.Enabled,
|
||||
nil, nil, ctx)
|
||||
runnerManager, environmentManager = createManagerHandler(createAWSManager, config.Config.AWS.Enabled,
|
||||
runnerManager, environmentManager, ctx)
|
||||
|
||||
handler := api.NewRouter(runnerManager, environmentManager)
|
||||
sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(handler)
|
||||
return api.NewRouter(runnerManager, environmentManager)
|
||||
}
|
||||
|
||||
// initServer creates a server that serves the routes provided by the router.
|
||||
func initServer(router *mux.Router) *http.Server {
|
||||
sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(router)
|
||||
|
||||
return &http.Server{
|
||||
Addr: config.Config.Server.URL().Host,
|
||||
@ -347,7 +416,8 @@ func main() {
|
||||
go watchMemoryAndAlert(config.Config.Profiling)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
server := initServer(ctx)
|
||||
go runServer(server, cancel)
|
||||
router := initRouter(ctx)
|
||||
server := initServer(router)
|
||||
go runServer(router, server, cancel)
|
||||
shutdownOnOSSignal(server, ctx, stopProfiling)
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ func (s *MainTestSuite) TestShutdownOnOSSignal_Profiling() {
|
||||
s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal waits for an exit after stopping the profiling.
|
||||
s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal triggers a os.Signal Goroutine.
|
||||
|
||||
server := initServer(disableRecovery)
|
||||
server := initServer(initRouter(disableRecovery))
|
||||
go shutdownOnOSSignal(server, context.Background(), func() {
|
||||
called = true
|
||||
})
|
||||
|
@ -14,6 +14,9 @@ import (
|
||||
"github.com/stretchr/testify/suite"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
@ -120,3 +123,19 @@ func (s *E2ERecoveryTestSuite) TestEnvironmentStatistics() {
|
||||
s.Equal(uint(PrewarmingPoolSize), environmentStatistics.IdleRunners)
|
||||
s.Equal(uint(1), environmentStatistics.UsedRunners)
|
||||
}
|
||||
|
||||
func (s *E2ERecoveryTestSuite) TestWatchdogNotifications() {
|
||||
// Wait for `WatchdogSec` to be passed.
|
||||
<-time.After((5 + 1) * time.Second)
|
||||
|
||||
// If the Watchdog has not received the notification by now it will restart Poseidon.
|
||||
cmd := exec.Command("/usr/bin/systemctl", "--user", "show", "poseidon.service", "-p", "NRestarts")
|
||||
s.Require().NoError(cmd.Err)
|
||||
out, err := cmd.Output()
|
||||
s.Require().NoError(err)
|
||||
|
||||
restarts, err := strconv.Atoi(strings.Trim(strings.ReplaceAll(string(out), "NRestarts=", ""), "\n"))
|
||||
s.Require().NoError(err)
|
||||
// If Poseidon would not notify the systemd watchdog, we would have one more restart than expected.
|
||||
s.Equal(PoseidonRestartCount, restarts)
|
||||
}
|
||||
|
@ -46,6 +46,8 @@ func waitForPoseidon() {
|
||||
}
|
||||
}
|
||||
|
||||
var PoseidonRestartCount = 0
|
||||
|
||||
func killPoseidon() {
|
||||
processes, err := process.Processes()
|
||||
if err != nil {
|
||||
@ -62,6 +64,7 @@ func killPoseidon() {
|
||||
log.WithError(err).Error("Error killing Poseidon")
|
||||
} else {
|
||||
log.Info("Killed Poseidon")
|
||||
PoseidonRestartCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user