Configure Systemd Watchdog
The watchdog monitors the reachability of Poseidon and automatically restarts it if required.
committed by Sebastian Serth
parent 2d34854450
commit b48c7fe8b6
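For context: with `Type=notify` and `WatchdogSec=` set, systemd expects the service to send `READY=1` once at startup and then `WATCHDOG=1` pings over `$NOTIFY_SOCKET` more frequently than the configured timeout; if the pings stop, systemd considers the service hung and, together with `Restart=always`, restarts it. A minimal, self-contained sketch of that handshake with the same go-systemd library the commit uses (Poseidon-specific wiring such as the health check is omitted):

```go
package main

import (
	"time"

	"github.com/coreos/go-systemd/v22/daemon"
)

func main() {
	// Tell systemd that the service finished starting up (Type=notify).
	_, _ = daemon.SdNotify(false, daemon.SdNotifyReady)

	// WatchdogSec= is handed to the service as WATCHDOG_USEC; SdWatchdogEnabled
	// returns it as a time.Duration, or 0 if no watchdog is configured.
	interval, err := daemon.SdWatchdogEnabled(false)
	if err != nil || interval == 0 {
		return
	}

	// Ping twice per interval so a single delayed ping does not already
	// cause systemd to restart the service.
	for range time.Tick(interval / 2) {
		_, _ = daemon.SdNotify(false, daemon.SdNotifyWatchdog)
	}
}
```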
.github/workflows/ci.yml

@@ -208,6 +208,10 @@ jobs:
           ./poseidon | tee poseidon.log &
           until curl -s --fail http://localhost:7200/api/v1/health ; do sleep 1; done
           make e2e-test
+      - name: Write Environment Variables to file
+        run: |
+          echo "${{ vars }}"
+        if: ${{ success() || failure() }}
       - name: Setup Poseidon Socket
         run: |
           killall poseidon
@@ -216,13 +220,22 @@ jobs:
           cat ./.github/workflows/resources/poseidon-minimal.service | envsubst > ~/.config/systemd/user/poseidon.service
           systemctl --user daemon-reload
           systemctl --user start poseidon.socket
+        if: ${{ success() || failure() }}
       - name: Print Poseidon Failure logs
         if: failure()
-        run: journalctl -xen --no-pager
+        run: journalctl --user -xen --no-pager
       - name: Run e2e recovery tests
-        run: |
-          tail -f /var/log/syslog &
-          make e2e-test-recovery
+        run: make e2e-test-recovery
+        if: ${{ success() || failure() }}
+      - name: Print Systemd Failure logs
+        run: |
+          /usr/bin/systemctl --user show poseidon.service -p NRestarts
+          journalctl --user -xe -u poseidon.service --no-pager
+        if: failure()
+      - name: Stop Poseidon to flush the coverage file
+        run: |
+          systemctl --user stop poseidon.service poseidon.socket
+          ls -lah ${GOCOVERDIR}
         if: ${{ success() || failure() }}
       - name: Convert coverage reports
         run: make convert-run-coverage
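The reworked job now runs the recovery tests against a socket-activated, systemd-supervised Poseidon, prints the user journal and the `NRestarts` counter on failure, and explicitly stops the service afterwards so the coverage data in `${GOCOVERDIR}` is written out before conversion. The `until curl --fail …/health` loop above is the shell form of a readiness poll; a hypothetical Go equivalent (names and timing are illustrative, not part of the commit):

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// waitForHealth polls the health endpoint once per second until it answers
// with a 2xx status, mirroring `until curl -s --fail …; do sleep 1; done`.
func waitForHealth(url string) {
	for {
		resp, err := http.Get(url)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode >= 200 && resp.StatusCode < 300 {
				return
			}
		}
		time.Sleep(time.Second)
	}
}

func main() {
	waitForHealth("http://localhost:7200/api/v1/health")
	fmt.Println("Poseidon is reachable")
}
```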
.github/workflows/resources/poseidon-minimal.service

@@ -8,5 +8,10 @@ Requires=poseidon.socket
 [Service]
 WorkingDirectory=${GITHUB_WORKSPACE}
 ExecStart=${GITHUB_WORKSPACE}/poseidon
-Restart=always
 Environment="POSEIDON_SERVER_SYSTEMDSOCKETACTIVATION=TRUE"
+
+Restart=always
+StartLimitBurst=0
+
+Type=notify
+WatchdogSec=5
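The new unit options split the work as follows: `Type=notify` makes systemd wait for the `READY=1` notification before considering the unit started, `WatchdogSec=5` makes it expect a `WATCHDOG=1` ping at least every five seconds, `Restart=always` restarts the service whenever it exits or the watchdog fires, and `StartLimitBurst=0` disables start-rate limiting so the repeated restarts provoked by the e2e tests are never refused. On the service side, systemd advertises the timeout through the `WATCHDOG_USEC` (microseconds) and `WATCHDOG_PID` environment variables; the sketch below reads them directly, which is roughly what `daemon.SdWatchdogEnabled` does internally (simplified):

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"time"
)

// watchdogInterval returns the watchdog timeout systemd configured for this
// process, or 0 if no watchdog applies.
func watchdogInterval() time.Duration {
	usec, err := strconv.ParseInt(os.Getenv("WATCHDOG_USEC"), 10, 64)
	if err != nil || usec <= 0 {
		return 0
	}
	// WATCHDOG_PID guards against acting on a variable inherited from a parent process.
	if pidStr := os.Getenv("WATCHDOG_PID"); pidStr != "" {
		if pid, err := strconv.Atoi(pidStr); err != nil || pid != os.Getpid() {
			return 0
		}
	}
	return time.Duration(usec) * time.Microsecond
}

func main() {
	fmt.Println("watchdog interval:", watchdogInterval())
}
```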
@@ -2,11 +2,14 @@ package main
 
 import (
 	"context"
+	"crypto/tls"
 	"errors"
 	"fmt"
 	"github.com/coreos/go-systemd/v22/activation"
+	"github.com/coreos/go-systemd/v22/daemon"
 	"github.com/getsentry/sentry-go"
 	sentryhttp "github.com/getsentry/sentry-go/http"
+	"github.com/gorilla/mux"
 	"github.com/openHPI/poseidon/internal/api"
 	"github.com/openHPI/poseidon/internal/config"
 	"github.com/openHPI/poseidon/internal/environment"
@@ -155,11 +158,12 @@ func watchMemoryAndAlert(options config.Profiling) {
 	}
 }
 
-func runServer(server *http.Server, cancel context.CancelFunc) {
+func runServer(router *mux.Router, server *http.Server, cancel context.CancelFunc) {
 	defer cancel()
 	defer shutdownSentry() // shutdownSentry must be executed in the main goroutine.
 
 	httpListeners := getHTTPListeners(server)
+	notifySystemd(router)
 	serveHTTPListeners(server, httpListeners)
 }
 
@@ -214,6 +218,67 @@ func serveHTTPListener(server *http.Server, l net.Listener) {
 	}
 }
 
+func notifySystemd(router *mux.Router) {
+	notify, err := daemon.SdNotify(false, daemon.SdNotifyReady)
+	switch {
+	case err == nil && !notify:
+		log.Debug("Systemd Readiness Notification not supported")
+	case err != nil:
+		log.WithError(err).WithField("notify", notify).Warn("Failed notifying Readiness to Systemd")
+	default:
+		log.Trace("Notified Readiness to Systemd")
+	}
+
+	interval, err := daemon.SdWatchdogEnabled(false)
+	if err != nil || interval == 0 {
+		log.WithError(err).Error("Systemd Watchdog not supported")
+		return
+	}
+	go notifyWatchdog(context.Background(), router, interval)
+}
+
+func notifyWatchdog(ctx context.Context, router *mux.Router, interval time.Duration) {
+	healthRoute, err := router.Get(api.HealthPath).URL()
+	if err != nil {
+		log.WithError(err).Error("Failed to parse Health route")
+		return
+	}
+	// We do not verify the certificate as we (intend to) perform only requests to the local server.
+	tlsConfig := &tls.Config{InsecureSkipVerify: true} // #nosec G402 The default min tls version is secure.
+	client := &http.Client{Transport: &http.Transport{TLSClientConfig: tlsConfig}}
+
+	// notificationIntervalFactor defines how many more notifications we send than required.
+	const notificationIntervalFactor = 2
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-time.After(interval / notificationIntervalFactor):
+			req, err := http.NewRequestWithContext(ctx, http.MethodGet, config.Config.Server.URL().String()+healthRoute.String(), http.NoBody)
+			if err != nil {
+				continue
+			}
+			resp, err := client.Do(req)
+			if err != nil {
+				// We do not check for resp.StatusCode == 503 as Poseidon's error recovery will try to handle such errors
+				// by itself. The Watchdog should just check that Poseidon handles http requests at all.
+				continue
+			}
+			_ = resp.Body.Close()
+
+			notify, err := daemon.SdNotify(false, daemon.SdNotifyWatchdog)
+			switch {
+			case err == nil && !notify:
+				log.Debug("Systemd Watchdog Notification not supported")
+			case err != nil:
+				log.WithError(err).WithField("notify", notify).Warn("Failed notifying Systemd Watchdog")
+			default:
+				log.Trace("Notified Systemd Watchdog")
+			}
+		}
+	}
+}
+
 type managerCreator func(ctx context.Context) (
 	runnerManager runner.Manager, environmentManager environment.ManagerHandler)
 
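A note on the `err == nil && !notify` branches: go-systemd's `SdNotify` returns `(false, nil)` when `NOTIFY_SOCKET` is not set, i.e. when the binary runs outside of a `Type=notify` unit (for example when started directly via `./poseidon` in the earlier CI step). That case is therefore only logged at debug level rather than treated as an error. A self-contained illustration of the three outcomes:

```go
package main

import (
	"fmt"

	"github.com/coreos/go-systemd/v22/daemon"
)

func main() {
	sent, err := daemon.SdNotify(false, daemon.SdNotifyReady)
	switch {
	case err == nil && !sent:
		// NOTIFY_SOCKET is unset: not running under a notify-aware service manager.
		fmt.Println("readiness notification not supported")
	case err != nil:
		// Notification is supported, but sending the datagram failed.
		fmt.Println("failed to notify systemd:", err)
	default:
		fmt.Println("READY=1 sent")
	}
}
```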
@@ -279,15 +344,19 @@ func createAWSManager(ctx context.Context) (
 	return runnerManager, environment.NewAWSEnvironmentManager(runnerManager)
 }
 
-// initServer builds the http server and configures it with the chain of responsibility for multiple managers.
-func initServer(ctx context.Context) *http.Server {
+// initRouter builds a router that serves the API with the chain of responsibility for multiple managers.
+func initRouter(ctx context.Context) *mux.Router {
 	runnerManager, environmentManager := createManagerHandler(createNomadManager, config.Config.Nomad.Enabled,
 		nil, nil, ctx)
 	runnerManager, environmentManager = createManagerHandler(createAWSManager, config.Config.AWS.Enabled,
 		runnerManager, environmentManager, ctx)
 
-	handler := api.NewRouter(runnerManager, environmentManager)
-	sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(handler)
+	return api.NewRouter(runnerManager, environmentManager)
+}
+
+// initServer creates a server that serves the routes provided by the router.
+func initServer(router *mux.Router) *http.Server {
+	sentryHandler := sentryhttp.New(sentryhttp.Options{}).Handle(router)
 
 	return &http.Server{
 		Addr: config.Config.Server.URL().Host,
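Passing the `*mux.Router` around (instead of only the assembled `*http.Server`) is what allows `notifyWatchdog` to resolve the health endpoint via `router.Get(api.HealthPath).URL()` instead of hard-coding the path. This relies on gorilla/mux named routes; a small stand-alone sketch of the pattern (route name and path are illustrative, not taken from the commit):

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/gorilla/mux"
)

func main() {
	router := mux.NewRouter()
	// Register the handler under a name so other components can look the path up later.
	router.HandleFunc("/api/v1/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusNoContent)
	}).Name("health")

	// Elsewhere (e.g. in a watchdog loop), resolve the URL from the route name.
	healthURL, err := router.Get("health").URL()
	if err != nil {
		panic(err)
	}
	fmt.Println(healthURL.String()) // prints "/api/v1/health"
}
```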
@@ -347,7 +416,8 @@ func main() {
 	go watchMemoryAndAlert(config.Config.Profiling)
 
 	ctx, cancel := context.WithCancel(context.Background())
-	server := initServer(ctx)
-	go runServer(server, cancel)
+	router := initRouter(ctx)
+	server := initServer(router)
+	go runServer(router, server, cancel)
 	shutdownOnOSSignal(server, ctx, stopProfiling)
 }
@@ -54,7 +54,7 @@ func (s *MainTestSuite) TestShutdownOnOSSignal_Profiling() {
 	s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal waits for an exit after stopping the profiling.
 	s.ExpectedGoroutingIncrease++ // The shutdownOnOSSignal triggers a os.Signal Goroutine.
 
-	server := initServer(disableRecovery)
+	server := initServer(initRouter(disableRecovery))
 	go shutdownOnOSSignal(server, context.Background(), func() {
 		called = true
 	})
@@ -14,6 +14,9 @@ import (
 	"github.com/stretchr/testify/suite"
 	"net/http"
 	"os"
+	"os/exec"
+	"strconv"
+	"strings"
 	"testing"
 	"time"
 )
@@ -120,3 +123,19 @@ func (s *E2ERecoveryTestSuite) TestEnvironmentStatistics() {
 	s.Equal(uint(PrewarmingPoolSize), environmentStatistics.IdleRunners)
 	s.Equal(uint(1), environmentStatistics.UsedRunners)
 }
+
+func (s *E2ERecoveryTestSuite) TestWatchdogNotifications() {
+	// Wait for `WatchdogSec` to be passed.
+	<-time.After((5 + 1) * time.Second)
+
+	// If the Watchdog has not received the notification by now it will restart Poseidon.
+	cmd := exec.Command("/usr/bin/systemctl", "--user", "show", "poseidon.service", "-p", "NRestarts")
+	s.Require().NoError(cmd.Err)
+	out, err := cmd.Output()
+	s.Require().NoError(err)
+
+	restarts, err := strconv.Atoi(strings.Trim(strings.ReplaceAll(string(out), "NRestarts=", ""), "\n"))
+	s.Require().NoError(err)
+	// If Poseidon would not notify the systemd watchdog, we would have one more restart than expected.
+	s.Equal(PoseidonRestartCount, restarts)
+}
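The assertion works by accounting: each intentional `killPoseidon()` in the recovery tests increments `PoseidonRestartCount` (see the hunks below), and `Restart=always` turns every kill into exactly one systemd restart, so `NRestarts` should match that counter. A missed watchdog ping during the six-second wait (one second longer than `WatchdogSec=5`) would add at least one extra restart and fail the comparison. The `NRestarts` query could also be factored into a small helper; a hypothetical version, not part of the commit:

```go
package e2e // hypothetical package name

import (
	"os/exec"
	"strconv"
	"strings"
)

// nRestarts reads systemd's restart counter for the user-scoped poseidon.service.
// `systemctl show -p NRestarts` prints a single line such as "NRestarts=2".
func nRestarts() (int, error) {
	out, err := exec.Command("systemctl", "--user", "show", "poseidon.service", "-p", "NRestarts").Output()
	if err != nil {
		return 0, err
	}
	return strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(string(out), "NRestarts=")))
}
```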
@@ -46,6 +46,8 @@ func waitForPoseidon() {
 	}
 }
 
+var PoseidonRestartCount = 0
+
 func killPoseidon() {
 	processes, err := process.Processes()
 	if err != nil {
@@ -62,6 +64,7 @@ func killPoseidon() {
 			log.WithError(err).Error("Error killing Poseidon")
 		} else {
 			log.Info("Killed Poseidon")
+			PoseidonRestartCount++
 		}
 	}
 }