polyphem/internal/runner/k8s_runner.go

package runner

import (
	"context"
	"errors"
	"fmt"
	"github.com/google/uuid"
	"github.com/openHPI/poseidon/pkg/dto"
	"github.com/openHPI/poseidon/pkg/monitoring"
	"github.com/openHPI/poseidon/pkg/storage"
	"io"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"net/http"
	"time"
)

var ErrPodCreationFailed = errors.New("failed to create pod")

var (
	ErrorUnknownExecution                  = errors.New("unknown execution")
	ErrFileNotFound                        = errors.New("file not found or insufficient permissions")
	ErrOOMKilled             DestroyReason = errors.New("the runner was killed due to out of memory")
	ErrDestroyedByAPIRequest DestroyReason = errors.New("the client wants to stop the runner")
)

// KubernetesPodWorkload is an abstraction to manage a Kubernetes pod.
// It is not persisted on a Poseidon restart.
// The InactivityTimer is used actively. It stops and deletes the pod.
type KubernetesPodWorkload struct {
	InactivityTimer
	id                string
	fs                map[dto.FilePath][]byte
	executions        storage.Storage[*dto.ExecutionRequest]
	runningExecutions map[string]context.CancelFunc
	onDestroy         DestroyRunnerHandler
	environment       ExecutionEnvironment
	ctx               context.Context
	cancel            context.CancelFunc
	clientset         *kubernetes.Clientset
	podName           string
	namespace         string
}

// NewKubernetesPodWorkload creates a new KubernetesPodWorkload with the provided id.
func NewKubernetesPodWorkload(
	environment ExecutionEnvironment, onDestroy DestroyRunnerHandler, clientset *kubernetes.Clientset) (*KubernetesPodWorkload, error) {
	newUUID, err := uuid.NewUUID()
	if err != nil {
		return nil, fmt.Errorf("failed generating runner id: %w", err)
	}

	ctx, cancel := context.WithCancel(context.Background())
	workload := &KubernetesPodWorkload{
		id:                newUUID.String(),
		fs:                make(map[dto.FilePath][]byte),
		runningExecutions: make(map[string]context.CancelFunc),
		onDestroy:         onDestroy,
		environment:       environment,
		ctx:               ctx,
		cancel:            cancel,
		clientset:         clientset,
		namespace:         "default", // You might want to make this configurable
		podName:           fmt.Sprintf("workload-%s", newUUID.String()),
	}
	workload.executions = storage.NewMonitoredLocalStorage[*dto.ExecutionRequest](
		monitoring.MeasurementExecutionsK8s, monitorExecutionsRunnerID(environment.ID(), workload.id), time.Minute, ctx)
	workload.InactivityTimer = NewInactivityTimer(workload, func(_ Runner) error {
		return workload.Destroy(nil)
	})
	return workload, nil
}

func (w *KubernetesPodWorkload) ID() string {
	return w.id
}

func (w *KubernetesPodWorkload) Environment() dto.EnvironmentID {
	return w.environment.ID()
}

func (w *KubernetesPodWorkload) MappedPorts() []*dto.MappedPort {
	// Implement port mapping logic for Kubernetes
	return []*dto.MappedPort{}
}

func (w *KubernetesPodWorkload) StoreExecution(id string, request *dto.ExecutionRequest) {
	w.executions.Add(id, request)
}

func (w *KubernetesPodWorkload) ExecutionExists(id string) bool {
	_, ok := w.executions.Get(id)
	return ok
}

// ExecuteInteractively runs the execution request in a Kubernetes pod.
func (w *KubernetesPodWorkload) ExecuteInteractively(
	id string, _ io.ReadWriter, stdout, stderr io.Writer, ctx context.Context) (
	<-chan ExitInfo, context.CancelFunc, error) {
	w.ResetTimeout()
	request, ok := w.executions.Pop(id)
	if !ok {
		return nil, nil, ErrorUnknownExecution
	}
	hideEnvironmentVariablesK8s(request, "K8S")
	command, executionCtx, cancel := prepareExecution(request, w.ctx)
	exitInternal := make(chan ExitInfo)
	exit := make(chan ExitInfo, 1)

	go w.executeCommand(executionCtx, command, stdout, stderr, exitInternal)
	go w.handleRunnerTimeout(executionCtx, exitInternal, exit, id)

	return exit, cancel, nil
}

func (w *KubernetesPodWorkload) ListFileSystem(path string, recursive bool, writer io.Writer, humanReadable bool, ctx context.Context) error {
	// Implement file system listing for Kubernetes pods
	return dto.ErrNotSupported
}

func (w *KubernetesPodWorkload) UpdateFileSystem(request *dto.UpdateFileSystemRequest, ctx context.Context) error {
	// Implement file system update for Kubernetes pods
	return nil
}

func (w *KubernetesPodWorkload) GetFileContent(path string, writer http.ResponseWriter, humanReadable bool, ctx context.Context) error {
	// Implement file content retrieval for Kubernetes pods
	return dto.ErrNotSupported
}

func (w *KubernetesPodWorkload) Destroy(_ DestroyReason) error {
	w.cancel()
	err := w.clientset.CoreV1().Pods(w.namespace).Delete(context.Background(), w.podName, metav1.DeleteOptions{})
	if err != nil {
		return fmt.Errorf("error while destroying kubernetes pod: %w", err)
	}
	if err := w.onDestroy(w); err != nil {
		return fmt.Errorf("error while destroying kubernetes runner: %w", err)
	}
	return nil
}

func (w *KubernetesPodWorkload) executeCommand(ctx context.Context, command string,
	stdout, stderr io.Writer, exit chan<- ExitInfo,
) {
	defer close(exit)

	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name: w.podName,
		},
		Spec: corev1.PodSpec{
			RestartPolicy: corev1.RestartPolicyNever,
			Containers: []corev1.Container{
				{
					Name:    "workload",
					Image:   w.environment.Image(),
					Command: []string{"/bin/sh", "-c", command},
				},
			},
		},
	}

	_, err := w.clientset.CoreV1().Pods(w.namespace).Create(ctx, pod, metav1.CreateOptions{})
	if err != nil {
		exit <- ExitInfo{1, fmt.Errorf("%w: %v", ErrPodCreationFailed, err)}
		return
	}

	req := w.clientset.CoreV1().Pods(w.namespace).GetLogs(w.podName, &corev1.PodLogOptions{
		Follow: true,
	})
	podLogs, err := req.Stream(ctx)
	if err != nil {
		exit <- ExitInfo{1, fmt.Errorf("error in opening stream: %v", err)}
		return
	}
	defer func(podLogs io.ReadCloser) {
		err := podLogs.Close()
		if err != nil {
			exit <- ExitInfo{1, fmt.Errorf("error in closing stream: %v", err)}
		}
	}(podLogs)

	_, err = io.Copy(stdout, podLogs)
	if err != nil {
		exit <- ExitInfo{1, fmt.Errorf("error in copying logs: %v", err)}
		return
	}

	// Wait for the pod to complete
	watch, err := w.clientset.CoreV1().Pods(w.namespace).Watch(ctx, metav1.ListOptions{
		FieldSelector: fmt.Sprintf("metadata.name=%s", w.podName),
	})
	if err != nil {
		exit <- ExitInfo{1, fmt.Errorf("error watching pod: %v", err)}
		return
	}
	defer watch.Stop()

	for event := range watch.ResultChan() {
		pod, ok := event.Object.(*corev1.Pod)
		if !ok {
			continue
		}
		if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
			exitCode := uint8(0)
			if pod.Status.Phase == corev1.PodFailed {
				exitCode = 1
			}
			exit <- ExitInfo{exitCode, nil}
			return
		}
	}
}

func (w *KubernetesPodWorkload) handleRunnerTimeout(ctx context.Context,
	exitInternal <-chan ExitInfo, exit chan<- ExitInfo, executionID string) {
	executionCtx, cancelExecution := context.WithCancel(ctx)
	w.runningExecutions[executionID] = cancelExecution
	defer delete(w.runningExecutions, executionID)
	defer close(exit)

	select {
	case exitInfo := <-exitInternal:
		exit <- exitInfo
	case <-executionCtx.Done():
		exit <- ExitInfo{255, ErrorRunnerInactivityTimeout}
	}
}

// hideEnvironmentVariables sets the CODEOCEAN variable and unsets all variables starting with the passed prefix.
func hideEnvironmentVariablesK8s(request *dto.ExecutionRequest, unsetPrefix string) {
	if request.Environment == nil {
		request.Environment = make(map[string]string)
	}
	request.Command = "unset \"${!" + unsetPrefix + "@}\" && " + request.Command
}

func prepareExecution(request *dto.ExecutionRequest, environmentCtx context.Context) (
	command string, ctx context.Context, cancel context.CancelFunc,
) {
	command = request.FullCommand()
	if request.TimeLimit == 0 {
		ctx, cancel = context.WithCancel(environmentCtx)
	} else {
		ctx, cancel = context.WithTimeout(environmentCtx, time.Duration(request.TimeLimit)*time.Second)
	}
	return command, ctx, cancel
}