added k8s stub adapter for execution environment
internal/kubernetes/api_querier.go (new file)
@@ -0,0 +1,122 @@
package kubernetes

import (
	"context"
	"errors"
	"io"

	nomadApi "github.com/hashicorp/nomad/api"
	appv1 "k8s.io/api/apps/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

var ErrorNoAllocationFound = errors.New("no allocation found")

// apiQuerier provides access to the Kubernetes functionality.
type apiQuerier interface {
	// init prepares an apiClient to be able to communicate with the provided Kubernetes API.
	init(kubernetesConfig *rest.Config) (err error)

	// LoadJobList loads the list of deployments from the Kubernetes API.
	LoadJobList() (list []*appv1.DeploymentList, err error)

	// JobScale returns the scale of the passed job.
	JobScale(jobID string) (jobScale uint, err error)

	// SetJobScale sets the scaling count of the passed job.
	SetJobScale(jobID string, count uint, reason string) (err error)

	// DeleteDeployment deletes the deployment with the given name.
	DeleteDeployment(name string) (err error)

	// Execute runs a command in the passed job.
	Execute(jobID string, ctx context.Context, command string, tty bool,
		stdin io.Reader, stdout, stderr io.Writer) (int, error)

	// listDeployments loads all deployments in the given namespace.
	listDeployments(namespace string) (jobListStub []*appv1.DeploymentList, err error)

	// deployment returns the deployment with the given name.
	deployment(name string) (deployment appv1.Deployment, err error)

	// listAllocations loads all allocations.
	listAllocations() (allocationListStub []*nomadApi.AllocationListStub, err error)

	// allocation returns the first allocation of the given job.
	allocation(jobID string) (*nomadApi.Allocation, error)

	// RegisterKubernetesDeployment registers a deployment with Kubernetes.
	// It returns the deployment ID that can be used when listening to the Kubernetes event stream.
	RegisterKubernetesDeployment(deployment appv1.Deployment) (string, error)

	// EventStream returns an event stream filtered to return only allocation and evaluation events.
	EventStream(ctx context.Context) (<-chan *nomadApi.Events, error)
}

// kubernetesAPIClient implements the apiQuerier interface and provides access to a real Kubernetes API.
type kubernetesAPIClient struct {
	client    *kubernetes.Clientset
	namespace string
}

func (k *kubernetesAPIClient) init(kubernetesConfig *rest.Config) (err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) LoadJobList() (list []*appv1.DeploymentList, err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) JobScale(jobID string) (jobScale uint, err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) SetJobScale(jobID string, count uint, reason string) (err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) DeleteDeployment(name string) (err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) Execute(jobID string, ctx context.Context, command string, tty bool,
	stdin io.Reader, stdout, stderr io.Writer) (int, error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) listDeployments(namespace string) (jobListStub []*appv1.DeploymentList, err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) deployment(name string) (deployment appv1.Deployment, err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) listAllocations() (allocationListStub []*nomadApi.AllocationListStub, err error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) allocation(jobID string) (*nomadApi.Allocation, error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) RegisterKubernetesDeployment(deployment appv1.Deployment) (string, error) {
	// TODO: implement me
	panic("implement me")
}

func (k *kubernetesAPIClient) EventStream(ctx context.Context) (<-chan *nomadApi.Events, error) {
	// TODO: implement me
	panic("implement me")
}
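As a sketch of where a client-go implementation could go (an assumption, not part of this commit): init maps onto kubernetes.NewForConfig, and the deployment getter onto the typed AppsV1 client. Note that the []*appv1.DeploymentList return types above look like leftovers from the Nomad job-list stubs; a list of deployments would more naturally be []appv1.Deployment.

// Sketch only: possible bodies for two of the stubs above.
package kubernetes

import (
	"context"
	"fmt"

	appv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func (k *kubernetesAPIClient) init(kubernetesConfig *rest.Config) (err error) {
	// NewForConfig builds the typed Clientset that the other methods would use.
	k.client, err = kubernetes.NewForConfig(kubernetesConfig)
	return err
}

func (k *kubernetesAPIClient) deployment(name string) (deployment appv1.Deployment, err error) {
	// Get fetches a single deployment by name from the adapter's namespace.
	result, err := k.client.AppsV1().Deployments(k.namespace).Get(context.Background(), name, metav1.GetOptions{})
	if err != nil {
		return appv1.Deployment{}, fmt.Errorf("couldn't load deployment %s: %w", name, err)
	}
	return *result, nil
}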
internal/kubernetes/deployment.go (new file)
@@ -0,0 +1,125 @@
package kubernetes

import (
	"context"
	"errors"
	"fmt"
	"strconv"
	"strings"
	"time"

	nomadApi "github.com/hashicorp/nomad/api"
	"github.com/openHPI/poseidon/pkg/dto"
	appv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
)

const (
	TemplateJobPrefix     = "template"
	TaskGroupName         = "default-group"
	TaskName              = "default-task"
	TaskCount             = 1
	TaskDriver            = "docker"
	TaskCommand           = "sleep"
	ConfigTaskGroupName   = "config"
	ConfigTaskName        = "config"
	ConfigTaskDriver      = "exec"
	ConfigTaskCommand     = "true"
	ConfigMetaUsedKey     = "used"
	ConfigMetaUsedValue   = "true"
	ConfigMetaUnusedValue = "false"
	ConfigMetaTimeoutKey  = "timeout"
	ConfigMetaPoolSizeKey = "prewarmingPoolSize"
	TemplateJobNameParts  = 2
	RegisterTimeout       = 10 * time.Second
	RunnerTimeoutFallback = 60 * time.Second
)

var (
	ErrorInvalidJobID     = errors.New("invalid job id")
	ErrorMissingTaskGroup = errors.New("couldn't find config task group in job")
	TaskArgs              = []string{"infinity"}
)
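
// Editorial note (sketch, not part of this commit): the ConfigMeta* keys above
// come from Nomad job meta. The closest Kubernetes analogue would be
// annotations on the deployment's pod template, e.g.:
//
//	deployment.Spec.Template.Annotations = map[string]string{
//		ConfigMetaUsedKey:    ConfigMetaUnusedValue,
//		ConfigMetaTimeoutKey: strconv.Itoa(int(RunnerTimeoutFallback.Seconds())),
//	}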
func (a *APIClient) RegisterRunnerJob(template *appv1.Deployment) error {
	evalID, err := a.apiQuerier.RegisterKubernetesDeployment(*template)
	if err != nil {
		return fmt.Errorf("couldn't register runner job: %w", err)
	}

	registerTimeout, cancel := context.WithTimeout(context.Background(), RegisterTimeout)
	defer cancel()
	return a.MonitorEvaluation(evalID, registerTimeout)
}

// SetForcePullFlag sets the flag of a job if the image should be pulled again.
func SetForcePullFlag(deployment *appv1.Deployment, value bool) {
	// Iterate by index: ranging by value would only mutate a copy of the container.
	for i := range deployment.Spec.Template.Spec.Containers {
		container := &deployment.Spec.Template.Spec.Containers[i]
		if container.Name != TaskName {
			continue
		}
		if value {
			container.ImagePullPolicy = v1.PullAlways
		} else {
			container.ImagePullPolicy = v1.PullIfNotPresent
		}
	}
}

// IsEnvironmentTemplateID checks if the passed job id belongs to a template job.
func IsEnvironmentTemplateID(jobID string) bool {
	parts := strings.Split(jobID, "-")
	if len(parts) != TemplateJobNameParts || parts[0] != TemplateJobPrefix {
		return false
	}

	_, err := EnvironmentIDFromTemplateJobID(jobID)
	return err == nil
}

// RunnerJobID returns the job id of the runner with the given environmentID and id.
func RunnerJobID(environmentID dto.EnvironmentID, id string) string {
	return fmt.Sprintf("%d-%s", environmentID, id)
}

// TemplateJobID returns the id of the template job for the environment with the given id.
func TemplateJobID(id dto.EnvironmentID) string {
	return fmt.Sprintf("%s-%d", TemplateJobPrefix, id)
}

// EnvironmentIDFromRunnerID returns the environment id that is part of the passed runner job id.
func EnvironmentIDFromRunnerID(jobID string) (dto.EnvironmentID, error) {
	return partOfJobID(jobID, 0)
}

// EnvironmentIDFromTemplateJobID returns the environment id that is part of the passed template job id.
func EnvironmentIDFromTemplateJobID(id string) (dto.EnvironmentID, error) {
	return partOfJobID(id, 1)
}

func partOfJobID(id string, part uint) (dto.EnvironmentID, error) {
	parts := strings.Split(id, "-")
	// strings.Split never returns an empty slice, so guard the index instead.
	if int(part) >= len(parts) {
		return 0, fmt.Errorf("malformed job id %q: %w", id, ErrorInvalidJobID)
	}
	environmentID, err := strconv.Atoi(parts[part])
	if err != nil {
		return 0, fmt.Errorf("invalid environment id part %v: %w", err, ErrorInvalidJobID)
	}
	return dto.EnvironmentID(environmentID), nil
}

func isOOMKilled(alloc *nomadApi.Allocation) bool {
	state, ok := alloc.TaskStates[TaskName]
	if !ok {
		return false
	}

	var oomKilledCount uint64
	for _, event := range state.Events {
		if oomString, ok := event.Details["oom_killed"]; ok {
			if oom, err := strconv.ParseBool(oomString); err == nil && oom {
				oomKilledCount++
			}
		}
	}
	return oomKilledCount >= state.Restarts
}
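The ID helpers above are pure string handling, so a short example pins down the format. A hypothetical test-file sketch (function name and values invented):

package kubernetes

import (
	"fmt"

	"github.com/openHPI/poseidon/pkg/dto"
)

// ExampleJobIDs walks a job id through the helpers in both directions.
func ExampleJobIDs() {
	templateID := TemplateJobID(dto.EnvironmentID(7))
	fmt.Println(templateID, IsEnvironmentTemplateID(templateID))

	runnerID := RunnerJobID(dto.EnvironmentID(7), "abc123")
	environmentID, err := EnvironmentIDFromRunnerID(runnerID)
	fmt.Println(runnerID, environmentID, err)
	// Output:
	// template-7 true
	// 7-abc123 7 <nil>
}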
internal/kubernetes/kubernetes.go (new file)
@@ -0,0 +1,221 @@
package kubernetes

import (
	"context"
	"errors"
	"fmt"
	"io"
	"strings"
	"time"

	nomadApi "github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/influxdata/influxdb-client-go/v2/api/write"
	"github.com/openHPI/poseidon/pkg/dto"
	"github.com/openHPI/poseidon/pkg/logging"
	"github.com/openHPI/poseidon/pkg/monitoring"
	"github.com/openHPI/poseidon/pkg/storage"
	appsv1 "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/rest"
)

var (
	log                              = logging.GetLogger("kubernetes")
	ErrorExecutorCommunicationFailed = errors.New("communication with executor failed")
	ErrorEvaluation                  = errors.New("evaluation could not complete")
	ErrorPlacingAllocations          = errors.New("failed to place all allocations")
	ErrorLoadingJob                  = errors.New("failed to load job")
	ErrorNoAllocatedResourcesFound   = errors.New("no allocated resources found")
	ErrorLocalDestruction            RunnerDeletedReason = errors.New("the destruction should not cause external changes")
	ErrorOOMKilled                   RunnerDeletedReason = fmt.Errorf("%s: %w", dto.ErrOOMKilled.Error(), ErrorLocalDestruction)
	ErrorAllocationRescheduled       RunnerDeletedReason = fmt.Errorf("the allocation was rescheduled: %w", ErrorLocalDestruction)
	ErrorAllocationStopped           RunnerDeletedReason = errors.New("the allocation was stopped")
	ErrorAllocationStoppedUnexpectedly RunnerDeletedReason = fmt.Errorf("%w unexpectedly", ErrorAllocationStopped)
	ErrorAllocationRescheduledUnexpectedly RunnerDeletedReason = fmt.Errorf(
		"%w correctly but rescheduled", ErrorAllocationStopped)
	// ErrorAllocationCompleted is the reason reported for an allocation that stopped because it completed.
	// We do not consider it an error but add it anyway for complete reporting.
	ErrorAllocationCompleted RunnerDeletedReason = errors.New("the allocation completed")
)

type allocationData struct {
	// allocClientStatus is the allocation state as reported by Nomad.
	allocClientStatus string
	// allocDesiredStatus defines whether the allocation wants to be running or to be stopped.
	allocDesiredStatus string
	jobID              string
	start              time.Time
	// stopExpected is used to suppress warnings that could be triggered by a race condition
	// between the inactivity timer and an external event leading to allocation rescheduling.
	stopExpected bool
	// allocNomadNode is just debugging information.
	allocNomadNode string
}

// resultChannelWriteTimeout is used to detect the error case where more elements are written into a channel than expected.
const resultChannelWriteTimeout = 10 * time.Millisecond

type DeletedAllocationProcessor func(jobID string, reason RunnerDeletedReason) (removedByPoseidon bool)
type NewAllocationProcessor func(*nomadApi.Allocation, time.Duration)

// AllocationProcessing includes the callbacks to interact with allocation events.
type AllocationProcessing struct {
	OnNew     NewAllocationProcessor
	OnDeleted DeletedAllocationProcessor
}

type RunnerDeletedReason error

// ExecutorAPI provides access to a container orchestration solution.
type ExecutorAPI interface {
	apiQuerier

	// LoadEnvironmentJobs loads all environment jobs.
	LoadEnvironmentJobs() ([]*appsv1.Deployment, error)

	// LoadRunnerJobs loads all runner jobs specific to the environment.
	LoadRunnerJobs(environmentID dto.EnvironmentID) ([]*appsv1.Deployment, error)

	// LoadRunnerIDs returns the IDs of all runners with the specified id prefix which are not about to
	// get stopped.
	LoadRunnerIDs(prefix string) (runnerIds []string, err error)

	// LoadRunnerPortMappings returns the mapped ports of the runner.
	LoadRunnerPortMappings(runnerID string) ([]v1.ContainerPort, error)

	// RegisterRunnerJob creates a runner job based on the template job.
	// It registers the job and waits until the registration completes.
	RegisterRunnerJob(template *appsv1.Deployment) error

	// MonitorEvaluation monitors the given evaluation ID.
	// It waits until the evaluation reaches one of the states complete, canceled, or failed.
	// If the evaluation was not successful, an error containing the failures is returned.
	// See also https://github.com/hashicorp/nomad/blob/7d5a9ecde95c18da94c9b6ace2565afbfdd6a40d/command/monitor.go#L175
	MonitorEvaluation(evaluationID string, ctx context.Context) error

	// WatchEventStream listens on the Nomad event stream for allocation and evaluation events.
	// Depending on the incoming event, the corresponding callback is executed.
	// Do not run multiple times simultaneously.
	WatchEventStream(ctx context.Context, callbacks *AllocationProcessing) error

	// ExecuteCommand executes the given command in the job/runner with the given id.
	// It writes the output of the command to stdout/stderr and reads input from stdin.
	// If tty is true, the command will run with a tty.
	// Iff privilegedExecution is true, the command will be executed privileged.
	// The command is passed in the shell form (not the exec array form) and will be executed in a shell.
	ExecuteCommand(jobID string, ctx context.Context, command string, tty bool, privilegedExecution bool,
		stdin io.Reader, stdout, stderr io.Writer) (int, error)

	// MarkRunnerAsUsed marks the runner with the given ID as used. It also stores the timeout duration in the metadata.
	MarkRunnerAsUsed(runnerID string, duration int) error
}

// APIClient implements the ExecutorAPI interface and can be used to perform different operations on the real
// Executor API and its return values.
type APIClient struct {
	apiQuerier
	evaluations storage.Storage[chan error]
	// allocations contain management data for all pending and running allocations.
	allocations storage.Storage[*allocationData]
	isListening bool
}

func (a *APIClient) LoadEnvironmentJobs() ([]*appsv1.Deployment, error) {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) LoadRunnerJobs(environmentID dto.EnvironmentID) ([]*appsv1.Deployment, error) {
	go a.initializeAllocations(environmentID)

	runnerIDs, err := a.LoadRunnerIDs(RunnerJobID(environmentID, ""))
	if err != nil {
		return []*appsv1.Deployment{}, fmt.Errorf("couldn't load jobs: %w", err)
	}

	var occurredError error
	jobs := make([]*appsv1.Deployment, 0, len(runnerIDs))
	for _, id := range runnerIDs {
		job, err := a.apiQuerier.deployment(id)
		if err != nil {
			if occurredError == nil {
				occurredError = ErrorLoadingJob
			}
			occurredError = fmt.Errorf("%w: couldn't load job info for runner %s - %v", occurredError, id, err)
			continue
		}
		jobs = append(jobs, &job)
	}
	return jobs, occurredError
}

func (a *APIClient) LoadRunnerIDs(prefix string) (runnerIds []string, err error) {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) LoadRunnerPortMappings(runnerID string) ([]v1.ContainerPort, error) {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) MonitorEvaluation(evaluationID string, ctx context.Context) error {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) WatchEventStream(ctx context.Context, callbacks *AllocationProcessing) error {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) ExecuteCommand(jobID string, ctx context.Context, command string, tty bool,
	privilegedExecution bool, stdin io.Reader, stdout, stderr io.Writer) (int, error) {
	// TODO: implement me
	panic("implement me")
}

func (a *APIClient) MarkRunnerAsUsed(runnerID string, duration int) error {
	// TODO: implement me
	panic("implement me")
}

// NewExecutorAPI creates a new api client.
// One client is usually sufficient for the complete runtime of the API.
func NewExecutorAPI(kubernetesConfig *rest.Config) (ExecutorAPI, error) {
	client := &APIClient{
		apiQuerier:  &kubernetesAPIClient{},
		evaluations: storage.NewLocalStorage[chan error](),
		allocations: storage.NewMonitoredLocalStorage[*allocationData](monitoring.MeasurementNomadAllocations,
			func(p *write.Point, object *allocationData, _ storage.EventType) {
				p.AddTag(monitoring.InfluxKeyJobID, object.jobID)
				p.AddTag(monitoring.InfluxKeyClientStatus, object.allocClientStatus)
				p.AddTag(monitoring.InfluxKeyNomadNode, object.allocNomadNode)
			}, 0, nil),
	}
	err := client.init(kubernetesConfig)
	return client, err
}
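
// Hypothetical usage (sketch, not part of this commit): inside the cluster the
// rest.Config can come from rest.InClusterConfig(); for local development,
// clientcmd can build one from a kubeconfig file.
//
//	config, err := rest.InClusterConfig()
//	if err != nil {
//		log.WithError(err).Fatal("Could not load in-cluster config")
//	}
//	executorAPI, err := NewExecutorAPI(config)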
func (a *APIClient) initializeAllocations(environmentID dto.EnvironmentID) {
	allocationStubs, err := a.listAllocations()
	if err != nil {
		log.WithError(err).Warn("Could not initialize allocations")
	} else {
		for _, stub := range allocationStubs {
			switch {
			case IsEnvironmentTemplateID(stub.JobID):
				continue
			case !strings.HasPrefix(stub.JobID, RunnerJobID(environmentID, "")):
				continue
			case stub.ClientStatus == structs.AllocClientStatusPending || stub.ClientStatus == structs.AllocClientStatusRunning:
				log.WithField("jobID", stub.JobID).WithField("status", stub.ClientStatus).Debug("Recovered Allocation")
				a.allocations.Add(stub.ID, &allocationData{
					allocClientStatus:  stub.ClientStatus,
					allocDesiredStatus: stub.DesiredStatus,
					jobID:              stub.JobID,
					start:              time.Unix(0, stub.CreateTime),
					allocNomadNode:     stub.NodeName,
				})
			}
		}
	}
}
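The two Execute stubs are the furthest from their Nomad counterparts. In Kubernetes, exec goes through the pod's exec subresource; below is a minimal sketch using client-go's remotecommand package (helper name and pod lookup are assumptions; StreamWithContext requires a reasonably recent client-go, and decoding the remote exit status from the returned error is omitted):

// Sketch only: executing a shell command in a runner pod.
package kubernetes

import (
	"context"
	"io"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/remotecommand"
)

// execInPod is a hypothetical helper mirroring the Execute/ExecuteCommand
// contract: it runs command in podName and streams stdin/stdout/stderr.
func execInPod(ctx context.Context, config *rest.Config, client *kubernetes.Clientset,
	namespace, podName, command string, tty bool,
	stdin io.Reader, stdout, stderr io.Writer) error {
	request := client.CoreV1().RESTClient().Post().
		Resource("pods").Namespace(namespace).Name(podName).
		SubResource("exec").
		VersionedParams(&v1.PodExecOptions{
			Container: TaskName,
			Command:   []string{"/bin/sh", "-c", command}, // shell form, as documented above
			Stdin:     stdin != nil,
			Stdout:    true,
			Stderr:    true,
			TTY:       tty,
		}, scheme.ParameterCodec)

	executor, err := remotecommand.NewSPDYExecutor(config, "POST", request.URL())
	if err != nil {
		return err
	}
	// StreamWithContext blocks until the command finishes; a non-nil error can
	// carry the remote exit status (not decoded in this sketch).
	return executor.StreamWithContext(ctx, remotecommand.StreamOptions{
		Stdin:  stdin,
		Stdout: stdout,
		Stderr: stderr,
		Tty:    tty,
	})
}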