Handle OutOfMemory runner errors with gVisor

2023-07-06 19:03:24 +02:00
parent 567694fe03
commit fd2d94568a
5 changed files with 36 additions and 18 deletions
--- a/app/controllers/submissions_controller.rb
+++ b/app/controllers/submissions_controller.rb
@@ -99,8 +99,7 @@ class SubmissionsController < ApplicationController
    end
  end
-  # rubocop:disable Metrics/CyclomaticComplexity
+  def run # rubocop:disable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
  def run
    # These method-local socket variables are required in order to use one socket
    # in the callbacks of the other socket. As the callbacks for the client socket
    # are registered first, the runner socket may still be nil.
@@ -199,12 +198,6 @@ class SubmissionsController < ApplicationController
          end
        stream = @testrun[:status] == :ok ? :stdout : :stderr
        send_and_store client_socket, {cmd: :write, stream:, data: "#{exit_statement}\n"}
        if exit_code == 137
          send_and_store client_socket, {cmd: :status, status: :out_of_memory}
          @testrun[:status] = :out_of_memory
        end
        # The client connection will be closed once the file listing finished.
      end
      runner_socket.on :files do |files|
@@ -213,30 +206,33 @@ class SubmissionsController < ApplicationController
          js_tree = FileTree.new(downloadable_files).to_js_tree
          send_and_store client_socket, {cmd: :files, data: js_tree}
        end
        close_client_connection(client_socket)
      end
    end
    @testrun[:container_execution_time] = durations[:execution_duration]
    @testrun[:waiting_for_container_time] = durations[:waiting_duration]
  rescue Runner::Error::ExecutionTimeout => e
    send_and_store client_socket, {cmd: :status, status: :timeout}
    close_client_connection(client_socket)
    Rails.logger.debug { "Running a submission timed out: #{e.message}" }
    @testrun[:status] ||= :timeout
    @testrun[:output] = "timeout: #{@testrun[:output]}"
    extract_durations(e)
  rescue Runner::Error::OutOfMemory => e
    send_and_store client_socket, {cmd: :status, status: :out_of_memory}
    Rails.logger.debug { "Running a submission caused an out of memory error: #{e.message}" }
    @testrun[:status] ||= :out_of_memory
    @testrun[:exit_code] ||= 137
    @testrun[:output] = "out_of_memory: #{@testrun[:output]}"
    extract_durations(e)
  rescue Runner::Error => e
    # Regardless of the specific error cause, we send a `container_depleted` status to the client.
    send_and_store client_socket, {cmd: :status, status: :container_depleted}
    close_client_connection(client_socket)
    @testrun[:status] ||= :container_depleted
    Rails.logger.debug { "Runner error while running a submission: #{e.message}" }
    extract_durations(e)
  ensure
    close_client_connection(client_socket)
    save_testrun_output 'run'
  end
  # rubocop:enable Metrics/CyclomaticComplexity:
  def score
    client_socket = nil
@@ -256,14 +252,14 @@ class SubmissionsController < ApplicationController
    client_socket&.send_data(JSON.dump(@submission.calculate_score))
    # To enable hints when scoring a submission, uncomment the next line:
    # send_hints(client_socket, StructuredError.where(submission: @submission))
    kill_client_socket(client_socket)
  rescue Runner::Error => e
    extract_durations(e)
    send_and_store client_socket, {cmd: :status, status: :container_depleted}
    kill_client_socket(client_socket)
    Rails.logger.debug { "Runner error while scoring submission #{@submission.id}: #{e.message}" }
    @testrun[:passed] = false
    save_testrun_output 'assess'
  ensure
    kill_client_socket(client_socket)
  end
  def create
@@ -289,7 +285,6 @@ class SubmissionsController < ApplicationController
    # The score is stored separately, we can forward it to the client immediately
    client_socket&.send_data(JSON.dump(@submission.test(@file)))
    kill_client_socket(client_socket)
  rescue Runner::Error => e
    extract_durations(e)
    send_and_store client_socket, {cmd: :status, status: :container_depleted}
@@ -297,6 +292,8 @@ class SubmissionsController < ApplicationController
    Rails.logger.debug { "Runner error while testing submission #{@submission.id}: #{e.message}" }
    @testrun[:passed] = false
    save_testrun_output 'assess'
  ensure
    kill_client_socket(client_socket)
  end
  private
--- a/app/errors/runner/error.rb
+++ b/app/errors/runner/error.rb
@@ -24,6 +24,8 @@ class Runner
    class WorkspaceError < Error; end
    class OutOfMemory < Error; end
    class Unknown < Error; end
  end
 end
--- a/app/models/runner.rb
+++ b/app/models/runner.rb
@@ -137,6 +137,9 @@ class Runner < ApplicationRecord
    rescue Runner::Error::ExecutionTimeout => e
      Rails.logger.debug { "Running command `#{command}` timed out: #{e.message}" }
      output.merge!(status: :timeout, container_execution_time: e.execution_duration)
    rescue Runner::Error::OutOfMemory => e
      Rails.logger.debug { "Running command `#{command}` caused an out of memory error: #{e.message}" }
      output.merge!(status: :out_of_memory, container_execution_time: e.execution_duration)
    rescue Runner::Error::RunnerNotFound => e
      Rails.logger.debug { "Running command `#{command}` failed for the first time: #{e.message}" }
      try += 1
--- a/lib/runner/connection.rb
+++ b/lib/runner/connection.rb
@@ -161,9 +161,19 @@ class Runner::Connection
        # However, it might not be required for Poseidon.
        @strategy.destroy_at_management
        @error = Runner::Error::ExecutionTimeout.new('Execution exceeded its time limit')
      when :out_of_memory
        # This status is only used by Poseidon (with gVisor).
        # The runner will be destroyed (and recreated) automatically.
        @error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
      when :terminated_by_codeocean, :terminated_by_management
        # Poseidon (without gVisor) and DockerContainerPool do not handle memory limits explicitly.
        # Instead, they signal that the program was terminated with exit code 137 (128 + 9).
        if @exit_code == 137
          @error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
        else
          @exit_callback.call @exit_code
          list_filesystem
        end
      when :terminated_by_client, :error
        @strategy.destroy_at_management
      else # :established
@@ -223,6 +233,11 @@ class Runner::Connection
  end
  def handle_error(event)
    # Poseidon (with gVisor enabled!) sends an error message when the execution exceeds its memory limit.
    # This is not an error in the sense of the runner management but rather a message.
    # We handle it here to avoid the error handling in the default case.
    return @status = :out_of_memory if event[:data] == 'the allocation was OOM Killed'
    # In case of a (Nomad) error during execution, the runner management will notify us with an error message here.
    # This shouldn't happen too often and can be considered an internal server error by the runner management.
    # More information is available in the logs of the runner management or the orchestrator (e.g., Nomad).
--- a/spec/controllers/submissions_controller_spec.rb
+++ b/spec/controllers/submissions_controller_spec.rb
@@ -161,6 +161,7 @@ describe SubmissionsController do
    context 'when no errors occur during execution' do
      before do
        allow_any_instance_of(described_class).to receive(:hijack)
        allow_any_instance_of(described_class).to receive(:close_client_connection)
        allow_any_instance_of(Submission).to receive(:run).and_return({})
        allow_any_instance_of(described_class).to receive(:save_testrun_output)
        perform_request