Handle OutOfMemory runner errors with gVisor

This commit is contained in:
Sebastian Serth
2023-07-06 19:03:24 +02:00
committed by Sebastian Serth
parent 567694fe03
commit fd2d94568a
5 changed files with 36 additions and 18 deletions

View File

@ -99,8 +99,7 @@ class SubmissionsController < ApplicationController
end end
end end
# rubocop:disable Metrics/CyclomaticComplexity def run # rubocop:disable Metrics/PerceivedComplexity, Metrics/CyclomaticComplexity
def run
# These method-local socket variables are required in order to use one socket # These method-local socket variables are required in order to use one socket
# in the callbacks of the other socket. As the callbacks for the client socket # in the callbacks of the other socket. As the callbacks for the client socket
# are registered first, the runner socket may still be nil. # are registered first, the runner socket may still be nil.
@ -199,12 +198,6 @@ class SubmissionsController < ApplicationController
end end
stream = @testrun[:status] == :ok ? :stdout : :stderr stream = @testrun[:status] == :ok ? :stdout : :stderr
send_and_store client_socket, {cmd: :write, stream:, data: "#{exit_statement}\n"} send_and_store client_socket, {cmd: :write, stream:, data: "#{exit_statement}\n"}
if exit_code == 137
send_and_store client_socket, {cmd: :status, status: :out_of_memory}
@testrun[:status] = :out_of_memory
end
# The client connection will be closed once the file listing finished.
end end
runner_socket.on :files do |files| runner_socket.on :files do |files|
@ -213,30 +206,33 @@ class SubmissionsController < ApplicationController
js_tree = FileTree.new(downloadable_files).to_js_tree js_tree = FileTree.new(downloadable_files).to_js_tree
send_and_store client_socket, {cmd: :files, data: js_tree} send_and_store client_socket, {cmd: :files, data: js_tree}
end end
close_client_connection(client_socket)
end end
end end
@testrun[:container_execution_time] = durations[:execution_duration] @testrun[:container_execution_time] = durations[:execution_duration]
@testrun[:waiting_for_container_time] = durations[:waiting_duration] @testrun[:waiting_for_container_time] = durations[:waiting_duration]
rescue Runner::Error::ExecutionTimeout => e rescue Runner::Error::ExecutionTimeout => e
send_and_store client_socket, {cmd: :status, status: :timeout} send_and_store client_socket, {cmd: :status, status: :timeout}
close_client_connection(client_socket)
Rails.logger.debug { "Running a submission timed out: #{e.message}" } Rails.logger.debug { "Running a submission timed out: #{e.message}" }
@testrun[:status] ||= :timeout @testrun[:status] ||= :timeout
@testrun[:output] = "timeout: #{@testrun[:output]}" @testrun[:output] = "timeout: #{@testrun[:output]}"
extract_durations(e) extract_durations(e)
rescue Runner::Error::OutOfMemory => e
send_and_store client_socket, {cmd: :status, status: :out_of_memory}
Rails.logger.debug { "Running a submission caused an out of memory error: #{e.message}" }
@testrun[:status] ||= :out_of_memory
@testrun[:exit_code] ||= 137
@testrun[:output] = "out_of_memory: #{@testrun[:output]}"
extract_durations(e)
rescue Runner::Error => e rescue Runner::Error => e
# Regardless of the specific error cause, we send a `container_depleted` status to the client. # Regardless of the specific error cause, we send a `container_depleted` status to the client.
send_and_store client_socket, {cmd: :status, status: :container_depleted} send_and_store client_socket, {cmd: :status, status: :container_depleted}
close_client_connection(client_socket)
@testrun[:status] ||= :container_depleted @testrun[:status] ||= :container_depleted
Rails.logger.debug { "Runner error while running a submission: #{e.message}" } Rails.logger.debug { "Runner error while running a submission: #{e.message}" }
extract_durations(e) extract_durations(e)
ensure ensure
close_client_connection(client_socket)
save_testrun_output 'run' save_testrun_output 'run'
end end
# rubocop:enable Metrics/CyclomaticComplexity:
def score def score
client_socket = nil client_socket = nil
@ -256,14 +252,14 @@ class SubmissionsController < ApplicationController
client_socket&.send_data(JSON.dump(@submission.calculate_score)) client_socket&.send_data(JSON.dump(@submission.calculate_score))
# To enable hints when scoring a submission, uncomment the next line: # To enable hints when scoring a submission, uncomment the next line:
# send_hints(client_socket, StructuredError.where(submission: @submission)) # send_hints(client_socket, StructuredError.where(submission: @submission))
kill_client_socket(client_socket)
rescue Runner::Error => e rescue Runner::Error => e
extract_durations(e) extract_durations(e)
send_and_store client_socket, {cmd: :status, status: :container_depleted} send_and_store client_socket, {cmd: :status, status: :container_depleted}
kill_client_socket(client_socket)
Rails.logger.debug { "Runner error while scoring submission #{@submission.id}: #{e.message}" } Rails.logger.debug { "Runner error while scoring submission #{@submission.id}: #{e.message}" }
@testrun[:passed] = false @testrun[:passed] = false
save_testrun_output 'assess' save_testrun_output 'assess'
ensure
kill_client_socket(client_socket)
end end
def create def create
@ -289,7 +285,6 @@ class SubmissionsController < ApplicationController
# The score is stored separately, we can forward it to the client immediately # The score is stored separately, we can forward it to the client immediately
client_socket&.send_data(JSON.dump(@submission.test(@file))) client_socket&.send_data(JSON.dump(@submission.test(@file)))
kill_client_socket(client_socket)
rescue Runner::Error => e rescue Runner::Error => e
extract_durations(e) extract_durations(e)
send_and_store client_socket, {cmd: :status, status: :container_depleted} send_and_store client_socket, {cmd: :status, status: :container_depleted}
@ -297,6 +292,8 @@ class SubmissionsController < ApplicationController
Rails.logger.debug { "Runner error while testing submission #{@submission.id}: #{e.message}" } Rails.logger.debug { "Runner error while testing submission #{@submission.id}: #{e.message}" }
@testrun[:passed] = false @testrun[:passed] = false
save_testrun_output 'assess' save_testrun_output 'assess'
ensure
kill_client_socket(client_socket)
end end
private private

View File

@ -24,6 +24,8 @@ class Runner
class WorkspaceError < Error; end class WorkspaceError < Error; end
class OutOfMemory < Error; end
class Unknown < Error; end class Unknown < Error; end
end end
end end

View File

@ -137,6 +137,9 @@ class Runner < ApplicationRecord
rescue Runner::Error::ExecutionTimeout => e rescue Runner::Error::ExecutionTimeout => e
Rails.logger.debug { "Running command `#{command}` timed out: #{e.message}" } Rails.logger.debug { "Running command `#{command}` timed out: #{e.message}" }
output.merge!(status: :timeout, container_execution_time: e.execution_duration) output.merge!(status: :timeout, container_execution_time: e.execution_duration)
rescue Runner::Error::OutOfMemory => e
Rails.logger.debug { "Running command `#{command}` caused an out of memory error: #{e.message}" }
output.merge!(status: :out_of_memory, container_execution_time: e.execution_duration)
rescue Runner::Error::RunnerNotFound => e rescue Runner::Error::RunnerNotFound => e
Rails.logger.debug { "Running command `#{command}` failed for the first time: #{e.message}" } Rails.logger.debug { "Running command `#{command}` failed for the first time: #{e.message}" }
try += 1 try += 1

View File

@ -161,9 +161,19 @@ class Runner::Connection
# However, it might not be required for Poseidon. # However, it might not be required for Poseidon.
@strategy.destroy_at_management @strategy.destroy_at_management
@error = Runner::Error::ExecutionTimeout.new('Execution exceeded its time limit') @error = Runner::Error::ExecutionTimeout.new('Execution exceeded its time limit')
when :out_of_memory
# This status is only used by Poseidon (with gVisor).
# The runner will be destroyed (and recreated) automatically.
@error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
when :terminated_by_codeocean, :terminated_by_management when :terminated_by_codeocean, :terminated_by_management
# Poseidon (without gVisor) and DockerContainerPool do not handle memory limits explicitly.
# Instead, they signal that the program was terminated with exit code 137 (128 + 9).
if @exit_code == 137
@error = Runner::Error::OutOfMemory.new('Execution exceeded its memory limit')
else
@exit_callback.call @exit_code @exit_callback.call @exit_code
list_filesystem list_filesystem
end
when :terminated_by_client, :error when :terminated_by_client, :error
@strategy.destroy_at_management @strategy.destroy_at_management
else # :established else # :established
@ -223,6 +233,11 @@ class Runner::Connection
end end
def handle_error(event) def handle_error(event)
# Poseidon (with gVisor enabled!) sends an error message when the execution exceeds its memory limit.
# This is not an error in the sense of the runner management but rather a message.
# We handle it here to avoid the error handling in the default case.
return @status = :out_of_memory if event[:data] == 'the allocation was OOM Killed'
# In case of a (Nomad) error during execution, the runner management will notify us with an error message here. # In case of a (Nomad) error during execution, the runner management will notify us with an error message here.
# This shouldn't happen too often and can be considered an internal server error by the runner management. # This shouldn't happen too often and can be considered an internal server error by the runner management.
# More information is available in the logs of the runner management or the orchestrator (e.g., Nomad). # More information is available in the logs of the runner management or the orchestrator (e.g., Nomad).

View File

@ -161,6 +161,7 @@ describe SubmissionsController do
context 'when no errors occur during execution' do context 'when no errors occur during execution' do
before do before do
allow_any_instance_of(described_class).to receive(:hijack) allow_any_instance_of(described_class).to receive(:hijack)
allow_any_instance_of(described_class).to receive(:close_client_connection)
allow_any_instance_of(Submission).to receive(:run).and_return({}) allow_any_instance_of(Submission).to receive(:run).and_return({})
allow_any_instance_of(described_class).to receive(:save_testrun_output) allow_any_instance_of(described_class).to receive(:save_testrun_output)
perform_request perform_request