Show training error msg #495

Merged (3 commits) on Dec 16, 2020
4 changes: 2 additions & 2 deletions ramp-engine/ramp_engine/aws/worker.py
@@ -166,7 +166,7 @@ def collect_results(self):
except Exception as e:
logger.error("Error occurred when downloading the logs"
f" from the submission: {e}")
exit_status = 1
exit_status = 2
error_msg = str(e)
self.status = 'error'
if exit_status == 0:
@@ -189,7 +189,7 @@ def collect_results(self):
error_msg = _get_traceback(
aws._get_log_content(self.config, self.submission))
self.status = 'collected'
exit_status, error_msg = 1, ""
exit_status = 1
logger.info(repr(self))
return exit_status, error_msg
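
The two hunks above change the return contract of collect_results: a failure to
download the logs is now reported with exit status 2 instead of 1, and a genuine
training error keeps the traceback in error_msg instead of wiping it with ''.
Below is a minimal sketch of how a caller might interpret the returned pair; the
worker object and the helper name are hypothetical, and the snippet only
illustrates the convention, it is not code from the repository.

def report_collect_results(worker):
    # Illustrative helper only; exit codes follow the convention above:
    #   0 -> training succeeded
    #   1 -> training failed; error_msg holds the traceback from the log
    #   2 -> the log could not be downloaded; error_msg is str(exception)
    exit_status, error_msg = worker.collect_results()
    if exit_status == 0:
        print("submission trained successfully")
    elif exit_status == 2:
        print(f"could not retrieve the training log: {error_msg}")
    else:
        # Before this change, collect_results reset error_msg to "" here,
        # so the Python error raised by the submission was lost.
        print(f"training error:\n{error_msg}")
    return exit_status, error_msg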

10 changes: 7 additions & 3 deletions ramp-engine/ramp_engine/dispatcher.py
@@ -212,7 +212,7 @@ def collect_result(self, session):
for worker, (submission_id, submission_name) in zip(workers,
submissions):
dt = worker.time_since_last_status_check()
if dt is not None and dt < self.time_between_collection:
if (dt is not None) and (dt < self.time_between_collection):
self._processing_worker_queue.put_nowait(
(worker, (submission_id, submission_name)))
time.sleep(0)
@@ -231,20 +231,24 @@ def collect_result(self, session):
else:
self._logger.info(f'Collecting results from worker {worker}')
returncode, stderr = worker.collect_results()

if returncode:
if returncode == 124:
self._logger.info(
f'Worker {worker} killed due to timeout.'
)
submission_status = 'checking_error'
elif returncode == 2:
# Error occurred when downloading the logs
submission_status = 'checking_error'
else:
self._logger.info(
f'Worker {worker} killed due to an error '
f'during training: {stderr}'
)
submission_status = 'training_error'
submission_status = 'training_error'
Collaborator


Ok so basically the issue was that all checking_error were set as training_error?

Contributor Author


No, that was actually another issue. The issue here was that the error message was being set to ''.

else:
submission_status = 'tested'

set_submission_state(
session, submission_id, submission_status
)
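
Condensing the branch shown above, the dispatcher now maps the worker's return
code to a submission state roughly as follows. This is a sketch for readability
only, not the actual implementation (which also logs the timeout and the stderr
from training):

def submission_status_from(returncode):
    # Illustrative summary of the branching in collect_result above.
    if returncode == 0:
        return 'tested'
    if returncode == 124:
        # worker killed due to timeout
        return 'checking_error'
    if returncode == 2:
        # error occurred when downloading the logs
        return 'checking_error'
    # any other non-zero code: error during training, stderr is kept
    return 'training_error'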
3 changes: 2 additions & 1 deletion ramp-engine/ramp_engine/tests/test_aws.py
@@ -105,8 +105,9 @@ class DummyInstance:
exit_status, error_msg = worker.collect_results()
assert 'Error occurred when downloading the logs' in caplog.text
assert 'Trying to download the log once again' in caplog.text
assert exit_status == 1
assert exit_status == 2
assert 'test' in error_msg
assert worker.status == 'error'


@mock.patch('ramp_engine.aws.api._rsync')
82 changes: 78 additions & 4 deletions ramp-engine/ramp_engine/tests/test_dispatcher.py
@@ -1,7 +1,7 @@
import shutil
import os

import pytest
import shutil
from unittest import mock

from ramp_utils import read_config
from ramp_utils.testing import database_config_template
@@ -228,7 +228,6 @@ def test_dispatcher_worker_retry(session_toy):

while not dispatcher._processing_worker_queue.empty():
dispatcher.collect_result(session_toy)

submissions = get_submissions(session_toy, 'iris_test', 'new')
assert submission_name in [sub[1] for sub in submissions]

@@ -253,7 +252,82 @@ def test_dispatcher_aws_not_launching(session_toy_aws, caplog):
assert 'training' not in caplog.text
num_running_workers = dispatcher._processing_worker_queue.qsize()
assert num_running_workers == 0

submissions2 = get_submissions(session_toy_aws, 'iris_aws_test', 'new')
# assert that all the submissions are still in the 'new' state
assert len(submissions) == len(submissions2)


@mock.patch('ramp_engine.aws.api.download_log')
@mock.patch('ramp_engine.aws.api.check_instance_status')
@mock.patch('ramp_engine.aws.api._get_log_content')
@mock.patch('ramp_engine.aws.api._training_successful')
@mock.patch('ramp_engine.aws.api._training_finished')
@mock.patch('ramp_engine.aws.api.is_spot_terminated')
@mock.patch('ramp_engine.aws.api.launch_train')
@mock.patch('ramp_engine.aws.api.upload_submission')
@mock.patch('ramp_engine.aws.api.launch_ec2_instances')
def test_info_on_training_error(test_launch_ec2_instances, upload_submission,
launch_train,
is_spot_terminated, training_finished,
training_successful,
get_log_content, check_instance_status,
download_log,
session_toy_aws,
caplog):
# make sure that the Python error from the solution is passed to the
# dispatcher
# everything should be mocked as correct output from AWS instances
# on setting up the instance and loading the submission
# mock dummy AWS instance
class DummyInstance:
id = 1
test_launch_ec2_instances.return_value = (DummyInstance(),), 0
upload_submission.return_value = 0
launch_train.return_value = 0
is_spot_terminated.return_value = 0
training_finished.return_value = False
download_log.return_value = 0

config = read_config(database_config_template())
event_config = read_config(ramp_aws_config_template())

dispatcher = Dispatcher(config=config,
event_config=event_config,
worker=AWSWorker, n_workers=10,
hunger_policy='exit')
dispatcher.fetch_from_db(session_toy_aws)
dispatcher.launch_workers(session_toy_aws)
num_running_workers = dispatcher._processing_worker_queue.qsize()
# worker, (submission_id, submission_name) = \
# dispatcher._processing_worker_queue.get()
# assert worker.status == 'running'
submissions = get_submissions(session_toy_aws,
'iris_aws_test',
'training')
ids = [submissions[idx][0] for idx in range(len(submissions))]
assert len(submissions) > 1
assert num_running_workers == len(ids)

dispatcher.time_between_collection = 0
training_successful.return_value = False

# now we will end the submission with training error
training_finished.return_value = True
training_error_msg = 'Python error here'
get_log_content.return_value = training_error_msg
check_instance_status.return_value = 'finished'

dispatcher.collect_result(session_toy_aws)

# the worker which we were using should have been torn down
num_running_workers = dispatcher._processing_worker_queue.qsize()

assert num_running_workers == 0

submissions = get_submissions(session_toy_aws,
'iris_aws_test',
'training_error')
assert len(submissions) == len(ids)

submission = get_submission_by_id(session_toy_aws, submissions[0][0])
assert training_error_msg in submission.error_msg
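
A side note on the decorator stack in this new test: unittest.mock applies
stacked mock.patch decorators bottom-up, so the patch closest to the function
(launch_ec2_instances) arrives as the first mock argument and the topmost one
(download_log) as the last, which is why the parameter list reads in the reverse
order of the decorators. A small self-contained sketch of the same pattern,
using arbitrary standard-library targets rather than the ramp_engine ones:

from unittest import mock

@mock.patch('os.path.exists')  # outermost decorator -> last mock argument
@mock.patch('os.getcwd')       # innermost decorator -> first mock argument
def check_patch_order(mock_getcwd, mock_path_exists):
    import os
    mock_getcwd.return_value = '/tmp/ramp'
    mock_path_exists.return_value = True
    # both calls hit the mocks rather than the real os functions
    assert os.getcwd() == '/tmp/ramp'
    assert os.path.exists('/tmp/ramp')

check_patch_order()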