From f537e18d96588caae0b6cec50652e98c13d27154 Mon Sep 17 00:00:00 2001 From: felihong Date: Thu, 24 Nov 2022 10:46:07 +0100 Subject: [PATCH 1/4] Update docker images due to GPG error caused by Nvidia pubkey rotation. For more details see https://github.com/NVIDIA/nvidia-docker/issues/1632 --- docker/tf_1/Dockerfile | 3 +++ docker/tf_2/Dockerfile | 3 +++ 2 files changed, 6 insertions(+) diff --git a/docker/tf_1/Dockerfile b/docker/tf_1/Dockerfile index 7408249..bc5e9f6 100644 --- a/docker/tf_1/Dockerfile +++ b/docker/tf_1/Dockerfile @@ -1,6 +1,9 @@ # Base Image FROM tensorflow/tensorflow:1.15.2-gpu-py3 +RUN rm /etc/apt/sources.list.d/cuda.list +RUN rm /etc/apt/sources.list.d/nvidia-ml.list + RUN apt-get update && apt-get -q -y install git-core wget # Installing the Tensorflow Object Detection API diff --git a/docker/tf_2/Dockerfile b/docker/tf_2/Dockerfile index fd7b99d..4856778 100644 --- a/docker/tf_2/Dockerfile +++ b/docker/tf_2/Dockerfile @@ -1,6 +1,9 @@ # Base Image FROM tensorflow/tensorflow:2.6.0-gpu +RUN rm /etc/apt/sources.list.d/cuda.list +RUN rm /etc/apt/sources.list.d/nvidia-ml.list + RUN apt-get update && apt-get -q -y install git-core wget # Installing the Tensorflow Object Detection API From 1db51962b8206331c1727f69d833232e61ac3ef9 Mon Sep 17 00:00:00 2001 From: felihong Date: Thu, 24 Nov 2022 10:48:05 +0100 Subject: [PATCH 2/4] Update requirements.txt in docker image environment. Update OpenSSL lib version to fix training submission error. 
--- docker/tf_2/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/tf_2/requirements.txt b/docker/tf_2/requirements.txt index 5583efe..ac0b728 100644 --- a/docker/tf_2/requirements.txt +++ b/docker/tf_2/requirements.txt @@ -13,4 +13,5 @@ Pillow==8.4.0 jellyfish==0.8.2 matplotlib==3.3.4 opencv-python-headless==4.5.3.56 -pyparsing==2.4.7 \ No newline at end of file +pyparsing==2.4.7 +pyOpenSSL==22.0.0 \ No newline at end of file From bc7102eb4c39aa022392925bc9f91dc9dd912c12 Mon Sep 17 00:00:00 2001 From: felihong Date: Thu, 24 Nov 2022 10:56:21 +0100 Subject: [PATCH 3/4] Downgrade Jinja2 and protobuf to older versions due to recent Jinja template changes. --- src/deployment/conda_env.yml | 2 ++ src/deployment/conda_env_tf2.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/deployment/conda_env.yml b/src/deployment/conda_env.yml index c0f4247..2877624 100644 --- a/src/deployment/conda_env.yml +++ b/src/deployment/conda_env.yml @@ -11,5 +11,7 @@ dependencies: - tensorflow-gpu==1.15 - numpy==1.21.5 - itsdangerous==2.0.1 + - Jinja2==3.0.3 + - protobuf==3.20.0 name: basic_env \ No newline at end of file diff --git a/src/deployment/conda_env_tf2.yml b/src/deployment/conda_env_tf2.yml index d93e585..c7986c4 100644 --- a/src/deployment/conda_env_tf2.yml +++ b/src/deployment/conda_env_tf2.yml @@ -11,5 +11,7 @@ dependencies: - tensorflow==2.8.0 - numpy==1.21.5 - itsdangerous==2.0.1 + - Jinja2==3.0.3 + - protobuf==3.20.0 name: basic_env \ No newline at end of file From 38f043a63808f3f796f840346c8e0e1c1860afc3 Mon Sep 17 00:00:00 2001 From: felihong Date: Thu, 24 Nov 2022 10:59:36 +0100 Subject: [PATCH 4/4] Add-on feature: enable batch size specification in training job submission. This is handy when using training instances with multiple GPUs attached. 
--- src/training/exp_config_sample.json | 1 + src/training/submit_training.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/training/exp_config_sample.json b/src/training/exp_config_sample.json index 9a677ec..732e569 100644 --- a/src/training/exp_config_sample.json +++ b/src/training/exp_config_sample.json @@ -9,6 +9,7 @@ "IMAGE_TYPE" : "", "TRAIN_CSV" : "", "TEST_CSV" : "", + "BATCH_SIZE": 1, "EVAL_CONF" : 0.5, "RUN_PARAMS": { "STEPS": 1000 diff --git a/src/training/submit_training.py b/src/training/submit_training.py index 833e12b..17c76f5 100644 --- a/src/training/submit_training.py +++ b/src/training/submit_training.py @@ -43,6 +43,7 @@ def main(): img_type = exp_config['IMAGE_TYPE'] train_csv = exp_config['TRAIN_CSV'] test_csv = exp_config['TEST_CSV'] + batch_size = exp_config['BATCH_SIZE'] base_model = (exp_config['MODEL_PARAMS'] ['BASE_MODEL']) steps = (exp_config['RUN_PARAMS'] @@ -87,6 +88,7 @@ def main(): '--image_type', img_type, '--train_csv', train_csv, '--test_csv', test_csv, + '--batch_size', batch_size, '--base_model', base_model, '--steps', steps, '--fs_nms_iou', fs_nms_iou, @@ -109,6 +111,7 @@ def main(): '--image_type', img_type, '--train_csv', train_csv, '--test_csv', test_csv, + '--batch_size', batch_size, '--base_model', base_model, '--steps', steps, '--eval_conf', eval_conf]