From 614f17308f84051b287f230bec7e546d58ceb93b Mon Sep 17 00:00:00 2001
From: haixiw
Date: Tue, 20 Jun 2023 21:45:35 +0000
Subject: [PATCH 1/5] test log

---
 src/sagemaker_xgboost_container/metrics/custom_metrics.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/sagemaker_xgboost_container/metrics/custom_metrics.py b/src/sagemaker_xgboost_container/metrics/custom_metrics.py
index 5fc3f46d..463bd068 100644
--- a/src/sagemaker_xgboost_container/metrics/custom_metrics.py
+++ b/src/sagemaker_xgboost_container/metrics/custom_metrics.py
@@ -21,6 +21,7 @@
     r2_score,
     recall_score,
 )
+import logging
 
 
 # From 1.2, custom evaluation metric receives raw prediction.
@@ -133,6 +134,8 @@ def rmse(preds, dtrain):
     :return: Metric name, root mean squared error
     """
     labels = dtrain.get_label()
+    logging.info(f"Here's the labels: {labels}")
+    logging.info(f"Here's the preds {preds}")
     return "rmse", mean_squared_error(labels, preds, squared=False)

From 9be1569da81987a995fd426b4cc0824e642692d1 Mon Sep 17 00:00:00 2001
From: haixiw
Date: Wed, 21 Jun 2023 21:43:32 +0000
Subject: [PATCH 2/5] Fix the distributed CPU training error when validation
 data can't be evenly divided

---
 .../algorithm_mode/train.py | 25 ++++++++++++++++---
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py
index 24641614..e3c9709d 100644
--- a/src/sagemaker_xgboost_container/algorithm_mode/train.py
+++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py
@@ -198,6 +198,9 @@ def sagemaker_train(
     train_dmatrix, val_dmatrix, train_val_dmatrix = get_validated_dmatrices(
         train_path, val_path, file_type, csv_weights, is_pipe, combine_train_val
     )
+
+    missing_validation_data = validation_channel and not val_dmatrix
+
     train_args = dict(
         train_cfg=validated_train_config,
         train_dmatrix=train_dmatrix,
@@ -210,22 +213,34 @@ def sagemaker_train(
         # Wait for hosts to find each other
         logging.info(f"Distributed node training with {num_hosts} hosts: {sm_hosts}")
         distributed.wait_hostname_resolution(sm_hosts)
+        include_in_training = True
         if not train_dmatrix:
             logging.warning(
-                "Host {} does not have data. Will broadcast to cluster and will not be used in distributed"
-                " training.".format(sm_current_host)
+                f"Host {sm_current_host} does not have training data. Will broadcast to cluster and this host {sm_current_host} "
+                f"will not be used in distributed training. Please divide the training data across instances properly. "
+                f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#Instance-XGBoost-distributed-training-divide-data. "
             )
+            include_in_training = False
+        elif missing_validation_data:
+            logging.warning(
+                f"Host {sm_current_host} does not have validation data in the validation channel : {validation_channel}. "
+                f"Will broadcast to cluster and this host {sm_current_host} will not be used in distributed training. "
+                f"Please divide the training data across instances properly. "
+                f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#Instance-XGBoost-distributed-training-divide-data. "
" ) + include_in_training = False + distributed.rabit_run( exec_fun=train_job, args=train_args, - include_in_training=(train_dmatrix is not None), + include_in_training=include_in_training, hosts=sm_hosts, current_host=sm_current_host, update_rabit_args=True, ) elif num_hosts == 1: if train_dmatrix: - if validation_channel and not val_dmatrix: + if missing_validation_data: raise exc.UserError(f"No data in validation channel path {val_path}") logging.info("Single node training.") train_args.update({"is_master": True}) @@ -277,6 +292,8 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di logging.info(f"Train matrix has {train_dmatrix.num_row()} rows and {train_dmatrix.num_col()} columns") if val_dmatrix: logging.info(f"Validation matrix has {val_dmatrix.num_row()} rows") + else: + logging.info("No validation data is collected for this training job.") try: kfold = train_cfg.pop("_kfold", None) From b2e45630d5d6c5429144c2ad6fc1295ba4c88321 Mon Sep 17 00:00:00 2001 From: haixiw Date: Wed, 21 Jun 2023 22:00:10 +0000 Subject: [PATCH 3/5] remove some logs --- src/sagemaker_xgboost_container/metrics/custom_metrics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/metrics/custom_metrics.py b/src/sagemaker_xgboost_container/metrics/custom_metrics.py index 463bd068..5fc3f46d 100644 --- a/src/sagemaker_xgboost_container/metrics/custom_metrics.py +++ b/src/sagemaker_xgboost_container/metrics/custom_metrics.py @@ -21,7 +21,6 @@ r2_score, recall_score, ) -import logging # From 1.2, custom evaluation metric receives raw prediction. @@ -134,8 +133,6 @@ def rmse(preds, dtrain): :return: Metric name, root mean squared error """ labels = dtrain.get_label() - logging.info(f"Here's the labels: {labels}") - logging.info(f"Here's the preds {preds}") return "rmse", mean_squared_error(labels, preds, squared=False) From 69e1e6f789a26e7cde56935bd7e0a442809ed799 Mon Sep 17 00:00:00 2001 From: haixiw Date: Thu, 22 Jun 2023 00:09:23 +0000 Subject: [PATCH 4/5] fix flake8 --- .../algorithm_mode/train.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index e3c9709d..60248e58 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -216,17 +216,20 @@ def sagemaker_train( include_in_training = True if not train_dmatrix: logging.warning( - f"Host {sm_current_host} does not have training data. Will broadcast to cluster and this host {sm_current_host} " - f"will not be used in distributed training. Please divide the training data across instances properly. " - f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#Instance-XGBoost-distributed-training-divide-data. " + f"Host {sm_current_host} does not have training data. Will broadcast to " + f"cluster and this host {sm_current_host} will not be used in distributed training. " + f"Please divide the training data across instances properly. See https://docs.aws.amazon.com/" + f"sagemaker/latest/dg/xgboost.html#Instance-XGBoost-distributed-training-divide-data. " ) include_in_training = False - elif missing_validation_data: + if missing_validation_data: logging.warning( - f"Host {sm_current_host} does not have validation data in the validation channel : {validation_channel}. 
" - f"Will broadcast to cluster and this host {sm_current_host} will not be used in distributed training. " - f"Please divide the training data across instances properly. " - f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#Instance-XGBoost-distributed-training-divide-data. " + f"Host {sm_current_host} does not have validation data " + f"in the validation channel : {validation_channel}. " + f"Will broadcast to cluster and this host {sm_current_host} will not be used " + f"in distributed training. Please divide the training data across instances properly. " + f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html" + f"#Instance-XGBoost-distributed-training-divide-data. " ) include_in_training = False From fa371c78f5dfcd967f06b5b708900dbd0dd7f34f Mon Sep 17 00:00:00 2001 From: haixiw Date: Thu, 22 Jun 2023 04:47:52 +0000 Subject: [PATCH 5/5] minor change --- src/sagemaker_xgboost_container/algorithm_mode/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sagemaker_xgboost_container/algorithm_mode/train.py b/src/sagemaker_xgboost_container/algorithm_mode/train.py index 60248e58..37b34365 100644 --- a/src/sagemaker_xgboost_container/algorithm_mode/train.py +++ b/src/sagemaker_xgboost_container/algorithm_mode/train.py @@ -227,7 +227,7 @@ def sagemaker_train( f"Host {sm_current_host} does not have validation data " f"in the validation channel : {validation_channel}. " f"Will broadcast to cluster and this host {sm_current_host} will not be used " - f"in distributed training. Please divide the training data across instances properly. " + f"in distributed training. Please divide the validation data across instances properly. " f"See https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html" f"#Instance-XGBoost-distributed-training-divide-data. " ) @@ -295,8 +295,6 @@ def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_di logging.info(f"Train matrix has {train_dmatrix.num_row()} rows and {train_dmatrix.num_col()} columns") if val_dmatrix: logging.info(f"Validation matrix has {val_dmatrix.num_row()} rows") - else: - logging.info("No validation data is collected for this training job.") try: kfold = train_cfg.pop("_kfold", None)