NVIDIA CosmoFlow implementation: MXNet error when increasing batch size

37 views
Skip to first unread message

Karan Singh

unread,
Mar 21, 2022, 1:18:03 PM3/21/22
to public
Benchmark running on a Dell R750xa server with 4 x A100 (80 GB) GPUs.
Below is the trace from a working run with batch size == 16:

`root@7881f8df9636:/workspace/cosmoflow# numactl --physcpubind=0-15,64-79 -- python train.py --log-prefix 'run_220315140551795650176_{}_1.log' --data-root-dir /mnt --num-epochs 3 --target-mae 0.124 --base-lr 0.004 --initial-lr 0.001 --momentum 0.9 --weight-decay 0.0 --warmup-epochs 1 --lr-scheduler-epochs 16 32 --lr-scheduler-decays 0.25 0.125 --training-batch-size 16 --validation-batch-size 16 --training-samples -1 --validation-samples -1 --data-layout NDHWC --data-shard-multiplier 1 --dali-num-threads 6 --shard-type local --seed 23739 --grad-prediv-factor 1.0 --instances 1 --spatial-span 1 --load-checkpoint '' --save-checkpoint /results/checkpoint.data --apply-log-transform --shuffle --preshuffle --use-amp Namespace(apply_log_transform=True, base_lr=0.004, config_file=None, cuda_profiler_range='', dali_num_threads=6, dali_use_mmap=False, data_layout='NDHWC', data_root_dir=PosixPath('/mnt'), data_shard_multiplier=1, dropout=0.5, grad_prediv_factor=1.0, initial_lr=0.001, instances=1, load_checkpoint='', log_prefix='run_220315140551795650176_{}_1.log', lr_scheduler_decays=[0.25, 0.125], lr_scheduler_epochs=[16, 32], momentum=0.9, num_epochs=3, preshuffle=True, prestage=False, profile=False, save_checkpoint='/results/checkpoint.data', seed=23739, shard_type='local', shuffle=True, spatial_span=1, static_loss_scale=16384, target_mae=0.124, training_batch_size=16, training_samples=-1, use_amp=True, use_fp16=False, validation_batch_size=16, validation_samples=-1, warmup_epochs=1, weight_decay=0.0) :::MLLOG {"namespace": "", "time_ms": 1647415346630, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": null, "metadata": {"file": "train.py", "lineno": 134}} :::MLLOG {"namespace": "", "time_ms": 1647415346673, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "train.py", "lineno": 135}} :::MLLOG {"namespace": "", "time_ms": 1647415346673, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "cosmoflow", 
"metadata": {"file": "train.py", "lineno": 137}} :::MLLOG {"namespace": "", "time_ms": 1647415346673, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "NVIDIA", "metadata": {"file": "train.py", "lineno": 139}} :::MLLOG {"namespace": "", "time_ms": 1647415346674, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "train.py", "lineno": 140}} :::MLLOG {"namespace": "", "time_ms": 1647415346674, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "train.py", "lineno": 141}} :::MLLOG {"namespace": "", "time_ms": 1647415346674, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xNVIDIA DGX A100", "metadata": {"file": "train.py", "lineno": 142}} :::MLLOG {"namespace": "", "time_ms": 1647415346674, "event_type": "POINT_IN_TIME", "key": "number_of_nodes", "value": 1, "metadata": {"file": "train.py", "lineno": 145}} :::MLLOG {"namespace": "", "time_ms": 1647415346674, "event_type": "POINT_IN_TIME", "key": "accelerators_per_node", "value": 1, "metadata": {"file": "train.py", "lineno": 146}} [07:22:26] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for CPU [07:22:29] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for GPU :::MLLOG {"namespace": "", "time_ms": 1647415377491, "event_type": "POINT_IN_TIME", "key": "opt_weight_decay", "value": 0.0, "metadata": {"file": "train.py", "lineno": 165}} :::MLLOG {"namespace": "", "time_ms": 1647415377492, "event_type": "POINT_IN_TIME", "key": "dropout", "value": 0.5, "metadata": {"file": "train.py", "lineno": 167}} :::MLLOG {"namespace": "", "time_ms": 1647415377734, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 16, "metadata": {"file": "/workspace/cosmoflow/data.py", "lineno": 352}} :::MLLOG {"namespace": "", "time_ms": 1647415377735, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 117557, "metadata": {"file": 
"/workspace/cosmoflow/data.py", "lineno": 354}} :::MLLOG {"namespace": "", "time_ms": 1647415377735, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 166, "metadata": {"file": "/workspace/cosmoflow/data.py", "lineno": 355}} :::MLLOG {"namespace": "", "time_ms": 1647415377735, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.004, "metadata": {"file": "train.py", "lineno": 92}} :::MLLOG {"namespace": "", "time_ms": 1647415377736, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_epochs", "value": 1, "metadata": {"file": "train.py", "lineno": 94}} :::MLLOG {"namespace": "", "time_ms": 1647415377736, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 4, "metadata": {"file": "train.py", "lineno": 96}} :::MLLOG {"namespace": "", "time_ms": 1647415377736, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_decay_boundary_epochs", "value": [16, 32], "metadata": {"file": "train.py", "lineno": 98}} :::MLLOG {"namespace": "", "time_ms": 1647415377736, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_decay_factor", "value": [0.25, 0.125], "metadata": {"file": "train.py", "lineno": 100}} :::MLLOG {"namespace": "", "time_ms": 1647415377736, "event_type": "POINT_IN_TIME", "key": "opt_name", "value": "sgd", "metadata": {"file": "train.py", "lineno": 184}} :::MLLOG {"namespace": "", "time_ms": 1647415377737, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/workspace/cosmoflow/utils.py", "lineno": 144}} :::MLLOG {"namespace": "", "time_ms": 1647415377738, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "train.py", "lineno": 206}} :::MLLOG {"namespace": "", "time_ms": 1647415377738, "event_type": "INTERVAL_START", "key": "staging_start", "value": null, "metadata": {"file": "/workspace/cosmoflow/data.py", "lineno": 359}} :::MLLOG {"namespace": "", "time_ms": 1647415380101, "event_type": "INTERVAL_END", "key": 
"staging_stop", "value": null, "metadata": {"file": "/workspace/cosmoflow/data.py", "lineno": 362, "staging_duration": 2.3632540702819824}} :::MLLOG {"namespace": "", "time_ms": 1647415380101, "event_type": "INTERVAL_START", "key": "epoch_start", "value": null, "metadata": {"file": "train.py", "lineno": 215, "epoch_num": 1}} `

Below is the failing trace with batch size == 32:

root@7881f8df9636:/workspace/cosmoflow# numactl --physcpubind=0-15,64-79 -- python train.py --log-prefix 'run_220315140551795650176_{}_1.log' --data-root-dir /mnt --num-epochs 3 --target-mae 0.124 --base-lr 0.004 --initial-lr 0.001 --momentum 0.9 --weight-decay 0.0 --warmup-epochs 1 --lr-scheduler-epochs 16 32 --lr-scheduler-decays 0.25 0.125 --training-batch-size 32 --validation-batch-size 32 --training-samples -1 --validation-samples -1 --data-layout NDHWC --data-shard-multiplier 1 --dali-num-threads 6 --shard-type local --seed 23739 --grad-prediv-factor 1.0 --instances 1 --spatial-span 1 --load-checkpoint '' --save-checkpoint /results/checkpoint.data --apply-log-transform --shuffle --preshuffle --use-amp Namespace(apply_log_transform=True, base_lr=0.004, config_file=None, cuda_profiler_range='', dali_num_threads=6, dali_use_mmap=False, data_layout='NDHWC', data_root_dir=PosixPath('/mnt'), data_shard_multiplier=1, dropout=0.5, grad_prediv_factor=1.0, initial_lr=0.001, instances=1, load_checkpoint='', log_prefix='run_220315140551795650176_{}_1.log', lr_scheduler_decays=[0.25, 0.125], lr_scheduler_epochs=[16, 32], momentum=0.9, num_epochs=3, preshuffle=True, prestage=False, profile=False, save_checkpoint='/results/checkpoint.data', seed=23739, shard_type='local', shuffle=True, spatial_span=1, static_loss_scale=16384, target_mae=0.124, training_batch_size=32, training_samples=-1, use_amp=True, use_fp16=False, validation_batch_size=32, validation_samples=-1, warmup_epochs=1, weight_decay=0.0) :::MLLOG {"namespace": "", "time_ms": 1647415934445, "event_type": "POINT_IN_TIME", "key": "cache_clear", "value": null, "metadata": {"file": "train.py", "lineno": 134}} :::MLLOG {"namespace": "", "time_ms": 1647415934488, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "train.py", "lineno": 135}} :::MLLOG {"namespace": "", "time_ms": 1647415934488, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "cosmoflow", 
"metadata": {"file": "train.py", "lineno": 137}} :::MLLOG {"namespace": "", "time_ms": 1647415934488, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "NVIDIA", "metadata": {"file": "train.py", "lineno": 139}} :::MLLOG {"namespace": "", "time_ms": 1647415934488, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "train.py", "lineno": 140}} :::MLLOG {"namespace": "", "time_ms": 1647415934489, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "train.py", "lineno": 141}} :::MLLOG {"namespace": "", "time_ms": 1647415934489, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xNVIDIA DGX A100", "metadata": {"file": "train.py", "lineno": 142}} :::MLLOG {"namespace": "", "time_ms": 1647415934489, "event_type": "POINT_IN_TIME", "key": "number_of_nodes", "value": 1, "metadata": {"file": "train.py", "lineno": 145}} :::MLLOG {"namespace": "", "time_ms": 1647415934489, "event_type": "POINT_IN_TIME", "key": "accelerators_per_node", "value": 1, "metadata": {"file": "train.py", "lineno": 146}} [07:32:14] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for CPU [07:32:16] ../src/storage/storage.cc:199: Using Pooled (Naive) StorageManager for GPU Traceback (most recent call last): File "train.py", line 389, in <module> main(parser.parse_args()) File "train.py", line 157, in main model = initialize_model(dist_desc, use_amp=args.use_amp, File "train.py", line 270, in initialize_model network.init(mx.gpu(dist_desc.local_rank), batch_size, File "/workspace/cosmoflow/model.py", line 170, in init self.warmup(ctx, dist_desc, use_amp, batch_size) File "/workspace/cosmoflow/model.py", line 207, in warmup _ = output.asnumpy() File "/opt/mxnet/python/mxnet/ndarray/ndarray.py", line 2578, in asnumpy check_call(_LIB.MXNDArraySyncCopyToCPU( File "/opt/mxnet/python/mxnet/base.py", line 246, in check_call raise get_last_ffi_error() 
mxnet.base.MXNetError: Traceback (most recent call last): File "../include/mxnet/./tuple.h", line 421 MXNetError: Check failed: dim_size >= -1 (-2147483648 vs. -1) : shape dim size must be >= -1, while received -2147483648

Do we need to change anything here in "model.py"?

def init(self, ctx, batch_size: int, use_amp: bool, use_wd: bool, dist_desc: utils.DistributedEnvDesc, checkpoint: Optional[str] = None, dtype: str = "float32"): self.model.initialize(init=DefaultInitializer(layout=self.layout), ctx=ctx) input_shape = (batch_size, 4, 128, 128, 128) if self.layout == "NCDHW" \ else (batch_size, 128, 128, 128, 4) random_batch = (mx.nd.random.uniform(shape=input_shape, ctx=ctx, dtype=dtype), mx.nd.random.uniform(shape=(batch_size, 4), ctx=ctx, dtype="float32"))
 
Reply all
Reply to author
Forward
0 new messages