Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

onegpu.sh 1.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  1. #!/usr/bin/bash
  2. #SBATCH --job-name=epa_ddp
  3. #SBATCH --nodes=1
  4. #SBATCH --cpus-per-task=8
  5. #SBATCH --mem=24G
  6. #SBATCH --account=standby
  7. #SBATCH --gres=gpu:1
  8. #SBATCH --time=4:00:00
  9. #SBATCH --output=epa_ddp_%j.out
  10. #SBATCH --error=epa_ddp_%j.err
  11. module load cuda/12.1.1 cudnn/9.2.0.82-12 conda
  12. #conda activate /scratch/gilbreth/rai53/fire
  13. #export MLFLOW_TRACKING_USERNAME=$MLFLOW_USERNAME
  14. #export MLFLOW_TRACKING_PASSWORD=$MLFLOW_TOKEN
  15. export NCCL_NET_MERGE_LEVEL=LOC
  16. export NCCL_NET_MERGE_LEVEL=LOC
  17. export NCCL_SOCKET_IFNAME=^lo,docker
  18. export TORCH_NCCL_BLOCKING_WAIT=1
  19. export NCCL_TIMEOUT_SEC=1200 # 20 minutes timeout
  20. export NCCL_IB_DISABLE=0 # Enable InfiniBand if available
  21. export NCCL_P2P_DISABLE=0 # Enable P2P if available
  22. export NCCL_SHM_DISABLE=0 # Enable shared memory
  23. export OMP_NUM_THREADS=8
  24. export NCCL_DEBUG=INFO
  25. export NCCL_DEBUG_SUBSYS=ALL
  26. torchrun --standalone --nnodes=1 --nproc_per_node=1 \
  27. epa_seq2seq.py \
  28. ../config/training_config_v1.json \
  29. ../data/group0.json \
  30. ../data/in_situ_prediction_dataset_full \
  31. ../cpts
Tip!

Press p or ← to see the previous file, or n or → to see the next file.

Comments

Loading...