Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

resume.py 1.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Resume all interrupted trainings in yolov5/ dir including DDP trainings
# Usage: $ python utils/aws/resume.py
import os
import shlex
import sys
from pathlib import Path

import torch
import yaml
  8. FILE = Path(__file__).resolve()
  9. ROOT = FILE.parents[2] # YOLOv5 root directory
  10. if str(ROOT) not in sys.path:
  11. sys.path.append(str(ROOT)) # add ROOT to PATH
  12. port = 0 # --master_port
  13. path = Path('').resolve()
  14. for last in path.rglob('*/**/last.pt'):
  15. ckpt = torch.load(last)
  16. if ckpt['optimizer'] is None:
  17. continue
  18. # Load opt.yaml
  19. with open(last.parent.parent / 'opt.yaml') as f:
  20. opt = yaml.safe_load(f)
  21. # Get device count
  22. d = opt['device'].split(',') # devices
  23. nd = len(d) # number of devices
  24. ddp = nd > 1 or (nd == 0 and torch.cuda.device_count() > 1) # distributed data parallel
  25. if ddp: # multi-GPU
  26. port += 1
  27. cmd = f'python -m torch.distributed.run --nproc_per_node {nd} --master_port {port} train.py --resume {last}'
  28. else: # single-GPU
  29. cmd = f'python train.py --resume {last}'
  30. cmd += ' > /dev/null 2>&1 &' # redirect output to dev/null and run in daemon thread
  31. print(cmd)
  32. os.system(cmd)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...