Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#661 documentation on using configuration files

Merged
Ghost merged 1 commit into Deci-AI:master from deci-ai:feature/SG-608_configuration_files
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  1. import os
  2. from typing import Optional
  3. from super_gradients.common.abstractions.abstract_logger import get_logger
  4. from super_gradients.common.sg_loggers.base_sg_logger import BaseSGLogger
  5. from super_gradients.common.environment.ddp_utils import multi_process_safe
  6. from super_gradients.common.plugins.deci_client import DeciClient
  # Module-level logger used by DeciPlatformSGLogger for its warning/info messages.
  7. logger = get_logger(__name__)
  # File-name prefix used to select the tensorboard events file to upload to the Deci platform.
  8. TENSORBOARD_EVENTS_PREFIX = "events.out.tfevents"
  # File-name prefix used to select the log file to upload to the Deci platform.
  9. LOGS_PREFIX = "log_"
  class DeciPlatformSGLogger(BaseSGLogger):
      """Logger responsible to push logs and tensorboard artifacts to Deci platform.

      Behaves like ``BaseSGLogger`` (all base constructor arguments are forwarded unchanged)
      and, on every ``upload()``, additionally sends the latest tensorboard events file and
      the latest log file found in ``checkpoints_dir_path`` to the Deci platform.
      """

      def __init__(
          self,
          project_name: str,
          experiment_name: str,
          storage_location: str,
          resumed: bool,
          training_params: dict,
          checkpoints_dir_path: str,
          tb_files_user_prompt: bool = False,
          launch_tensorboard: bool = False,
          tensorboard_port: Optional[int] = None,
          save_checkpoints_remote: bool = True,
          save_tensorboard_remote: bool = True,
          save_logs_remote: bool = True,
          monitor_system: bool = True,
          model_name: Optional[str] = None,
      ):
          """
          Initialize the base logger, then register the experiment in the Deci platform.

          :param project_name:            Forwarded to ``BaseSGLogger``.
          :param experiment_name:         Forwarded to ``BaseSGLogger``; also the name under which the experiment is
                                          registered in the Deci platform.
          :param storage_location:        Forwarded to ``BaseSGLogger``.
          :param resumed:                 Forwarded to ``BaseSGLogger``.
          :param training_params:         Forwarded to ``BaseSGLogger``.
          :param checkpoints_dir_path:    Forwarded to ``BaseSGLogger``; also the directory scanned by ``upload()``
                                          for tensorboard and log files.
          :param tb_files_user_prompt:    Forwarded to ``BaseSGLogger``.
          :param launch_tensorboard:      Forwarded to ``BaseSGLogger``.
          :param tensorboard_port:        Forwarded to ``BaseSGLogger``.
          :param save_checkpoints_remote: Forwarded to ``BaseSGLogger``.
          :param save_tensorboard_remote: Forwarded to ``BaseSGLogger``.
          :param save_logs_remote:        Forwarded to ``BaseSGLogger``.
          :param monitor_system:          Forwarded to ``BaseSGLogger``.
          :param model_name:              Name passed to ``DeciClient.register_experiment``. When missing (or empty),
                                          ``None`` is passed and a warning is logged.
          """
          super().__init__(
              project_name=project_name,
              experiment_name=experiment_name,
              storage_location=storage_location,
              resumed=resumed,
              training_params=training_params,
              checkpoints_dir_path=checkpoints_dir_path,
              tb_files_user_prompt=tb_files_user_prompt,
              launch_tensorboard=launch_tensorboard,
              tensorboard_port=tensorboard_port,
              save_checkpoints_remote=save_checkpoints_remote,
              save_tensorboard_remote=save_tensorboard_remote,
              save_logs_remote=save_logs_remote,
              monitor_system=monitor_system,
          )

          if model_name is None:
              logger.warning(
                  "'model_name' parameter not passed. "
                  "The experiment won't be connected to an architecture in the Deci platform. "
                  "To pass a model_name, please use the 'sg_logger_params.model_name' field in the training recipe."
              )

          self.platform_client = DeciClient()
          # An empty model_name is normalized to None, exactly as the explicit conditional did before.
          self.platform_client.register_experiment(name=experiment_name, model_name=model_name or None)
          self.checkpoints_dir_path = checkpoints_dir_path

      @multi_process_safe
      def upload(self):
          """
          Upload both to the destination specified by the user (base behavior), and to Deci platform.

          :raises ValueError: if ``self.checkpoints_dir_path`` is not an existing directory.
          """
          # Upload to the destination specified by the user
          super(DeciPlatformSGLogger, self).upload()

          # Upload to Deci platform
          if not os.path.isdir(self.checkpoints_dir_path):
              raise ValueError("Provided directory does not exist")

          self._upload_latest_file_starting_with(start_with=TENSORBOARD_EVENTS_PREFIX)
          self._upload_latest_file_starting_with(start_with=LOGS_PREFIX)

      @multi_process_safe
      def _upload_latest_file_starting_with(self, start_with: str):
          """
          Upload the most recent file starting with a specific prefix to the Deci platform.

          Candidates are the entries of ``self.checkpoints_dir_path`` whose name starts with ``start_with``;
          the one with the greatest ``os.path.getctime`` is uploaded. If there is no candidate, a warning is
          logged and nothing is uploaded.

          :param start_with: prefix of the file to upload
          """
          candidate_paths = [
              os.path.join(self.checkpoints_dir_path, file_name)
              for file_name in os.listdir(self.checkpoints_dir_path)
              if file_name.startswith(start_with)
          ]

          # max() raises ValueError on an empty sequence; a missing file of one kind must not
          # abort the whole upload (e.g. prevent the logs upload when no tensorboard file exists).
          if not candidate_paths:
              logger.warning(f"No file starting with '{start_with}' found in {self.checkpoints_dir_path}. Nothing was saved to Deci platform.")
              return

          most_recent_file_path = max(candidate_paths, key=os.path.getctime)
          self.platform_client.save_experiment_file(file_path=most_recent_file_path)
          logger.info(f"File saved to Deci platform: {most_recent_file_path}")
Discard
Tip!

Press p to see the previous file, or n to see the next file