Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

test_vram.yml 3.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
  1. task: 41
  2. runpod:
  3. entry: |
  4. bash -c "curl -H 'Cache-Control: no-cache' https://raw.githubusercontent.com/utensil/llm-playground/main/scripts/entry/ax_lite_train.sh -sSf | bash"
  5. # "NVIDIA RTX A5000" # "NVIDIA RTX A6000" "NVIDIA GeForce RTX 4090" "NVIDIA RTX 6000 Ada Generation" "NVIDIA A100-SXM4-80GB" "NVIDIA A100 80GB PCIe"
  6. gpu: "NVIDIA RTX A6000"
  7. # pod_type: INTERRUPTABLE
  8. cloud_type: "ALL" # "ALL" "COMMUNITY" "SECURE"
  9. max_bid_per_gpu: 2.0
  10. # template_id: 758uq6u5fc
  11. gpu_count: 1
  12. container_disk_in_gb: 50
  13. volume_in_gb: 100
  14. min_vcpu_count: 8
  15. min_memory_in_gb: 29
  16. # min_download: 2000
  17. # min_upload: 1500
  18. stop_after: 3600
  19. terminate_after: -1
  20. debug: false
  21. # Set to false to stay running after training
  22. one_shot: true
  23. log_eval: true
  24. env:
  25. CUDA_LAUNCH_BLOCKING: 1
  26. TEST_ENV: happy
  27. # deepspeed: true
  28. # 1b: tiiuae/falcon-rw-1b
  29. # 7b: tiiuae/falcon-7b
  30. # 40b: tiiuae/falcon-40b
  31. # base_model: /content/llm-playground/models/tiiuae_falcon-40b
  32. # base_model_config: /content/llm-playground/models/tiiuae_falcon-40b
  33. base_model: tiiuae/falcon-rw-1b
  34. base_model_config: tiiuae/falcon-rw-1b
  35. # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
  36. trust_remote_code: true
  37. model_type: AutoModelForCausalLM
  38. tokenizer_type: AutoTokenizer
  39. load_in_8bit: false
  40. load_in_4bit: false
  41. gptq: false
  42. strict: false
  43. push_dataset_to_hub: utensil
  44. hf_use_auth_token: true
  45. datasets:
  46. - path: QingyiSi/Alpaca-CoT
  47. data_files:
  48. - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
  49. type: alpaca:chat
  50. dataset_prepared_path: last_run_prepared
  51. val_set_size: 0.01
  52. adapter:
  53. lora_model_dir:
  54. sequence_len: 512
  55. max_packed_sequence_len: 512
  56. # hyperparameters from QLoRA paper Appendix B.2
  57. # "We find hyperparameters to be largely robust across datasets"
  58. lora_r: 64
  59. lora_alpha: 16
  60. # 0.1 for models up to 13B
  61. # 0.05 for 33B and 65B models
  62. lora_dropout: 0.05
  63. # add LoRA modules on all linear layers of the base model
  64. lora_target_modules:
  65. lora_target_linear: true
  66. lora_fan_in_fan_out:
  67. wandb_project: falcon-qlora-runner
  68. wandb_watch:
  69. wandb_run_id:
  70. wandb_log_model:
  71. # output_dir: /content/axolotl-trained/falcon-qlora-40b-minotaur/
  72. output_dir: ./qlora-out
  73. # QLoRA paper Table 9
  74. # - 16 for 7b & 13b
  75. # - 32 for 33b, 64 for 64b
  76. # Max size tested on A6000
  77. # - 7b: 40
  78. # - 40b: 4
  79. # decrease if OOM, increase for max VRAM utilization
  80. micro_batch_size: 1
  81. gradient_accumulation_steps: 1
  82. num_epochs: 0.1
  83. # Optimizer for QLoRA
  84. # optimizer: paged_adamw_32bit
  85. # optimizer: adamw_bnb_8bit
  86. # https://github.com/huggingface/transformers/pull/23217
  87. optimizer: adamw_torch
  88. torchdistx_path:
  89. lr_scheduler: cosine
  90. # QLoRA paper Table 9
  91. # - 2e-4 for 7b & 13b
  92. # - 1e-4 for 33b & 64b
  93. learning_rate: 0.0002
  94. train_on_inputs: false
  95. group_by_length: false
  96. bf16: false
  97. fp16: false
  98. tf32: true
  99. gradient_checkpointing: false
  100. # stop training after this many evaluation losses have increased in a row
  101. # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
  102. # early_stopping_patience: 3
  103. resume_from_checkpoint:
  104. auto_resume_from_checkpoints: true
  105. local_rank:
  106. logging_steps: 1
  107. xformers_attention:
  108. flash_attention:
  109. gptq_groupsize:
  110. gptq_model_v1:
  111. warmup_steps: 10
  112. eval_steps: 5
  113. save_steps: 20
  114. debug:
  115. # deepspeed:
  116. weight_decay: 0.01
  117. adam_beta1:
  118. adam_beta2: 0.999
  119. adam_epsilon:
  120. # Gradient clipping max norm
  121. max_grad_norm: 0.3
  122. fsdp:
  123. fsdp_config:
  124. special_tokens:
  125. pad_token: "<|endoftext|>"
  126. bos_token: ">>ABSTRACT<<"
  127. eos_token: "<|endoftext|>"
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...