1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
- import json
- import os
- import random
- from typing import Any, Dict, List
- import numpy as np
- import torch
- from ray.data import DatasetContext
- from ray.train.torch import get_device
- from madewithml.config import mlflow
# Turn on order preservation in the current Ray Data context's execution options.
_dataset_context = DatasetContext.get_current()
_dataset_context.execution_options.preserve_order = True
def set_seeds(seed: int = 42):
    """Set seeds for reproducibility.

    Seeds NumPy, Python's ``random`` module and PyTorch (via both
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``), switches cuDNN to
    deterministic mode with benchmarking disabled, and stores the seed in the
    ``PYTHONHASHSEED`` environment variable.

    Args:
        seed (int, optional): seed value applied to every generator. Defaults to 42.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Plain attribute assignment; routing constant strings through eval() adds
    # nothing and hides the assignment from linters and type checkers.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Environment values must be strings.
    os.environ["PYTHONHASHSEED"] = str(seed)
def load_dict(path: str) -> Dict:
    """Load a dictionary from a JSON file.

    Args:
        path (str): location of the JSON file.

    Returns:
        Dict: loaded JSON data.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, encoding="utf-8") as fp:
        return json.load(fp)
def save_dict(d: Dict, path: str, cls: Any = None, sortkeys: bool = False) -> None:
    """Save a dictionary as JSON to a specific location.

    Missing parent directories of ``path`` are created. The file is written
    with a two-space indent and a trailing newline.

    Args:
        d (Dict): data to save.
        path (str): location of where to save the data.
        cls (optional): encoder to use on dict data. Defaults to None.
        sortkeys (bool, optional): whether to sort keys alphabetically. Defaults to False.
    """
    directory = os.path.dirname(path)
    # A bare filename has no directory part, and makedirs("") would raise.
    if directory:  # pragma: no cover
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(d, indent=2, fp=fp, cls=cls, sort_keys=sortkeys)
        fp.write("\n")
def pad_array(arr: np.ndarray, dtype=np.int32) -> np.ndarray:
    """Pad a 2D (ragged) array with zeros until all rows are the
    same length as the longest row.

    Args:
        arr (np.array): input array, i.e. a sequence of rows of possibly
            different lengths. A plain list of rows is accepted as well.
        dtype (optional): dtype of the returned array. Defaults to np.int32.

    Returns:
        np.array: zero padded array of shape (number of rows, longest row
            length). An empty input produces an array of shape (0, 0).
    """
    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(row) for row in arr), default=0)
    # len() instead of .shape[0] so that lists of rows work too.
    padded_arr = np.zeros((len(arr), max_len), dtype=dtype)
    for i, row in enumerate(arr):
        padded_arr[i, : len(row)] = row
    return padded_arr
def collate_fn(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]:  # pragma: no cover, air internal
    """Convert a batch of numpy arrays to tensors (with appropriate padding).

    The "ids" and "masks" entries are zero padded with `pad_array` before
    conversion. Every key in the batch must be one of "ids", "masks" or
    "targets"; any other key raises a KeyError during the dtype lookup.

    Args:
        batch (Dict[str, np.ndarray]): input batch as a dictionary of numpy arrays.
            It is left unmodified.

    Returns:
        Dict[str, torch.Tensor]: output batch as a dictionary of tensors, in the
            same key order as the input.
    """
    # Shallow copy so the caller's dict is not overwritten with padded arrays;
    # re-assigning existing keys keeps the original key order.
    padded = dict(batch)
    padded["ids"] = pad_array(batch["ids"])
    padded["masks"] = pad_array(batch["masks"])
    dtypes = {"ids": torch.int32, "masks": torch.int32, "targets": torch.int64}
    # Loop-invariant: resolve the target device once, not once per key.
    device = get_device()
    return {key: torch.as_tensor(array, dtype=dtypes[key], device=device) for key, array in padded.items()}
def get_run_id(experiment_name: str, trial_id: str) -> str:  # pragma: no cover, mlflow functionality
    """Look up the MLflow run ID that belongs to a specific Ray trial ID.

    Runs are searched by their `trial_name` tag, which is expected to be
    "TorchTrainer_" followed by the trial id. The first match is used.

    Args:
        experiment_name (str): name of the experiment to search in.
        trial_id (str): id of the trial.

    Returns:
        str: run id of the trial.
    """
    tag_value = f"TorchTrainer_{trial_id}"
    tag_filter = f"tags.trial_name = '{tag_value}'"
    matching_runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string=tag_filter)
    # .iloc[0] takes the first returned row; it raises if nothing matched.
    first_run = matching_runs.iloc[0]
    return first_run.run_id
def dict_to_list(data: Dict, keys: List[str]) -> List[Dict[str, Any]]:
    """Convert a dictionary of columns to a list of row dictionaries.

    The number of rows is taken from the column of the first key; each row
    holds the i-th item of every requested column.

    Args:
        data (Dict): input dictionary mapping each key to an indexable column.
        keys (List[str]): keys to include in the output list of dictionaries.

    Returns:
        List[Dict[str, Any]]: output list of dictionaries. Empty when `keys`
            is empty.
    """
    # Without any key there is no column to take the row count from.
    if not keys:
        return []
    # Index by the first column's length (rather than zip) so a shorter
    # sibling column still raises instead of being silently truncated.
    num_rows = len(data[keys[0]])
    return [{key: data[key][i] for key in keys} for i in range(num_rows)]
|