1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
|
- """
- Custom algorithms for detecting active subglacial lakes and estimating ice
- volume displacement over time.
- """
- try:
- import cudf as xpd
- except ImportError:
- import pandas as xpd
- import numpy as np
def find_clusters(
    X: xpd.DataFrame,
    eps: float = 3000,
    min_samples: int = 250,
    output_colname: str = "cluster_id",
    **kwargs,
) -> xpd.Series:
    """
    Label each point in a point cloud with a positive integer cluster id
    (1, 2, 3, ...) using Density-Based Spatial Clustering of Applications
    with Noise (DBSCAN). Points belonging to no cluster (noise) are
    labelled NaN.

    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering

     **     **                 111   NN
    ** ** **           11    22   N
    *  ****      -->   1     2222
      **  **            33    22
        ******           333333

    Parameters
    ----------
    X : cudf.DataFrame or pandas.DataFrame
        Table of X, Y, Z points to run the clustering algorithm on.
    eps : float
        Maximum distance between two points for them to be considered part
        of the same neighborhood. Default is 3000 (metres).
    min_samples : int
        Number of points in a neighborhood required for a point to count as
        an important core point (the point itself included). Default is 250
        (sample points).
    output_colname : str
        Name given to the returned Series. Default is 'cluster_id'.
    kwargs : dict
        Extra keyword arguments passed through to `cuml.cluster.DBSCAN` or
        `sklearn.cluster.DBSCAN`.

    Returns
    -------
    cluster_labels : cudf.Series or pd.Series
        Which cluster each datapoint belongs to; noisy samples are NaN.
    """
    # Prefer the GPU implementation when cuml is available
    try:
        from cuml.cluster import DBSCAN
    except ImportError:
        from sklearn.cluster import DBSCAN

    # Cluster with {eps} m neighborhood distance and {min_samples} point minimum
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
    clusterer.fit(X=X)

    # Shift every label by +1 so DBSCAN's noise marker (-1) becomes 0
    shifted = clusterer.labels_ + 1
    if isinstance(shifted, np.ndarray):
        # sklearn hands back a numpy array; wrap it as a nullable-int Series
        shifted = xpd.Series(data=shifted, dtype=xpd.Int32Dtype())

    # Turn the 0 (noise) labels into NaN, align with the input, and name it
    labels = shifted.mask(cond=shifted == 0)
    labels.index = X.index
    labels.name = output_colname
    return labels
def ice_volume_over_time(
    df_elev: xpd.DataFrame,
    surface_area: float,
    rolling_window: "int | str" = "91d",
    elev_col: str = "h_anom",
    time_col: str = "utc_time",
    outfile: "str | None" = None,
) -> xpd.Series:
    r"""
    Generate a time-series of ice volume displacement. Ice volume change
    (dvol in m^3) is estimated by multiplying the lake area (in m^2) by the
    mean height anomaly (dh in m), following the methodology of Kim et al.
    2016 and Siegfried et al., 2016. Think of it as multiplying the circular
    area of a cylinder by its height:

           _~~~~_
         _~      ~_   <--- Lake area (m^2)
        |~__    __~ |        *
        |   ~~~~    | <-- Mean height anomaly (m) from many points inside lake
        |          _|        =
         ~__    __~   <--- Ice volume displacement (m^3)
            ~~~~

    More specifically, this function will:
    1) Take a set of height anomaly and time values and calculate a rolling
       mean of height change over time.
    2) Multiply the height change over time sequence in (1) by the lake
       surface area to obtain an estimate of volume change over time.

    Parameters
    ----------
    df_elev : cudf.DataFrame or pandas.DataFrame
        A table with time and height anomaly columns to run the rolling time
        series calculation on. Ensure that the rows are sorted with time in
        ascending order from oldest to newest.
    surface_area : float
        The ice surface area (of the active lake) experiencing a change in
        height over time. Recommended to provide in unit metres.
    rolling_window : int or str
        Size of the moving window to calculate the rolling mean, given as a
        time period. Default is '91d' (91 days = 1 ICESat-2 cycle).
    elev_col : str
        The elevation anomaly column name to use from the table data, used
        to calculate the rolling mean height anomaly at every time interval.
        Default is 'h_anom', recommended to provide in unit metres.
    time_col : str
        The time-dimension column name to use from the table data, used in
        the rolling mean algorithm. Default is 'utc_time'; ensure that this
        column is provided in a datetime64 format.
    outfile : str or None
        Optional. Filename to output the time-series data, containing
        columns for time, the elevation anomaly (dh in m^2) +/- standard
        deviation, and the ice volume displacement (dvol in km^3) +/-
        standard deviation. Note that this export requires 'pint' units in
        the inputs and the 'uncertainties' package to be installed.

    Returns
    -------
    dvol : cudf.Series or pd.Series
        A column of delta volume changes over time. If pint metre units were
        provided in df_elev, then this output will be given in cubic metres
        with a one standard deviation uncertainty range.

    Examples
    --------
    >>> import pandas as pd
    >>> import pint
    >>> import pint_pandas
    >>> ureg = pint.UnitRegistry()
    >>> pint_pandas.PintType.ureg = ureg
    >>> h_anom = pd.Series(
    ...     data=np.random.RandomState(seed=42).rand(100), dtype="pint[metre]"
    ... )
    >>> utc_time = pd.date_range(
    ...     start="2018-10-14", end="2020-09-30", periods=100
    ... )
    >>> df_elev = pd.DataFrame(data={"h_anom": h_anom, "utc_time": utc_time})
    >>> dvol = ice_volume_over_time(
    ...     df_elev=df_elev, surface_area=123 * ureg.metre ** 2
    ... )
    >>> print(f"{dvol.iloc[len(dvol)//2]:Lx}")
    \SI[]{12+/-35}{\meter\cubed}

    References:
    - Kim, B.-H., Lee, C.-K., Seo, K.-W., Lee, W. S., & Scambos, T. A. (2016).
      Active subglacial lakes and channelized water flow beneath the Kamb Ice
      Stream. The Cryosphere, 10(6), 2971–2980.
      https://doi.org/10.5194/tc-10-2971-2016
    - Siegfried, M. R., Fricker, H. A., Carter, S. P., & Tulaczyk, S. (2016).
      Episodic ice velocity fluctuations triggered by a subglacial flood in
      West Antarctica. Geophysical Research Letters, 43(6), 2640–2648.
      https://doi.org/10.1002/2016GL067758
    """
    # Work on a copy of just the elevation anomaly and time columns
    df_: xpd.DataFrame = df_elev[[elev_col, time_col]].copy()

    # Detect if pint units are being used
    elev_dtype = df_elev[elev_col].dtype
    has_pint: bool = "pint" in str(elev_dtype)
    if has_pint:
        # surface_area is a pint.Quantity here; keep its registry for later
        ureg = surface_area._REGISTRY

    # Calculate rolling mean of elevation
    df_roll = df_.rolling(window=rolling_window, on=time_col, min_periods=1)
    elev_mean = df_roll[elev_col].mean()
    # Elevation anomaly = elevation at time=n minus elevation at time=1.
    # Use positional .iloc[0] (not label-based [0]) so this also works when
    # the input index does not contain the label 0, e.g. a sliced DataFrame.
    elev_anom = elev_mean - elev_mean.iloc[0]

    # If pint units are used, add standard deviation uncertainties to mean.
    # TO-DO refactor after pint.Measurements overhaul in
    # https://github.com/hgrecco/pint/issues/350 is completed so that
    # elev_anom can be kept as a pandas.Series instead of having to convert
    # to np.ndarray
    if has_pint:
        import uncertainties.unumpy

        elev_std = df_roll[elev_col].std()
        elev_anom = uncertainties.unumpy.uarray(
            nominal_values=elev_anom, std_devs=elev_std
        ) * ureg.Unit(elev_dtype.units)

    # Calculate ice volume displacement (m^3) = area (m^2) x height (m)
    dvol = surface_area * elev_anom
    # Re-wrap as the same Series class (cudf or pandas) as the input column;
    # the f-string branch is only evaluated when pint units are present
    ice_dvol = df_[elev_col].__class__(
        data=dvol,
        dtype=f"pint[{dvol.units}]" if has_pint else df_elev[elev_col].dtype,
        index=df_.index,
    )

    # Convert dvol from m**3 to km**3 and save to a tab-separated text file
    # (export only happens when pint units were provided)
    if outfile and has_pint:
        dvol_km3 = ice_dvol.pint.to("kilometre ** 3").pint.magnitude
        df_dvol = df_.__class__(
            data={
                time_col: df_elev[time_col],
                "dh": uncertainties.unumpy.nominal_values(arr=elev_anom),
                "dh_std": uncertainties.unumpy.std_devs(arr=elev_anom),
                "dvol_km3": uncertainties.unumpy.nominal_values(arr=dvol_km3),
                "dvol_std": uncertainties.unumpy.std_devs(arr=dvol_km3),
            }
        )
        df_dvol.to_csv(
            path_or_buf=outfile,
            sep="\t",
            index=False,
            na_rep="NaN",
            date_format="%Y-%m-%dT%H:%M:%S.%fZ",
        )

    return ice_dvol
|