1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
|
- """
- Custom algorithms for detecting active subglacial lakes and estimating ice
- volume displacement over time.
- """
- try:
- import cudf as xpd
- except ImportError:
- import pandas as xpd
- import numpy as np
def find_clusters(
    X: xpd.DataFrame,
    eps: float = 3000,
    min_samples: int = 250,
    output_colname: str = "cluster_id",
    **kwargs,
) -> xpd.Series:
    """
    Label each point in a point cloud with a positive integer cluster id
    (1, 2, 3, ...) using Density-Based Spatial Clustering of Applications
    with Noise (DBSCAN). Points belonging to no cluster (noise) are
    labelled NaN.

    See also https://www.naftaliharris.com/blog/visualizing-dbscan-clustering

     **     **                 111   NN
    ** ** **           11    22   N
    *  ****      -->   1     2222
      **  **            33    22
        ******           333333

    Parameters
    ----------
    X : cudf.DataFrame or pandas.DataFrame
        Table of X, Y, Z points to run the clustering algorithm on.
    eps : float
        Maximum distance between two points for them to be considered part
        of the same neighborhood. Default is 3000 (metres).
    min_samples : int
        Number of points in a neighborhood required for a point to count as
        an important core point (the point itself included). Default is 250
        (sample points).
    output_colname : str
        Name given to the returned Series. Default is 'cluster_id'.
    kwargs : dict
        Extra keyword arguments passed through to `cuml.cluster.DBSCAN` or
        `sklearn.cluster.DBSCAN`.

    Returns
    -------
    cluster_labels : cudf.Series or pd.Series
        Which cluster each datapoint belongs to; noisy samples are NaN.
    """
    # Prefer the GPU implementation when cuml is available
    try:
        from cuml.cluster import DBSCAN
    except ImportError:
        from sklearn.cluster import DBSCAN

    # Cluster with {eps} m neighborhood distance and {min_samples} point minimum
    clusterer = DBSCAN(eps=eps, min_samples=min_samples, **kwargs)
    clusterer.fit(X=X)

    # Shift every label by +1 so DBSCAN's noise marker (-1) becomes 0
    shifted = clusterer.labels_ + 1
    if isinstance(shifted, np.ndarray):
        # sklearn hands back a numpy array; wrap it as a nullable-int Series
        shifted = xpd.Series(data=shifted, dtype=xpd.Int32Dtype())

    # Turn the 0 (noise) labels into NaN, align with the input, and name it
    labels = shifted.mask(cond=shifted == 0)
    labels.index = X.index
    labels.name = output_colname
    return labels
def ice_volume_over_time(
    df_elev: xpd.DataFrame,
    surface_area: float,
    rolling_window: "int | str" = "91d",
    elev_col: str = "h_anom",
    time_col: str = "utc_time",
    outfile: "str | None" = None,
) -> xpd.Series:
    r"""
    Generate a time-series of ice volume displacement. Ice volume change
    (dvol in m^3) is estimated by multiplying the lake area (in m^2) by the
    mean height anomaly (dh in m), following the methodology of Kim et al.
    2016 and Siegfried et al., 2016. Think of it as multiplying the circular
    area of a cylinder by its height:

           _~~~~_
         _~      ~_   <--- Lake area (m^2)
        |~__    __~ |        *
        |   ~~~~    | <-- Mean height anomaly (m) from many points inside lake
        |          _|        =
         ~__    __~   <--- Ice volume displacement (m^3)
            ~~~~

    More specifically, this function will:
    1) Take a set of height anomaly and time values and calculate a rolling
       mean of height change over time.
    2) Multiply the height change over time sequence in (1) by the lake
       surface area to obtain an estimate of volume change over time.

    Parameters
    ----------
    df_elev : cudf.DataFrame or pandas.DataFrame
        A table with time and height anomaly columns to run the rolling time
        series calculation on. Ensure that the rows are sorted with time in
        ascending order from oldest to newest.
    surface_area : float
        The ice surface area (of the active lake) experiencing a change in
        height over time. Recommended to provide in unit metres.
    rolling_window : int or str
        Size of the moving window to calculate the rolling mean, given as a
        time period. Default is '91d' (91 days = 1 ICESat-2 cycle).
    elev_col : str
        The elevation anomaly column name to use from the table data, used
        to calculate the rolling mean height anomaly at every time interval.
        Default is 'h_anom', recommended to provide in unit metres.
    time_col : str
        The time-dimension column name to use from the table data, used in
        the rolling mean algorithm. Default is 'utc_time'; ensure that this
        column is provided in a datetime64 format.
    outfile : str or None
        Optional. Filename to output the time-series data, containing
        columns for time, the elevation anomaly (dh in m^2) +/- standard
        deviation, and the ice volume displacement (dvol in km^3) +/-
        standard deviation. Note that this export requires 'pint' units in
        the inputs and the 'uncertainties' package to be installed.

    Returns
    -------
    dvol : cudf.Series or pd.Series
        A column of delta volume changes over time. If pint metre units were
        provided in df_elev, then this output will be given in cubic metres
        with a one standard deviation uncertainty range.

    Examples
    --------
    >>> import pandas as pd
    >>> import pint
    >>> import pint_pandas
    >>> ureg = pint.UnitRegistry()
    >>> pint_pandas.PintType.ureg = ureg
    >>> h_anom = pd.Series(
    ...     data=np.random.RandomState(seed=42).rand(100), dtype="pint[metre]"
    ... )
    >>> utc_time = pd.date_range(
    ...     start="2018-10-14", end="2020-09-30", periods=100
    ... )
    >>> df_elev = pd.DataFrame(data={"h_anom": h_anom, "utc_time": utc_time})
    >>> dvol = ice_volume_over_time(
    ...     df_elev=df_elev, surface_area=123 * ureg.metre ** 2
    ... )
    >>> print(f"{dvol.iloc[len(dvol)//2]:Lx}")
    \SI[]{12+/-35}{\meter\cubed}

    References:
    - Kim, B.-H., Lee, C.-K., Seo, K.-W., Lee, W. S., & Scambos, T. A. (2016).
      Active subglacial lakes and channelized water flow beneath the Kamb Ice
      Stream. The Cryosphere, 10(6), 2971–2980.
      https://doi.org/10.5194/tc-10-2971-2016
    - Siegfried, M. R., Fricker, H. A., Carter, S. P., & Tulaczyk, S. (2016).
      Episodic ice velocity fluctuations triggered by a subglacial flood in
      West Antarctica. Geophysical Research Letters, 43(6), 2640–2648.
      https://doi.org/10.1002/2016GL067758
    """
    # Work on a copy of just the elevation anomaly and time columns
    df_: xpd.DataFrame = df_elev[[elev_col, time_col]].copy()

    # Detect if pint units are being used
    elev_dtype = df_elev[elev_col].dtype
    has_pint: bool = "pint" in str(elev_dtype)
    if has_pint:
        # surface_area is a pint.Quantity here; keep its registry for later
        ureg = surface_area._REGISTRY

    # Calculate rolling mean of elevation
    df_roll = df_.rolling(window=rolling_window, on=time_col, min_periods=1)
    elev_mean = df_roll[elev_col].mean()
    # Elevation anomaly = elevation at time=n minus elevation at time=1.
    # Use positional .iloc[0] (not label-based [0]) so this also works when
    # the input index does not contain the label 0, e.g. a sliced DataFrame.
    elev_anom = elev_mean - elev_mean.iloc[0]

    # If pint units are used, add standard deviation uncertainties to mean.
    # TO-DO refactor after pint.Measurements overhaul in
    # https://github.com/hgrecco/pint/issues/350 is completed so that
    # elev_anom can be kept as a pandas.Series instead of having to convert
    # to np.ndarray
    if has_pint:
        import uncertainties.unumpy

        elev_std = df_roll[elev_col].std()
        elev_anom = uncertainties.unumpy.uarray(
            nominal_values=elev_anom, std_devs=elev_std
        ) * ureg.Unit(elev_dtype.units)

    # Calculate ice volume displacement (m^3) = area (m^2) x height (m)
    dvol = surface_area * elev_anom
    # Re-wrap as the same Series class (cudf or pandas) as the input column;
    # the f-string branch is only evaluated when pint units are present
    ice_dvol = df_[elev_col].__class__(
        data=dvol,
        dtype=f"pint[{dvol.units}]" if has_pint else df_elev[elev_col].dtype,
        index=df_.index,
    )

    # Convert dvol from m**3 to km**3 and save to a tab-separated text file
    # (export only happens when pint units were provided)
    if outfile and has_pint:
        dvol_km3 = ice_dvol.pint.to("kilometre ** 3").pint.magnitude
        df_dvol = df_.__class__(
            data={
                time_col: df_elev[time_col],
                "dh": uncertainties.unumpy.nominal_values(arr=elev_anom),
                "dh_std": uncertainties.unumpy.std_devs(arr=elev_anom),
                "dvol_km3": uncertainties.unumpy.nominal_values(arr=dvol_km3),
                "dvol_std": uncertainties.unumpy.std_devs(arr=dvol_km3),
            }
        )
        df_dvol.to_csv(
            path_or_buf=outfile,
            sep="\t",
            index=False,
            na_rep="NaN",
            date_format="%Y-%m-%dT%H:%M:%S.%fZ",
        )

    return ice_dvol
|