leo
/
pv-current


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
128

	
129

	
130

	
131

	
132

	
133

	
134

	
135

	
136

	
137

	
138

	
139

	
140

	
141

	
142

	
143

	
144

	
145

	
146

	
147

	
148

	
149

	
150

	
151

	
152

	
153

	
154

	
155

	
156

	
157

	
158

	
159

	
160

	
161

	
162

	
163

	
164

	
165

	
166

	
167

	
168

	
169

	
170

	
171

	
172

	
173

	
174

	
175

	
176

	
177

	
178

	
179

	
180

	
181

	
182

	
183

	
184

	
185

	
186

	
187

	
188

	
189

	
190

	
191

	
192

	
193

	
194

	
195

	
196

	
197

	
198

	
            # Created by: leo
# Created on: 2018.10.9

import pandas as pd
import numpy as np
from typing import Tuple, Dict
from sklearn.cluster import KMeans
import logging
from IPython import embed

# Sentinel "coefficients". Real Pearson coefficients live in [-1, 1], so every
# value below -1 can be used to encode an abnormal condition.
CONST_ZERO = -1.1     # the string's output current is constantly zero
CONST_NONZERO = -1.5  # the string's output current is a non-zero constant
LOW_CURRENT = -2.5    # low-current anomaly
OUTLIER = -3          # returned when the input is empty or has too few valid samples

# Bin edges separating the status intervals (ascending). A tuple is used so the
# value can safely serve as a default function argument (immutable).
BINS: Tuple[float, ...] = (-100, OUTLIER, LOW_CURRENT, CONST_NONZERO, CONST_ZERO,
                           0, 0.5, 1)

# Status names, one per interval between consecutive BINS edges
# (so there is exactly one label fewer than there are edges).
LABELS: Tuple[str, ...] = ('outlier', 'low_current', 'nonzero_constant', 'all_zeros',
                           'warn', 'watch', 'normal')

# Column / index names expected in the input tables.
TIME_HEADER = 'time'
CLUSTER_HEADER = 'strno'
CURRENT_HEADER = 'current'


def coef_maxcur(inp: pd.DataFrame, min_sample_no: int = 2) -> pd.Series:
    """
    Compute the max-current correlation coefficients of all strings in a window.

    The string that holds the largest current reading in the window is taken
    as the reference, and every string is correlated against it with
    ``DataFrame.corrwith``.

    :param inp: either a long table that contains the ``CLUSTER_HEADER`` column
        (together with the time and current values), or an already pivoted
        wide table with one column per string ID
    :param min_sample_no: minimum number of valid observations; if fewer rows
        remain after cleaning, the sentinel result is returned
    :returns: coefficients indexed by string ID. Strings whose coefficient is
        undefined (NaN) are replaced by ``CONST_ZERO`` or ``CONST_NONZERO``.
        For empty or too-short input a one-element Series ``[OUTLIER]`` is
        returned instead.
    """

    if len(inp) == 0:
        return pd.Series([OUTLIER])

    # A long table is pivoted into a wide one (string IDs become the columns);
    # timestamps for which some string has no reading are dropped.
    if CLUSTER_HEADER in inp.columns:
        data = pd.pivot_table(inp, values=CURRENT_HEADER, index=[TIME_HEADER],
                              columns=[CLUSTER_HEADER]).dropna()
        data.index = pd.to_datetime(data.index)
    else:
        data = inp

    # Not enough valid samples left: report the sentinel series.
    if len(data) < min_sample_no:
        return pd.Series([OUTLIER])

    # DataFrame.max skips NaN, unlike the builtin max() whose result depends on
    # where a NaN sits in the column, so the reference string is chosen
    # reliably even when a wide-table input has gaps.
    max_current_id = data.max().idxmax()
    cors = data.corrwith(data[max_current_id])

    # The variance of a constant vector is 0, so its Pearson coefficient
    # (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Definition)
    # is NaN (zero in the denominator). An all-zero vector means the panel
    # produces no current; a non-zero constant means the sensor is stuck.
    # NOTE(review): if the reference string itself is constant, every
    # coefficient is NaN and non-constant strings also end up as CONST_NONZERO.
    for idx in cors.index[cors.isna()]:
        # skipna=False keeps a NaN total (which then compares False below),
        # i.e. a column with gaps is reported as CONST_NONZERO.
        total = data[idx].abs().sum(skipna=False)
        cors[idx] = CONST_ZERO if total < 1e-7 else CONST_NONZERO
    return cors


def string_coef_maxcur(inp, str_id: int) -> float:
    """
    Return the max-current correlation coefficient of one string in a window.

    :param inp: table accepted by ``coef_maxcur`` (time, string ID and current
        values) covering the window of interest
    :param str_id: ID of the string whose coefficient is wanted
    :return: the coefficient of ``str_id`` against the max-current string, or
        the single sentinel value when ``coef_maxcur`` could not compute
        per-string coefficients
    :raises KeyError: if ``str_id`` is absent from a multi-string result
    """
    coefs = coef_maxcur(inp)
    # Prefer a label lookup: a window holding a single string also yields a
    # length-1 result, and ``coefs[0]`` would then be a (failing) label lookup.
    if str_id in coefs.index:
        return coefs.loc[str_id]
    if len(coefs) == 1:
        # One-element sentinel series; fetch its only value by position.
        return coefs.iloc[0]
    return coefs[str_id]


def max_current_monthly(inp: pd.DataFrame) -> pd.Series:
    """
    Return the largest current value of every calendar month in ``inp``.

    The caller's frame is left untouched: the datetime conversion of the index
    is applied to a copy of the current column only.

    :param inp: table whose index holds the timestamps (anything
        ``pd.to_datetime`` accepts) and which contains the ``CURRENT_HEADER``
        column
    :return: Series of monthly maxima, indexed by the month bins produced by
        ``pd.Grouper(freq='M')``
    """
    currents = inp[CURRENT_HEADER].copy()
    currents.index = pd.to_datetime(inp.index)
    return currents.groupby(pd.Grouper(freq='M')).max()


def cur_month_max(inp: pd.DataFrame, max_cur_table: pd.Series, thr: float) -> float:
    """
    Return the lowest plausible peak current for the month ``inp`` belongs to.

    :param inp: data of one time slice (e.g. one day of currents for all
        strings), indexed by timestamp
    :param max_cur_table: monthly maximum currents, indexed by datetime
    :param thr: fraction of the monthly maximum used as the threshold
    :return: monthly maximum times ``thr``; ``np.inf`` when ``inp`` is empty,
        so that no reading can exceed the threshold
    """
    if not inp.empty:
        # The month is taken from the first valid timestamp of the slice and
        # looked up in the table via partial string indexing ('YYYY-MM').
        month_key = inp.first_valid_index().strftime('%Y-%m')
        month_peak = max_cur_table[month_key].values[0]
        return month_peak * thr
    return np.inf


def get_thr_from_coefs(coef_table: pd.DataFrame, ngrp: int = 3) -> np.ndarray:
    """
    Derive the separating thresholds from the strings' correlation coefficients.

    Procedure:

    1. Discard coefficients that are not greater than -1 (sentinel values).
    2. Cluster the remaining distinct values with k-means.
    3. With the clusters ordered by value, take each cluster's lower bound
       minus 20% of that cluster's width as a candidate threshold, compare it
       with the upper bound of the next lower cluster and keep the larger one.

    :param coef_table: per-string coefficients over the whole time range
        (string ID as key/column, one coefficient per day)
    :param ngrp: number of clusters; the default 3 stands for
        normal / watch / warn
    :return: the ``ngrp - 1`` separation points between adjacent clusters, in
        ascending order (a column array, because the bounds come from a
        one-column frame)
    """
    distinct = np.unique(coef_table)
    # Values <= -1 are sentinels and must not take part in the clustering.
    valid = np.extract(distinct > -1, distinct)
    samples = np.array(valid).reshape(-1, 1)

    # A fixed random_state keeps the cluster IDs reproducible between runs.
    model = KMeans(n_clusters=ngrp, random_state=1)
    model.fit(samples)
    logging.debug('Cluster centroid positions:\n%s' % np.sort(model.cluster_centers_, axis=0))

    assignments = pd.DataFrame({'id': model.predict(samples),
                                'coefs': np.array(valid)})
    by_cluster = assignments.groupby('id')

    # Sorting puts the per-cluster extremes in ascending cluster order.
    upper_bounds = np.sort(by_cluster.max(), axis=0)
    lower_bounds = np.sort(by_cluster.min(), axis=0)
    widths = upper_bounds - lower_bounds
    thr_cands = lower_bounds - widths * 0.2
    logging.debug('cluster upper bounds:\n%s' % upper_bounds)
    logging.debug('cluster lower bounds:\n%s' % lower_bounds)

    # Pair every cluster's candidate with the upper bound of the cluster below.
    return np.maximum(upper_bounds[:-1], thr_cands[1:])


def calc_maxcur_coefs(inp: pd.DataFrame, thr: float) -> pd.DataFrame:
    """
    Compute the daily max-current coefficients of every string in ``inp``.

    The result is the input of the subsequent clustering step.

    :param inp: currents of all strings of one combiner box over several
        months (timestamp, string ID, current value)
    :param thr: fraction of the monthly maximum current; days whose peak
        current does not exceed it are excluded
    :return: DataFrame indexed by day with one column per string ID
    """
    monthly_peaks = max_current_monthly(inp)

    # Long -> wide (one column per string), keeping complete timestamps only.
    wide = pd.pivot_table(inp, values=CURRENT_HEADER, index=[TIME_HEADER],
                          columns=[CLUSTER_HEADER]).dropna()
    wide.index = pd.to_datetime(wide.index)

    def _exceeds_monthly_floor(day: pd.DataFrame) -> bool:
        # Keep a day only if its peak current is above the month's threshold.
        return max(day.max()) > cur_month_max(day, monthly_peaks, thr)

    # Training does not need to tell low-current, all-zero and empty data
    # apart; all of them are simply removed as invalid. Consequently a value
    # marked OUTLIER during training is not necessarily empty data, it may
    # also stem from a low-current or all-zero day.
    kept = wide.groupby(pd.Grouper(freq='D')).filter(_exceeds_monthly_floor)
    per_day = kept.groupby(pd.Grouper(freq='D'))

    coef_tbl = {}
    for str_id in kept.columns:
        coef_tbl[str_id] = per_day.apply(string_coef_maxcur, str_id)
    return pd.DataFrame(coef_tbl)


def train_thresholds(inp: pd.DataFrame, thr: float) -> np.ndarray:
    """
    Training entry point of the PV fault classifier: compute the thresholds.

    :param inp: currents of all strings of one combiner box over several
        months (timestamp, string ID, current value)
    :param thr: fraction of the monthly maximum current; data below it is
        treated as low current
    :return: the thresholds separating the normal, watch and warn groups
    """
    # Daily coefficients first, then cluster them into separation points.
    return get_thr_from_coefs(calc_maxcur_coefs(inp, thr))


def classifier(inp: pd.DataFrame, lowest_peak_current: float,
               borders: Tuple[float, ...] = BINS,
               status_names: Tuple[str, ...] = LABELS) -> pd.Series:
    """
    Classify the status of every string in the input data.

    :param inp: current data with timestamp, string ID and current value
    :param lowest_peak_current: lowest plausible peak current; if the peak of
        ``inp`` is below it, every string is reported as ``"low_current"``
    :param borders: ascending bin edges of the status intervals
    :param status_names: status names; must contain exactly one element fewer
        than ``borders``
    :return: status per string. The low-current shortcut returns plain strings
        indexed by the unique string IDs, otherwise the categorical result of
        ``pd.cut`` is returned.
    """
    peak_current = inp[CURRENT_HEADER].max()
    logging.debug('peak current: %s', peak_current)

    if peak_current < lowest_peak_current:
        return pd.Series("low_current", index=inp[CLUSTER_HEADER].unique())

    coefs = coef_maxcur(inp)
    logging.debug('coefs:\n%s', coefs)
    # A Pearson coefficient cannot exceed 1, but floating-point rounding in
    # corrwith can yield e.g. 1.0000000000000002 (typically for the reference
    # string against itself). Such a value would fall outside the top bin and
    # come back as NaN, so it is clipped. The negative sentinels are unaffected.
    return pd.cut(coefs.clip(upper=1.0), bins=borders, labels=status_names)