leo
/
pv-current


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

from coef_matrix import classify

def main():
    """Classify PV current readings per (day, station, lev1, lev2) group.

    Steps:
      1. Start a local Dask cluster with 20 workers and connect a client.
      2. Read six columns of ``./hbpv10days.csv`` (no header row; names are
         supplied here) and drop every row that has a missing value.
      3. Keep only rows with ``0 <= current < 10``.
      4. Build the group key ``cid`` by concatenating the calendar day of
         ``time`` with ``station``, ``lev1`` and ``lev2``.
      5. Apply ``classify`` (imported from ``coef_matrix``) to each ``cid``
         group; each group is passed the ``time``, ``strno`` and ``current``
         columns.
      6. Write the result to ``res-*.csv``; Dask substitutes ``*`` with the
         partition number.

    The cluster and client are closed when the function returns or raises.
    """
    # Context managers guarantee the worker processes and the scheduler
    # connection are torn down even if a step below fails.
    with LocalCluster(n_workers=20) as cluster, Client(cluster) as cli:
        raw = dd.read_csv(
            './hbpv10days.csv',
            usecols=[0, 1, 2, 3, 4, 6],
            names=['time', 'station', 'lev1', 'lev2', 'strno', 'current'],
            dtype={'lev1': str, 'lev2': str, 'strno': str},
            parse_dates=['time'],
        ).dropna()

        # Keep the parsed, NA-free data in cluster memory so the later
        # stages do not re-read and re-parse the CSV.
        raw = cli.persist(raw)

        in_range = raw[(raw['current'] >= 0) & (raw['current'] < 10)]

        # 'YYYY-MM-DD' text of the timestamp's date; same string that
        # str(ts.date()) yields, but computed through the datetime accessor
        # instead of a per-row Python lambda.
        day = in_range['time'].dt.strftime('%Y-%m-%d')

        # NOTE(review): `station` has no explicit dtype above; this
        # concatenation assumes it is read as text -- confirm against the data.
        cid = day + in_range['station'] + in_range['lev1'] + in_range['lev2']

        # Drop the key's source columns *before* set_index so they are not
        # carried through the shuffle; the resulting frame is the same.
        keyed = in_range.assign(cid=cid).drop(
            ['station', 'lev1', 'lev2'], axis=1
        )
        indexed = keyed.set_index('cid')

        # `meta` declares the output schema to Dask: one float64 column named
        # 'coef'. NOTE(review): classify's actual return shape is not visible
        # in this file -- verify it matches this meta.
        df = indexed.groupby(indexed.index).apply(
            classify, meta={'coef': 'float64'}
        )

        # Must run inside the `with` block: this is what triggers the
        # computation, and it needs the cluster to still be alive.
        df.to_csv('res-*.csv')


# Required for scripts that start a LocalCluster: worker processes re-import
# the main module, and without this guard they would re-run the pipeline.
if __name__ == '__main__':
    main()