Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

metrics.py 2.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  1. from __future__ import division
  2. __author__ = 'Victor Ruiz, vmr11@pitt.edu'
  3. from math import log
  4. import pandas as pd
  5. def entropy(data_classes, base=2):
  6. '''
  7. Computes the entropy of a set of labels (class instantiations)
  8. :param base: logarithm base for computation
  9. :param data_classes: Series with labels of examples in a dataset
  10. :return: value of entropy
  11. '''
  12. if not isinstance(data_classes, pd.core.series.Series):
  13. raise AttributeError('input array should be a pandas series')
  14. classes = data_classes.unique()
  15. N = len(data_classes)
  16. ent = 0 # initialize entropy
  17. # iterate over classes
  18. for c in classes:
  19. partition = data_classes[data_classes == c] # data with class = c
  20. proportion = len(partition) / N
  21. # update entropy
  22. ent -= proportion * log(proportion, base)
  23. return ent
  24. def cut_point_information_gain(dataset, cut_point, feature_label, class_label):
  25. '''
  26. Return de information gain obtained by splitting a numeric attribute in two according to cut_point
  27. :param dataset: pandas dataframe with a column for attribute values and a column for class
  28. :param cut_point: threshold at which to partition the numeric attribute
  29. :param feature_label: column label of the numeric attribute values in data
  30. :param class_label: column label of the array of instance classes
  31. :return: information gain of partition obtained by threshold cut_point
  32. '''
  33. if not isinstance(dataset, pd.core.frame.DataFrame):
  34. raise AttributeError('input dataset should be a pandas data frame')
  35. entropy_full = entropy(dataset[class_label]) # compute entropy of full dataset (w/o split)
  36. # split data at cut_point
  37. data_left = dataset[dataset[feature_label] <= cut_point]
  38. data_right = dataset[dataset[feature_label] > cut_point]
  39. (N, N_left, N_right) = (len(dataset), len(data_left), len(data_right))
  40. gain = entropy_full - (N_left / N) * entropy(data_left[class_label]) - \
  41. (N_right / N) * entropy(data_right[class_label])
  42. return gain
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...