prune.py
from typing import List
from collections import Counter

from imodels.util.rule import Rule


def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
    # Factorize rules before semantic tree filtering
    rules_ = [tuple(rule) for rule in rules]
    rules_dict = {}

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= precision_min and score[1] >= recall_min:
            if rule in rules_dict:
                # update the score to the new mean
                c = rules_dict[rule][2] + 1
                b = rules_dict[rule][1] + 1. / c * (score[1] - rules_dict[rule][1])
                a = rules_dict[rule][0] + 1. / c * (score[0] - rules_dict[rule][0])
                rules_dict[rule] = (a, b, c)
            else:
                rules_dict[rule] = (score[0], score[1], 1)

    rule_tuple_list = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True)
    return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]
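
A minimal usage sketch of prune_mins with hypothetical data. It relies only on behavior visible in this file: Rule iterating as (rule_string, args) and accepting an args=(precision, recall) tuple.

# Hypothetical example: two occurrences of the same rule string are merged
# into a single Rule whose args hold the running mean of (precision, recall)
# plus an occurrence count.
rules = [Rule('a <= 3', args=(0.9, 0.5)),
         Rule('a <= 3', args=(0.7, 0.7))]
pruned = prune_mins(rules, precision_min=0.6, recall_min=0.4)
print(pruned[0].args)  # (0.8, 0.6, 2): mean precision, mean recall, count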
def deduplicate(rules: List[Rule], max_depth_dup: int) -> List[Rule]:
    if max_depth_dup is not None:
        rules = [max(rules_set, key=f1_score)
                 for rules_set in find_similar_rulesets(rules, max_depth_dup)]
    return sorted(rules, key=lambda x: -f1_score(x))
def f1_score(rule: Rule) -> float:
    return 2 * rule.args[0] * rule.args[1] / \
           (rule.args[0] + rule.args[1]) if (rule.args[0] + rule.args[1]) > 0 else 0
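
Since f1_score reads precision and recall out of rule.args, a quick check (hypothetical rule strings):

# The guard returns 0 when precision and recall are both zero,
# avoiding division by zero in the harmonic mean.
print(f1_score(Rule('a <= 3', args=(0.8, 0.6))))  # 2*0.8*0.6/1.4, about 0.686
print(f1_score(Rule('a <= 3', args=(0.0, 0.0))))  # 0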
def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) -> List[List[Rule]]:
    """Create clusters of rules using a decision tree based
    on the terms of the rules.

    Parameters
    ----------
    rules : List[Rule]
        The rules that should be split into subsets of similar rules.
    max_depth_duplication : int, optional
        Maximum depth of the splitting tree.

    Returns
    -------
    rules : List[List[Rule]]
        The different sets of rules. Each set should be homogeneous.
    """
    def split_with_best_feature(rules, depth, exceptions=[]):
        """
        Method to find a split of rules given the most represented feature
        """
        if depth == 0:
            return rules

        rulelist = [rule.split(' and ') for rule, score in rules]
        terms = [t.split(' ')[0] for term in rulelist for t in term]
        counter = Counter(terms)
        # Drop the exception list (features already used higher in the tree)
        for exception in exceptions:
            del counter[exception]

        if len(counter) == 0:
            return rules

        most_represented_term = counter.most_common()[0][0]
        # Proceed to split on the most represented term
        rules_splitted = [[], [], []]
        for rule in rules:
            if (most_represented_term + ' <=') in rule.rule:
                rules_splitted[0].append(rule)
            elif (most_represented_term + ' >') in rule.rule:
                rules_splitted[1].append(rule)
            else:
                rules_splitted[2].append(rule)
        new_exceptions = exceptions + [most_represented_term]
        # Recurse on each branch with the chosen term excluded
        return [split_with_best_feature(ruleset,
                                        depth - 1,
                                        exceptions=new_exceptions)
                for ruleset in rules_splitted]

    def breadth_first_search(rules, leaves=None):
        if len(rules) == 0 or not isinstance(rules[0], list):
            if len(rules) > 0:
                return leaves.append(rules)
        else:
            for rules_child in rules:
                breadth_first_search(rules_child, leaves=leaves)
        return leaves

    leaves = []
    res = split_with_best_feature(rules, max_depth_duplication)
    breadth_first_search(res, leaves=leaves)
    return leaves
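
Putting it together, a hedged end-to-end sketch (hypothetical rules and thresholds): deduplicate clusters rules that share split terms, keeps the highest-F1 rule per cluster, and sorts the survivors by descending F1.

rules = [Rule('a <= 3', args=(0.9, 0.4)),   # f1 about 0.554
         Rule('a <= 2', args=(0.6, 0.8)),   # f1 about 0.686
         Rule('a > 3', args=(0.5, 0.5))]    # f1 = 0.5
for r in deduplicate(rules, max_depth_dup=2):
    print(r.rule, r.args)
# 'a <= 3' and 'a <= 2' both match the 'a <=' branch, so they land in the
# same leaf and only the higher-F1 'a <= 2' survives, followed by 'a > 3'.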