Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

ranking_stability.py 3.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  1. import numpy as np
  2. from scipy.stats import rankdata
  3. def tauAP_b(x, y, decreasing=True):
  4. """
  5. Weighted kendall tau correlation metric, which handles ties.
  6. Proposed in "The Treatment of Ties in AP Correlation" by
  7. Urbano and Marrero (2017). This is the python implementation
  8. of ircor::tauAP_b from R.
  9. Parameters
  10. ----------
  11. x: array-like of shape (n,)
  12. Numeric vector.
  13. y: array-like of shape (n,)
  14. Numeric vector of same length as x.
  15. decreasing: bool
  16. Should the sort order be increasing or decreasing (default)?
  17. Returns
  18. -------
  19. Scalar value between -1 and 1, quantifying how much the
  20. rankings of x and y agree with each other. A higher
  21. values indicates greater similarity.
  22. """
  23. if decreasing:
  24. return tauAP_b(-x, -y, decreasing=False)
  25. else:
  26. return (_tauAP_b_ties(x, y) + _tauAP_b_ties(y, x)) / 2
  27. def _tauAP_b_ties(x, y):
  28. n = len(x)
  29. rx = rankdata(x)
  30. ry = rankdata(y, method="ordinal")
  31. p = rankdata(y, method="min")
  32. c_all = 0
  33. not_top = np.argwhere(p != 1)
  34. for i in not_top:
  35. c_above = 0
  36. for j in np.argwhere(p < p[i]):
  37. sx = np.sign(rx[i] - rx[j])
  38. sy = np.sign(ry[i] - ry[j])
  39. if sx == sy:
  40. c_above = c_above + 1
  41. c_all = c_all + c_above/(p[i] - 1)
  42. return 2 / len(not_top) * c_all - 1
  43. def rbo(s, t, p, k=None, side="top", uneven_lengths=True):
  44. """
  45. Rank-based overlap (RBO) metric.
  46. Proposed in "A Similarity Measure for Indefinite Rankings" by
  47. Webber et al. (2010). This is the python implementation
  48. of gespeR::rbo from R.
  49. Parameters
  50. ----------
  51. s: array-like of shape (n,)
  52. Numeric vector.
  53. t: array-like of shape (n,)
  54. Numeric vector of same length as s.
  55. p: float between 0 and 1
  56. Weighting parameter in [0, 1]. High p implies strong emphasis
  57. on the top-ranked elements (i.e, the larger elements).
  58. k: None or int
  59. Evaluation depth for extrapolation
  60. side: string in {"top", "bottom"}
  61. Evaluate similarity between the top or the bottom of the
  62. ranked lists.
  63. uneven_lengths: bool
  64. Indicator if lists have uneven lengths.
  65. Returns
  66. -------
  67. Scalar value between 0 and 1, quantifying how much the
  68. rankings of x and y agree with each other. A higher
  69. values indicates greater similarity.
  70. """
  71. assert side in ["top", "bottom"]
  72. if k is None:
  73. k = int(np.floor(max(len(s), len(t)) / 2))
  74. if side == "top":
  75. ids = {"s": _select_ids(s, "top"),
  76. "t": _select_ids(t, "top")}
  77. elif side == "bottom":
  78. ids = {"s": _select_ids(s, "bottom"),
  79. "t": _select_ids(t, "bottom")}
  80. return min(1, _rbo_ext(ids["s"], ids["t"], p, k, uneven_lengths=uneven_lengths))
  81. def _select_ids(x, side="top"):
  82. assert side in ["top", "bottom"]
  83. if side == "top":
  84. return np.argsort(-x)
  85. elif side == "bottom":
  86. return np.argsort(x)
  87. def _rbo_ext(x, y, p, k, uneven_lengths=True):
  88. if len(x) <= len(y):
  89. S = x
  90. L = y
  91. else:
  92. S = y
  93. L = x
  94. l = min(k, len(L))
  95. s = min(k, len(S))
  96. if uneven_lengths:
  97. Xd = [len(np.intersect1d(S[:(i+1)], L[:(i+1)])) for i in range(l)]
  98. if l > s:
  99. sl_range = np.arange(s+1, l+1)
  100. else:
  101. sl_range = np.arange(l, s+2)
  102. result = ((1 - p) / p) * \
  103. ((sum(Xd[:l] / np.arange(1, l+1) * p**np.arange(1, l+1))) +
  104. (sum(Xd[s-1] * (sl_range - s) / (s * sl_range) * p**sl_range))) + \
  105. ((Xd[l-1] - Xd[s-1]) / l + (Xd[s-1] / s)) * p**l
  106. else:
  107. k = min(s, k)
  108. Xd = [len(np.intersect1d(x[:(i+1)], y[:(i+1)])) for i in range(k)]
  109. Xk = Xd[k-1]
  110. result = (Xk / k) * p**k + (((1 - p) / p) * sum((Xd / np.arange(1, k+1)) * p**np.arange(1, k+1)))
  111. return result
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...