1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
- import numpy as np
- from scipy.stats import rankdata
def tauAP_b(x, y, decreasing=True):
    """
    Weighted Kendall tau correlation metric, which handles ties.

    Proposed in "The Treatment of Ties in AP Correlation" by
    Urbano and Marrero (2017). This is the python implementation
    of ircor::tauAP_b from R.

    The score is the mean of the two directional terms obtained by
    taking each vector in turn as the reference ranking.

    Parameters
    ----------
    x: array-like of shape (n,)
        Numeric vector.
    y: array-like of shape (n,)
        Numeric vector of same length as x.
    decreasing: bool
        Should the sort order be increasing or decreasing (default)?
        With the default, the largest values are treated as the top
        of each ranking.

    Returns
    -------
    float
        Scalar value quantifying how much the rankings of x and y
        agree with each other (1 for identical rankings). A higher
        value indicates greater similarity.

    Raises
    ------
    ValueError
        If x and y do not contain the same number of elements.
    """
    # Coerce up front: plain Python lists do not support the unary minus
    # used below, and unsigned/boolean arrays would not negate correctly.
    x = np.asarray(x, dtype=float).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            "x and y must have the same number of elements, got {} and {}".format(
                x.size, y.size
            )
        )

    if decreasing:
        # The helper ranks in increasing order (rank 1 = smallest value), so
        # negating both vectors makes the largest original values rank first.
        x, y = -x, -y

    score = (_tauAP_b_ties(x, y) + _tauAP_b_ties(y, x)) / 2
    # The helper may hand back a one-element array; collapse whatever comes
    # back to the plain scalar promised by the docstring.
    return float(np.asarray(score).reshape(-1)[0])
def _tauAP_b_ties(x, y):
    """
    Directional tauAP_b term that uses y as the reference ranking.

    Ranks are assigned in increasing order (rank 1 = smallest value).
    For every item that is not tied for the first position of y, the
    fraction of items ranked strictly above it in y that x orders the
    same way is computed; these fractions are averaged and rescaled
    from [0, 1] to [-1, 1].

    Parameters
    ----------
    x: array-like of shape (n,)
        Numeric vector being compared.
    y: array-like of shape (n,)
        Numeric vector providing the reference ranking.

    Returns
    -------
    float
        The directional score, or NaN when no item sits below the top
        group of y (all values of y tied, or empty input), in which
        case the statistic is undefined.
    """
    rx = rankdata(x)                      # average ranks: ties in x share a rank
    ry = rankdata(y, method="ordinal")    # ties in y broken by position
    p = rankdata(y, method="min")         # 1 + number of strictly smaller y values

    # flatnonzero yields scalar indices, keeping every intermediate (and the
    # final result) a scalar instead of a one-element array.
    not_top = np.flatnonzero(p != 1)
    if not_top.size == 0:
        # Nothing to average over; avoid dividing by zero below.
        return float("nan")

    c_all = 0.0
    for i in not_top:
        above = np.flatnonzero(p < p[i])
        sx = np.sign(rx[i] - rx[above])
        sy = np.sign(ry[i] - ry[above])
        # Ordinal ranks are never equal, so sy is never 0 and a pair that is
        # tied in x (sx == 0) is never counted as concordant.
        c_above = np.count_nonzero(sx == sy)
        # p[i] - 1 is the number of items ranked strictly above item i.
        c_all += c_above / (p[i] - 1)

    return float(2 / not_top.size * c_all - 1)
def rbo(s, t, p, k=None, side="top", uneven_lengths=True):
    """
    Rank-based overlap (RBO) metric.

    Proposed in "A Similarity Measure for Indefinite Rankings" by
    Webber et al. (2010). This is the python implementation
    of gespeR::rbo from R.

    Items are identified by their position: element i of s and
    element i of t are treated as scores of the same item. Each vector
    is turned into a ranking of positions and the two rankings are
    compared prefix by prefix.

    Parameters
    ----------
    s: array-like of shape (n,)
        Numeric vector.
    t: array-like of shape (m,)
        Numeric vector. It may differ in length from s; see
        uneven_lengths.
    p: float
        Weighting parameter in (0, 1]. Agreement at depth d is weighted
        by p**d, so small values concentrate the weight on the first
        ranks while values close to 1 let deeper ranks contribute.
        p = 0 is not supported because the formula divides by p.
    k: None or int
        Evaluation depth for extrapolation. Defaults to half the length
        of the longer vector, rounded down.
    side: string in {"top", "bottom"}
        Evaluate similarity between the top (largest values first) or
        the bottom (smallest values first) of the ranked lists.
    uneven_lengths: bool
        Indicator if lists have uneven lengths.

    Returns
    -------
    Scalar similarity of the rankings of s and t, capped at 1. A
    higher value indicates greater similarity.

    Raises
    ------
    ValueError
        If side is neither "top" nor "bottom".
    """
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if side not in ("top", "bottom"):
        raise ValueError(
            'side must be "top" or "bottom", got {!r}'.format(side)
        )

    # Convert once so plain lists are accepted: the descending ordering is
    # obtained by negating the values, which lists do not support.
    s = np.asarray(s)
    t = np.asarray(t)

    if k is None:
        k = max(len(s), len(t)) // 2

    ids_s = _select_ids(s, side)
    ids_t = _select_ids(t, side)
    # The extrapolated score can exceed 1, hence the cap.
    return min(1, _rbo_ext(ids_s, ids_t, p, k, uneven_lengths=uneven_lengths))
def _select_ids(x, side="top"):
    """
    Return the positions of x ordered by value.

    Parameters
    ----------
    x: array-like of shape (n,)
        Numeric vector.
    side: string in {"top", "bottom"}
        "top" orders from the largest value to the smallest, "bottom"
        from the smallest to the largest.

    Returns
    -------
    numpy.ndarray of int, shape (n,)
        Indices into x in the requested order. Tied values keep their
        original relative order.

    Raises
    ------
    ValueError
        If side is neither "top" nor "bottom".
    """
    # Explicit check rather than `assert`: with assertions disabled an
    # unknown side would otherwise fall through and return None.
    if side not in ("top", "bottom"):
        raise ValueError(
            'side must be "top" or "bottom", got {!r}'.format(side)
        )

    values = np.asarray(x)
    if side == "bottom":
        return np.argsort(values, kind="stable")

    # Descending order is obtained by sorting the negated values. Unsigned
    # integers wrap around (and booleans reject unary minus), so those are
    # promoted to float before negating.
    if values.dtype.kind in "bu":
        values = values.astype(np.float64)
    return np.argsort(-values, kind="stable")
def _rbo_ext(x, y, p, k, uneven_lengths=True):
    """
    Extrapolated rank-biased overlap of two ranked sequences of item IDs.

    Parameters
    ----------
    x, y: sequences of item IDs
        Each ordered from the first rank to the last.
    p: float
        Weighting parameter; agreement at depth d is weighted by p**d.
        Must be non-zero because the formula divides by p.
    k: int
        Maximum evaluation depth. It is capped at the length of each
        sequence.
    uneven_lengths: bool
        If True, use the formula that extrapolates beyond the end of
        the shorter sequence up to the depth reached in the longer one.
        If False, both sequences are evaluated to the same depth.

    Returns
    -------
    The extrapolated overlap score. It is not clipped here and can
    exceed 1.

    Raises
    ------
    ValueError
        If the effective evaluation depth is smaller than 1 (k < 1 or
        an empty sequence).
    """
    if len(x) <= len(y):
        shorter, longer = x, y
    else:
        shorter, longer = y, x
    long_depth = min(k, len(longer))
    short_depth = min(k, len(shorter))
    # Fail early with a clear message; otherwise a zero depth surfaces as an
    # opaque indexing error when the last overlap count is looked up.
    if short_depth < 1:
        raise ValueError(
            "evaluation depth must be at least 1 "
            "(k={!r}, shortest list has {} items)".format(k, len(shorter))
        )

    weight = (1 - p) / p

    if uneven_lengths:
        depths = np.arange(1, long_depth + 1)
        # overlap[d - 1] = number of distinct IDs shared by the two prefixes
        # of length d (the shorter prefix stops growing at its full length).
        overlap = np.array([
            len(np.intersect1d(shorter[:d], longer[:d]))
            for d in range(1, long_depth + 1)
        ])
        overlap_short = overlap[short_depth - 1]
        overlap_long = overlap[long_depth - 1]

        if long_depth > short_depth:
            extra_depths = np.arange(short_depth + 1, long_depth + 1)
        else:
            # NOTE(review): with equal depths this yields [s, s + 1]. The
            # term at depth s is zero, so effectively one term at depth
            # s + 1 is added where a sum over d = s+1..l would be empty.
            # Presumably this mirrors the R reference implementation -
            # TODO confirm before changing; it makes scores above 1 possible.
            extra_depths = np.arange(long_depth, short_depth + 2)

        agreement_sum = np.sum(overlap / depths * p ** depths)
        extrapolated_sum = np.sum(
            overlap_short * (extra_depths - short_depth)
            / (short_depth * extra_depths) * p ** extra_depths
        )
        tail = (
            (overlap_long - overlap_short) / long_depth
            + overlap_short / short_depth
        ) * p ** long_depth
        return weight * (agreement_sum + extrapolated_sum) + tail

    # Equal-depth variant: both sequences are cut at the shorter depth.
    depth = short_depth
    depths = np.arange(1, depth + 1)
    overlap = np.array([
        len(np.intersect1d(x[:d], y[:d])) for d in range(1, depth + 1)
    ])
    return (overlap[depth - 1] / depth) * p ** depth + (
        weight * np.sum(overlap / depths * p ** depths)
    )
|