csinva
/
imodels
mirror of https://github.com/csinva/imodels


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
102

	
103

	
104

	
105

	
106

	
107

	
108

	
109

	
110

	
111

	
112

	
113

	
114

	
115

	
116

	
117

	
118

	
119

	
120

	
121

	
122

	
123

	
124

	
125

	
126

	
127

	
            import numpy as np
from scipy.stats import rankdata


def tauAP_b(x, y, decreasing=True):
    """
    Weighted kendall tau correlation metric, which handles ties.
    Proposed in "The Treatment of Ties in AP Correlation" by
    Urbano and Marrero (2017). This is the python implementation
    of ircor::tauAP_b from R.

    Parameters
    ----------
    x: array-like of shape (n,)
        Numeric vector.
    y: array-like of shape (n,)
        Numeric vector of same length as x.
    decreasing: bool
        Should the sort order be increasing or decreasing (default)?

    Returns
    -------
    Scalar value between -1 and 1, quantifying how much the
    rankings of x and y agree with each other. A higher
    values indicates greater similarity.

    """
    if decreasing:
        return tauAP_b(-x, -y, decreasing=False)
    else:
        return (_tauAP_b_ties(x, y) + _tauAP_b_ties(y, x)) / 2


def _tauAP_b_ties(x, y):
    n = len(x)
    rx = rankdata(x)
    ry = rankdata(y, method="ordinal")
    p = rankdata(y, method="min")
    c_all = 0
    not_top = np.argwhere(p != 1)
    for i in not_top:
        c_above = 0
        for j in np.argwhere(p < p[i]):
            sx = np.sign(rx[i] - rx[j])
            sy = np.sign(ry[i] - ry[j])
            if sx == sy:
                c_above = c_above + 1
        c_all = c_all + c_above/(p[i] - 1)
    return 2 / len(not_top) * c_all - 1


def rbo(s, t, p, k=None, side="top", uneven_lengths=True):
    """
    Rank-based overlap (RBO) metric.
    Proposed in "A Similarity Measure for Indefinite Rankings" by
    Webber et al. (2010). This is the python implementation
    of gespeR::rbo from R.

    Parameters
    ----------
    s: array-like of shape (n,)
        Numeric vector.
    t: array-like of shape (n,)
        Numeric vector of same length as s.
    p: float between 0 and 1
        Weighting parameter in [0, 1]. High p implies strong emphasis
        on the top-ranked elements (i.e, the larger elements).
    k: None or int
        Evaluation depth for extrapolation
    side: string in {"top", "bottom"}
        Evaluate similarity between the top or the bottom of the
        ranked lists.
    uneven_lengths: bool
        Indicator if lists have uneven lengths.

    Returns
    -------
    Scalar value between 0 and 1, quantifying how much the
    rankings of x and y agree with each other. A higher
    values indicates greater similarity.

    """
    assert side in ["top", "bottom"]
    if k is None:
        k = int(np.floor(max(len(s), len(t)) / 2))
    if side == "top":
        ids = {"s": _select_ids(s, "top"),
               "t": _select_ids(t, "top")}
    elif side == "bottom":
        ids = {"s": _select_ids(s, "bottom"),
               "t": _select_ids(t, "bottom")}
    return min(1, _rbo_ext(ids["s"], ids["t"], p, k, uneven_lengths=uneven_lengths))


def _select_ids(x, side="top"):
    assert side in ["top", "bottom"]
    if side == "top":
        return np.argsort(-x)
    elif side == "bottom":
        return np.argsort(x)


def _rbo_ext(x, y, p, k, uneven_lengths=True):
    if len(x) <= len(y):
        S = x
        L = y
    else:
        S = y
        L = x
    l = min(k, len(L))
    s = min(k, len(S))
    if uneven_lengths:
        Xd = [len(np.intersect1d(S[:(i+1)], L[:(i+1)])) for i in range(l)]
        if l > s:
            sl_range = np.arange(s+1, l+1)
        else:
            sl_range = np.arange(l, s+2)
        result = ((1 - p) / p) * \
                 ((sum(Xd[:l] / np.arange(1, l+1) * p**np.arange(1, l+1))) +
                  (sum(Xd[s-1] * (sl_range - s) / (s * sl_range) * p**sl_range))) + \
                 ((Xd[l-1] - Xd[s-1]) / l + (Xd[s-1] / s)) * p**l
    else:
        k = min(s, k)
        Xd = [len(np.intersect1d(x[:(i+1)], y[:(i+1)])) for i in range(k)]
        Xk = Xd[k-1]
        result = (Xk / k) * p**k + (((1 - p) / p) * sum((Xd / np.arange(1, k+1)) * p**np.arange(1, k+1)))
    return result