Browse Source

add recommendation method interface

gabirelik 8 months ago
parent
commit
49c73e61d5

+ 51
- 0
content_based_recomendation/scripts/index_mapping.py

@@ -0,0 +1,51 @@
+import numpy as np
+
+
+class IndexMapping:
+    """Map raw user/movie ids in a ratings array onto dense matrix indices.
+
+    The ratings array is expected to be 2-D with one rating per row; the
+    column positions of the user id, movie id and rating value are held in
+    ``user_col_id``, ``movie_col_id`` and ``rating_col_id``.
+    """
+
+    def __init__(self, movies_maping):
+        """Store the movie-id mapping and the column layout of the ratings.
+
+        :param movies_maping: dict translating a raw movie id into the
+            row index of that movie in the features table.
+        """
+        # Filled in by reassign_user_id().
+        self.user_mapping = None
+        self.movie_mapping = movies_maping
+        self.user_col_id = 0
+        self.movie_col_id = 1
+        # The only visible caller passes ``ratings_small.csv`` values, whose
+        # columns are userId, movieId, rating, timestamp; the rating is
+        # therefore column 2 (column 3 is the timestamp).
+        # NOTE(review): confirm against the CSV header before merging.
+        self.rating_col_id = 2
+
+    def reassign_ids(self, ratings, mapping, col):
+        """Replace, in place, the ids in column ``col`` using ``mapping``.
+
+        Ids missing from ``mapping`` become NaN so that remove_na_id() can
+        drop those rows later; ``ratings`` must therefore be a float array.
+
+        :param ratings: 2-D array, modified in place.
+        :param mapping: dict from current id to new id.
+        :param col: index of the column to rewrite.
+        """
+        # otypes is given explicitly: without it np.vectorize infers the
+        # output dtype from the first result (breaking when an int comes
+        # first and a missing id later) and rejects empty input.
+        assignment = np.vectorize(
+            lambda current_id: mapping.get(current_id, np.nan),
+            otypes=[float])
+        ratings[:, col] = assignment(ratings[:, col])
+
+    def reassign_movie_id(self, ratings):
+        """Rewrite the movie-id column in place using ``movie_mapping``."""
+        self.reassign_ids(ratings, self.movie_mapping, self.movie_col_id)
+
+    def reassign_user_id(self, ratings):
+        """Build ``user_mapping`` and rewrite the user-id column in place.
+
+        Users are numbered 0..k-1 in ascending order of their raw id, so
+        the numbering is deterministic.
+        """
+        unique_users = np.unique(ratings[:, self.user_col_id]).tolist()
+        self.user_mapping = {
+            user: index for index, user in enumerate(unique_users)}
+        self.reassign_ids(ratings, self.user_mapping, self.user_col_id)
+
+    def to_matrix(self, ratings, init, get_value):
+        """Scatter per-row values into a (users x movies) matrix.
+
+        :param ratings: 2-D array whose user and movie columns already
+            hold dense, NaN-free indices.
+        :param init: callable taking a shape tuple and returning the
+            initial matrix (e.g. ``np.zeros`` or ``np.ones``).
+        :param get_value: callable taking one row and returning the value
+            stored at ``[user, movie]``.
+        :return: matrix of shape (max user index + 1, max movie index + 1),
+            or an empty (0, 0) matrix when ``ratings`` has no rows.
+        """
+        if ratings.shape[0] == 0:
+            return init((0, 0))
+
+        users = ratings[:, self.user_col_id].astype(int)
+        movies = ratings[:, self.movie_col_id].astype(int)
+        matrix = init((int(users.max()) + 1, int(movies.max()) + 1))
+
+        # Rows are applied in order, so for a duplicated (user, movie)
+        # pair the last row wins.
+        for row, user, movie in zip(ratings, users, movies):
+            matrix[user, movie] = get_value(row)
+        return matrix
+
+    def generate_user_matrix(self, ratings):
+        """Return the ratings matrix: 0 where unrated, the rating elsewhere."""
+        return self.to_matrix(ratings, np.zeros,
+                              lambda row: row[self.rating_col_id])
+
+    def generate_rated_mask_matrix(self, ratings):
+        """Return a mask that is 0 for rated (user, movie) pairs, else 1."""
+        return self.to_matrix(ratings, np.ones, lambda row: 0)
+
+    def remove_na_id(self, ratings):
+        """Return only the rows of ``ratings`` that contain no NaN."""
+        return ratings[~np.isnan(ratings).any(axis=1)]
+
+    def get_users_matrix(self, ratings):
+        """Convert raw ratings into a ratings matrix and a rated mask.
+
+        A float64 ``ratings`` array is modified in place (its id columns
+        are rewritten); any other input is first copied to a float array
+        so that unmapped ids can be marked with NaN.
+
+        :param ratings: 2-D array of raw ratings rows.
+        :return: tuple ``(user_matrix, rated_mask)``.
+        """
+        ratings = np.asarray(ratings, dtype=float)
+        self.reassign_movie_id(ratings)
+        self.reassign_user_id(ratings)
+        # Rows whose movie id was not in the mapping now hold NaN.
+        ratings = self.remove_na_id(ratings)
+        return (self.generate_user_matrix(ratings),
+                self.generate_rated_mask_matrix(ratings))

+ 21
- 49
content_based_recomendation/scripts/movie_lens_content_based_recomendation.py

@@ -10,63 +10,35 @@ import os
 def movie_id(data, title):
     return data.index[data['title'] == title].tolist()[0]
 
-#TODO
-
-def reassign_movie_id(data, ratings):
-    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
-    ratings['movieId'] = ratings['movieId'].map(movie_mapping)
-    return movie_mapping
-
-
-def reassign_user_id(data, ratings):
-    user_mapping = dict(zip(set(ratings['userId']),
-                            list(range(len(set(ratings.userId))))))
-    ratings['userId'] = ratings['userId'].map(user_mapping)
-    return user_mapping
-
-
-def remove_na_id(ratings, cols_id):
-    for col in cols_id:
-        ratings = ratings[ratings[col].notna()]
-        ratings[col] = ratings[col].astype(int)
-    return ratings.reset_index(drop=True)
-
-
-def df_to_matrix(df, col_index1, col_index2, col_value):
-    matrix = np.zeros((max(df[col_index1].tolist()) + 1,
-                       max(df[col_index2].tolist()) + 1))
+def main():
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    data = features_extractor.run()
 
-    def fill_matrix(row):
-        matrix[int(row[col_index1]),
-               int(row[col_index2])] = row[col_value]
+    # keyword_cbr = KeywordsBasedCbr()
+    # keyword_cbr.fit(data['combined'])
+    # ids = keyword_cbr.movie_based_recommendation(movie_id(data, 'Star Wars'), 5)
+    # print(data.iloc[ids]['title'])
 
-    df.apply(fill_matrix, axis=1)
-    return matrix
+    # weighted_rating_cbr = WeightedRatingCbr(data['combined'])
+    # movies_mapping, users_mapping, users_matrix, ratings = load_rating(
+    #     dataset_path, data)
+    # weighted_rating_cbr.fit(users_matrix)
+    # print(data.iloc[weighted_rating_cbr.predict(0, 5)]['title'])
 
+    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
+    weighted_rating_cbr = WeightedRatingCbr(data['combined'], movie_mapping)
 
-def load_rating(dataset_path, data):
     ratings = pd.read_csv(os.path.join(dataset_path, 'ratings_small.csv'))
-    movies_mapping = reassign_movie_id(data, ratings)
-    users_mapping = reassign_user_id(data, ratings)
-    ratings = remove_na_id(ratings, ['movieId', 'userId'])
-    users_matrix = df_to_matrix(ratings, 'userId', 'movieId', 'rating')
-    return movies_mapping, users_mapping, users_matrix
 
-def main():
-    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
-    features_extractor = FeaturesExtractor(dataset_path)
-    data = features_extractor.run()
+    weighted_rating_cbr.fit(ratings.values)
 
-    keyword_cbr = KeywordsBasedCbr()
-    keyword_cbr.fit(data['combined'])
-    ids = keyword_cbr.movie_based_recommendation(movie_id(data, 'Star Wars'), 5)
-    print(data.iloc[ids]['title'])
+    rated_movies = ratings[ratings['userId'] == 1]['movieId'].tolist()
+    print(data[data['id'].isin(rated_movies)]['title'])
+    print(data.iloc[weighted_rating_cbr.get_recommendations(0, 5)]['title'])
 
-    weighted_rating_cbr = WeightedRatingCbr()
-    movies_mapping, users_mapping, users_matrix = load_rating(
-        dataset_path, data)
-    weighted_rating_cbr.fit(data['combined'], users_matrix)
-    print(data.iloc[weighted_rating_cbr.predict(0, 5)]['title'])
+    # hr = HitRate(10)
+    # hr.evaluate(weighted_rating_cbr, ratings.values, users_matrix)
 
 
 if __name__ == '__main__':

+ 13
- 8
content_based_recomendation/weigted_rating_cbr.py

@@ -1,20 +1,25 @@
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
+from interfaces import RecommendationMethod
+from content_based_recomendation.scripts.index_mapping import IndexMapping
 
 
-class WeightedRatingCbr:
+class WeightedRatingCbr(RecommendationMethod):
 
-    def __init__(self):
+    def __init__(self, movies_features, movies_mapping):
         self.recommendation_matrix = None
+        self.movies_features = movies_features
+        self.index_mapping = IndexMapping(movies_mapping)
 
-    def fit(self, movies_features, users_ratings_matrix):
-        users_matrix = users_ratings_matrix
-        movies_matrix = self.calc_movies_matrix(movies_features)
+    def fit(self, ratings):
+        users_matrix, mask = self.index_mapping.get_users_matrix(ratings)
+        movies_matrix = self.calc_movies_matrix(self.movies_features)
         users_profile = self.calc_users_profile(users_matrix, movies_matrix)
-        self.recommendation_matrix = users_profile @ movies_matrix.transpose()
+        self.recommendation_matrix = (users_profile @ movies_matrix.transpose()) \
+                                     * mask
 
-    def predict(self, user_id, n):
+    def get_recommendations(self, user_id, n):
         return np.argsort(-self.recommendation_matrix[user_id, :])[1:n + 1]
 
     def calc_movies_matrix(self, data):
@@ -25,6 +30,6 @@ class WeightedRatingCbr:
     def calc_users_profile(self, users_matrix, movies_matrix):
         users_profile = users_matrix @ movies_matrix
         row_sums = users_profile.sum(axis=1)
-        users_profile = users_profile / row_sums[:, np.newaxis]
+        users_profile = users_profile / (row_sums[:, np.newaxis] + 1e-06)
         return users_profile
 

+ 16
- 0
evaluation/hit_rate.py

@@ -0,0 +1,16 @@
+from sklearn.model_selection import LeaveOneOut
+from user_settings import PATH_TO_DATA
+import pandas as pd
+
+class HitRate:
+    """Skeleton of a hit-rate evaluator; the evaluation logic is not
+    implemented yet and both methods currently return ``None``.
+    """
+
+    def __init__(self, n):
+        # Stored unchanged; presumably the length of the recommendation
+        # list to evaluate -- TODO confirm once evaluate() is implemented.
+        self.n = n
+
+    def evaluate(self, recommendation_method, user_items_ratings):
+        """Placeholder: takes a recommendation method and ratings data,
+        does nothing and returns ``None``.
+        """
+        return None
+
+    def is_hit(self, user_items_ratings, user_id, selected_items):
+        """Placeholder: does nothing and returns ``None``."""
+        return None
+
+
+

+ 11
- 0
interfaces.py

@@ -0,0 +1,11 @@
+from abc import ABC, abstractmethod
+
+class RecommendationMethod(ABC):
+    """Abstract base class for recommendation methods.
+
+    Both methods are abstract, so a subclass must override ``fit`` and
+    ``get_recommendations`` before it can be instantiated.
+    """
+
+    @abstractmethod
+    def fit(self, user_items_ratings):
+        """Train the method on ``user_items_ratings``.
+
+        The base implementation does nothing and returns ``None``.
+        """
+        return None
+
+    @abstractmethod
+    def get_recommendations(self, user_id, n):
+        """Produce recommendations for ``user_id``; ``n`` is presumably
+        the number of items requested -- confirm against implementations.
+
+        The base implementation does nothing and returns ``None``.
+        """
+        return None