
add keywords based cbr example

gabirelik, 8 months ago · commit 4f82958667

content_based_recomendation/keywords_based_cbr.py  +5 -4

@@ -13,8 +13,9 @@ class KeywordsBasedCbr:
         x = cv.fit_transform(data)
         self.similarity = cosine_similarity(x)
 
-    def predict(self, user_id, n):
-        pass
+    def predict(self, movie_id, n):
+        return np.argsort(-self.similarity[movie_id])[1:n + 1]
 
-    def movie_based_recommendation(self, movie_id, n):
-        return np.argsort(-self.similarity[movie_id])[1:n + 1]
+    def get_highest_similarities(self, movie_id, n):
+        selected_ids = self.predict(movie_id, n)
+        return self.similarity[movie_id, selected_ids]
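Only the changed lines above are from the commit; for context, here is a minimal runnable sketch of how the class presumably fits together, assuming cv is a scikit-learn CountVectorizer built over the combined keyword strings (a TfidfVectorizer would work the same way):

    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity


    class KeywordsBasedCbr:
        def fit(self, data):
            # Bag-of-words over the combined keyword strings, then a full
            # item-item cosine similarity matrix computed up front.
            cv = CountVectorizer()
            x = cv.fit_transform(data)
            self.similarity = cosine_similarity(x)

        def predict(self, movie_id, n):
            # Indices of the n most similar movies; position 0 of the sorted
            # row is the movie itself (similarity 1.0), so it is skipped.
            return np.argsort(-self.similarity[movie_id])[1:n + 1]

        def get_highest_similarities(self, movie_id, n):
            # Cosine scores matching the ids returned by predict.
            selected_ids = self.predict(movie_id, n)
            return self.similarity[movie_id, selected_ids]

For example:

    cbr = KeywordsBasedCbr()
    cbr.fit(['jedi space rebellion', 'jedi space empire', 'paris romance cafe'])
    print(cbr.predict(0, 2))                   # indices of the 2 nearest movies
    print(cbr.get_highest_similarities(0, 2))  # their cosine similarity scores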

content_based_recomendation/scripts/movie_lens_content_based_recomendation.py  +19 -36

@@ -1,19 +1,28 @@
 from settings import PATH_TO_DATA
 from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from content_based_recomendation.weigted_rating_cbr import WeightedRatingCbr
+from content_based_recomendation.keywords_based_cbr import KeywordsBasedCbr
 import pandas as pd
-import os
+import numpy as np
 
+pd.set_option('display.max_rows', 500)
+pd.set_option('display.max_columns', 500)
+pd.set_option('display.max_colwidth', -1)
 
-def movie_id(data, title):
-    return data.index[data['title'] == title].tolist()[0]
 
+def run_kbcbr_example(data, n=3):
+    keyword_cbr = KeywordsBasedCbr()
+    keyword_cbr.fit(data['combined'])
 
-def filter_ratings(dataset_path, data):
-    ratings = pd.read_csv(os.path.join(dataset_path, 'ratings_small.csv'))
-    ratings = ratings[ratings['movieId'].isin(data['id'])]
-    ratings.to_csv(os.path.join(dataset_path, 'ratings_small_clean.csv'),
-                  index=False)
+    df = data.copy()
+    df['most_similar'] = [[df.iloc[movie_id]['title']
+                           for movie_id in keyword_cbr.predict(movie_id, n)]
+                           for movie_id in data.index]
+    df['avg_similarity'] = [np.mean(keyword_cbr.get_highest_similarities(movie_id, n))
+                            for movie_id in data.index]
+    df = df.sort_values(by=['avg_similarity'])
+
+    return df[['title', 'avg_similarity', 'most_similar', 'combined']]
 
 
 def main():
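To make the two new columns concrete, here is a toy, self-contained version of the same construction (titles and similarity values are made up):

    import numpy as np
    import pandas as pd

    data = pd.DataFrame({'title': ['Star Wars', 'Empire Strikes Back', 'Amelie']})
    # Pretend item-item cosine similarities; the diagonal is the movie itself.
    similarity = np.array([[1.0, 0.9, 0.1],
                           [0.9, 1.0, 0.2],
                           [0.1, 0.2, 1.0]])
    n = 2

    def predict(movie_id, n):
        return np.argsort(-similarity[movie_id])[1:n + 1]

    df = data.copy()
    df['most_similar'] = [[df.iloc[i]['title'] for i in predict(movie_id, n)]
                          for movie_id in data.index]
    df['avg_similarity'] = [np.mean(similarity[movie_id, predict(movie_id, n)])
                            for movie_id in data.index]
    print(df.sort_values(by=['avg_similarity']))

most_similar holds the titles of each movie's n nearest neighbours, and avg_similarity is the mean cosine score of those neighbours, which is what the example later sorts and prints.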
@@ -21,34 +30,8 @@ def main():
     features_extractor = FeaturesExtractor(dataset_path)
     data = features_extractor.run()
 
-    # keyword_cbr = KeywordsBasedCbr()
-    # keyword_cbr.fit(data['combined'])
-    # ids = keyword_cbr.movie_based_recommendation(movie_id(data, 'Star Wars'), 5)
-    # print(data.iloc[ids]['title'])
-
-    # weighted_rating_cbr = WeightedRatingCbr(data['combined'])
-    # movies_mapping, users_mapping, users_matrix, ratings = load_rating(
-    #     dataset_path, data)
-    # weighted_rating_cbr.fit(users_matrix)
-    # print(data.iloc[weighted_rating_cbr.predict(0, 5)]['title'])
-
-    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
-    weighted_rating_cbr = WeightedRatingCbr(data['combined'], movie_mapping)
-
-    ratings = pd.read_csv(os.path.join(dataset_path, 'ratings_small.csv'))
-
-    # weighted_rating_cbr.fit(ratings.values)
-    #
-    # rated_movies = ratings[ratings['userId'] == 1]['movieId']
-    # print(data[data['id'].isin(rated_movies)][['id', 'title', 'combined']])
-    # # print(data.iloc[weighted_rating_cbr.get_recommendations(0, 10)][['id', 'title']])
-    # #
-    # recommendations = weighted_rating_cbr.get_recommendations(1, 10)
-    # print(data[data['id'].isin(recommendations)][['id', 'title', 'combined']])
-
-    # hr = HitRate(10)
-    # hr.evaluate(weighted_rating_cbr, ratings.values, users_matrix)
-    filter_ratings(dataset_path, data)
+    print('Keywords based CBR method example')
+    print(run_kbcbr_example(data).tail(30))
 
 
 if __name__ == '__main__':
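Since sort_values defaults to ascending order, the tail(30) printed here is the 30 movies whose top-3 neighbours are, on average, the most similar. An equivalent, slightly more explicit form (a sketch, not part of the commit) would be:

    top30 = run_kbcbr_example(data).sort_values(by='avg_similarity',
                                                ascending=False).head(30)
    print(top30)   # the same 30 titles, highest average similarity first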

utils/features_extraction/movie_lens_features_extractor.py  +4 -0

@@ -22,6 +22,8 @@ class FeaturesExtractor:
         self.movies_data_files = ['keywords.csv', 'credits.csv',
                                   self.movies_metadata_clean]
 
+        self.messages_to_remove = {'overview': 'No overview found.'}
+
     def run(self, unique=False):
         if not os.path.exists(os.path.join(self.dataset_path,
                                            self.movies_metadata_clean)):
@@ -67,6 +69,8 @@ class FeaturesExtractor:
             lambda value: self.process_dict(value, self.clean_elements))
 
     def process_text_columns(self, data):
+        for col_name, message in self.messages_to_remove.items():
+            data[col_name] = data[col_name].str.replace(message, '')
         data['overview'] = self.process_text_column(data['overview'], 7)
         data['tagline'] = self.process_text_column(data['tagline'], 3)
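The new messages_to_remove hook strips placeholder text such as 'No overview found.' before the overview column is processed, presumably so it does not leak into the combined keyword string. A small sketch of the effect on a toy frame; the regex=False flag is my addition so the '.' is treated literally, whereas the commit relies on pandas' default behaviour:

    import pandas as pd

    # Toy frame mimicking the movies metadata 'overview' column.
    data = pd.DataFrame({'overview': ['No overview found.',
                                      'A farm boy joins a galactic rebellion.']})

    messages_to_remove = {'overview': 'No overview found.'}
    for col_name, message in messages_to_remove.items():
        data[col_name] = data[col_name].str.replace(message, '', regex=False)

    print(data)   # the placeholder row is now an empty string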