Deep learning experiments and Readme updated

Witold 8 months ago
parent
commit
cab5cef5d1

+ 16
- 2
README.md

@@ -1,4 +1,4 @@
-# PWG-recommendation-systems
+# PWGŁ-recommendation-systems
 
 Project created for the Recommendation Systems classes which we take as part of our final year of Data Science studies at Wrocław University of Science and Technology. The main contributors to this project are Piotr, Gabriela and Witold. Below are descriptions of the main tasks and our solutions.
 
@@ -11,6 +11,9 @@ We used popular [MovieLens](https://www.kaggle.com/rounakbanik/the-movies-datase
 
 To evaluate the implemented methods we use the HitRate metric with a LeaveOneOut (LOO) train/test split. Our LOO method creates as many folds as there are users - for each user it chooses one rating as the test example. The test rating can be the last one for each user (if a timestamp feature is provided) or thresholded to be a "positive" rating (then HitRate measures positive hits in the recommendations produced by the implemented methods).
 
+In the results section of each method we evaluate the hit rate metric on two splits - one holding out the single newest rating of each user and one holding out the newest positive rating of each user (ratings greater than or equal to 4.0). These splits are named
+**Newest rating** and **Newest positive rating** respectively; a short sketch producing both splits follows this README diff.
+
 ## Work methodology
 
 To track our tasks we use a kanban board on [Trello](https://trello.com/). For our calls we use our own channel on [Discord](https://discordapp.com/). In our project, to manage methods, datasets and pipelines, we use [DataVersionControl](https://dvc.org/).
@@ -20,6 +23,12 @@ To track our tasks we use kanban board on [Trello](https://trello.com/). For our
 
 We implemented user-based collaborative filtering using a classic user ratings matrix based method (ClassicMemoryBasedCollaborativeFiltering) and an SVD user ratings matrix factorisation method (SVDCollaborativeFiltering).
 
+#### Results
+
+| Test set        | ClassicMemoryBasedCF (hits)      | SVDCF (hits)  |
+| :-------------: |:-------------:| :-----:|
+| Newest rating     | 100/671 | 19/671 |
+| Newest positive rating      | 112/671       |   14/671 |
 
 ## Deep Learning Methods
 
@@ -29,4 +38,9 @@ We implemented and evaluated two deep learning methods:
 
 * Neural Content Based Recommendation - a simple concatenation method based on the same movie features that we used in our Content Based Recommendation (a Keras sketch of such a model follows the `run_deep_learning_method.py` diff below)
 
-#### NeuCF
+#### Results
+
+| Test set        | NeuCF (hits)       | Neural Content Based (hits)  |
+| :-------------: |:-------------:| :-----:|
+| Newest rating     | 56/671 | 26/671 |
+| Newest positive rating      | 141/671       |   51/671 |
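
For clarity, here is a minimal sketch of how the two test splits named in the README can be produced with the repo's own `user_leave_on_out` helper. The call signatures match those visible in the script diffs further down; the CSV path and column names are assumptions based on the `ratings_small.csv` usage elsewhere in this commit.

```python
# Sketch only: builds the "Newest rating" and "Newest positive rating" splits.
import pandas as pd

from settings import PATH_TO_DATA
from utils.evaluation.test_train_split import user_leave_on_out

ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")

# "Newest rating": the single most recent rating of every user is held out.
train_df, test_df = list(
    user_leave_on_out(ratings_df, timestamp_column="timestamp", make_user_folds=False)
)[0]

# "Newest positive rating": the most recent rating >= 4.0 of every user is held out.
train_pos_df, test_pos_df = list(
    user_leave_on_out(ratings_df, timestamp_column="timestamp",
                      make_user_folds=False, rating_threshold=4.0)
)[0]
```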

+ 4
- 4
collaborative_filtering/neural_collaborative_filtering.py

@@ -281,7 +281,7 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
         train_ds = self._generate_dataset(train_user_item_ratings, batch_size)
 
         loss_object = tf.keras.losses.BinaryCrossentropy()
-        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
 
         done_epochs = 0
         prev_loss = float('inf')
@@ -303,7 +303,7 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
         self.train_loss.reset_states()
 
         loss_object = tf.keras.losses.BinaryCrossentropy()
-        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
 
         done_epochs = 0
         prev_loss = float('inf')
@@ -329,8 +329,8 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
     def fit(self, train_user_items_ratings, test_user_items_ratings=None, batch_size=20, epochs=20, early_stopping=-1):
         self.user_ratings = create_user_items_rating_matrix_w_indexer(train_user_items_ratings, self.indexer)
 
-        #train_user_items_ratings.extend(self._generate_negative_samples(train_user_items_ratings, 1))
-        gmf_model, mlp_model = self._pretrain_models(train_user_items_ratings, epochs=30, error_delta=0.0001)
+        train_user_items_ratings.extend(self._generate_negative_samples(train_user_items_ratings, 1))
+        gmf_model, mlp_model = self._pretrain_models(train_user_items_ratings, epochs=100, error_delta=0.0001)
 
         self.model = self._build_neu_mf_model(pretrained_gmf_model=gmf_model, pretrained_mlp_model=mlp_model)
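
The `fit` change above re-enables `_generate_negative_samples(train_user_items_ratings, 1)`, whose body lies outside this diff. As a reference point only, a common implementation samples one random unrated movie per positive interaction and labels it 0.0 so `BinaryCrossentropy` sees both classes; the sketch below is an assumption, not the repo's actual code.

```python
import random

def generate_negative_samples(user_item_ratings, negatives_per_positive):
    # user_item_ratings: list of (user_id, movie_id, rating) triples.
    all_movies = {movie for _, movie, _ in user_item_ratings}
    rated_by_user = {}
    for user, movie, _ in user_item_ratings:
        rated_by_user.setdefault(user, set()).add(movie)

    negatives = []
    for user, _, _ in user_item_ratings:
        # Sample movies this user has never rated and label them 0.0.
        candidates = list(all_movies - rated_by_user[user])
        k = min(negatives_per_positive, len(candidates))
        for movie in random.sample(candidates, k):
            negatives.append((user, movie, 0.0))
    return negatives
```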
 

+ 26
- 27
deep_learning/scripts/run_deep_learning_method.py

@@ -43,8 +43,7 @@ def main():
     user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
 
     train_ratings_df, test_ratings_df = \
-        list(user_leave_on_out(user_ratings_df, timestamp_column='timestamp', make_user_folds=False,
-                               rating_threshold=3.5))[0]
+        list(user_leave_on_out(user_ratings_df, timestamp_column="timestamp", make_user_folds=False))[0]
 
     movies_features, user_features, indexer = get_movies_model_features(user_ratings_df, train_ratings_df, dataset_path,
                                                                         path_to_saved_features)
@@ -52,36 +51,36 @@ def main():
     train_data = map_df_to_model_input(train_ratings_df)
     test_data = map_df_to_model_input(test_ratings_df)
 
     # method = NeuralCollaborativeFiltering(indexer=indexer, n_factors=64, model_name="neucf_64_w_4_0_threshold")
 
     method = MovieFeaturesDeepLearningMethod(indexer=indexer, movies_features=movies_features,
                                              user_features=user_features, model_name="movies_features_w_4_0_threshold")
 
-    # method.fit(train_data, test_data, batch_size=25, epochs=50, early_stopping=3)
+    method.fit(train_data, test_data, batch_size=25, epochs=50, early_stopping=5)
 
     # method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "collaborative_filtering", "saved_models", "neucf_64_w_4_0_threshold.h5"), train_user_item_ratings=train_data)
-    method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", "movies_features_w_4_0_threshold.h5"))
-
-    for user, movie, rating in test_data[:10]:
-        recommendations = method.get_recommendations(user_id=user)
-
-        user_rated_movies = train_ratings_df[train_ratings_df[user_column] == user] \
-            .sort_values(rating_column, ascending=False)[[item_column]] \
-            .values.squeeze()
-
-        recommended_movies = [movie_internal_id for movie_internal_id in recommendations if
-                              movie_internal_id not in user_rated_movies][:10]
-
-        print("Rated movies: ")
-        for movie_id in user_rated_movies:
-            print(movie_id_features_dict[movie_id])
-
-        print("Recommended movies: ")
-        for movie_id in recommended_movies:
-            print(movie_id_features_dict[movie_id])
-
-        print("Test movie rating: ")
-        print(movie_id_features_dict[movie], rating)
+    # method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", "movies_features_w_4_0_threshold.h5"))
+
+    # for user, movie, rating in test_data[:10]:
+    #     recommendations = method.get_recommendations(user_id=user)
+    #
+    #     user_rated_movies = train_ratings_df[train_ratings_df[user_column] == user] \
+    #         .sort_values(rating_column, ascending=False)[[item_column]] \
+    #         .values.squeeze()
+    #
+    #     recommended_movies = [movie_internal_id for movie_internal_id in recommendations if
+    #                           movie_internal_id not in user_rated_movies][:10]
+    #
+    #     print("Rated movies: ")
+    #     for movie_id in user_rated_movies:
+    #         print(movie_id_features_dict[movie_id])
+    #
+    #     print("Recommended movies: ")
+    #     for movie_id in recommended_movies:
+    #         print(movie_id_features_dict[movie_id])
+    #
+    #     print("Test movie rating: ")
+    #     print(movie_id_features_dict[movie], rating)
 
 
 if __name__ == '__main__':
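
The README describes `MovieFeaturesDeepLearningMethod` as a simple concatenation method over user and movie feature vectors. Its model code is not part of this commit, so the Keras sketch below only illustrates what such an architecture typically looks like; the layer sizes and the two-input design are assumptions.

```python
import tensorflow as tf

def build_concat_model(user_features_dim, movie_features_dim):
    # Two dense feature vectors in, one sigmoid rating probability out.
    user_input = tf.keras.Input(shape=(user_features_dim,), name="user_features")
    movie_input = tf.keras.Input(shape=(movie_features_dim,), name="movie_features")

    x = tf.keras.layers.Concatenate()([user_input, movie_input])
    x = tf.keras.layers.Dense(128, activation="relu")(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    model = tf.keras.Model(inputs=[user_input, movie_input], outputs=output)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy")
    return model
```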

+ 80
- 37
evaluation/scripts/run_deep_learning_experiments.py

@@ -1,69 +1,112 @@
+import deep_learning.utils as deep_learning_utils
 import pandas as pd
 import numpy as np
+import os
 
+from collaborative_filtering.neural_collaborative_filtering import NeuralCollaborativeFiltering
+from deep_learning.movie_features_deep_learning_method import MovieFeaturesDeepLearningMethod
 from sklearn.feature_extraction.text import CountVectorizer
 from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from utils.id_indexer import Indexer
 from utils.evaluation.test_train_split import user_leave_on_out
-from settings import PATH_TO_DATA
+from utils.evaluation.metrics import hit_rate
+from settings import PATH_TO_DATA, PATH_TO_PROJECT
 
 user_column = 'userId'
 item_column = 'movieId'
 rating_column = 'rating'
 
 
-def get_movie_id_to_feature_mapping(movies_metadata_df):
-    mapping = {}
-    for i, row in movies_metadata_df.iterrows():
-        features = {
-            "title": row["title"],
-            "id": row["id"],
-        }
+def get_movies_model_features(all_users_ratings_df, train_ratings_df, movies_feature_data_path, saved_movies_features):
+    movies_data_df, movies_features = deep_learning_utils.get_movies_features(movies_feature_data_path,
+                                                                              saved_movies_features)
+    movies_data_df.to_csv(f"{PATH_TO_DATA}/generated/movies_data_df.csv")
+    indexer = Indexer(user_ids=all_users_ratings_df[user_column].unique(), movies_ids=movies_data_df['id'].unique())
 
-        mapping[int(row['id'])] = features
+    user_features = deep_learning_utils.get_weighted_movies_user_features(train_ratings_df, indexer, movies_features)
 
-    return mapping
+    return movies_features, user_features, indexer
 
 
-def get_weighted_movies_user_features(user_ratings_df, indexer, movies_features):
-    user_features = []
-
-    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
-        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
-        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
-        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
-        user_rated_movies_features = movies_features[user_rated_movies_id, :]
-        user_movies_features = np.sum(np.multiply(user_ratings, user_rated_movies_features), axis=0)
-        user_features.append(user_movies_features)
+def map_df_to_model_input(data_df):
+    data = data_df[[user_column, item_column, rating_column]].values
+    return [(int(r[0]),
+             int(r[1]),
+             r[2] / 5) for r in data]
 
-    return np.array(user_features)
 
+def load_neucf_model(saved_model_filename, indexer, train_data):
+    method = NeuralCollaborativeFiltering(indexer=indexer, n_factors=64, model_name="neucf_64_wo_threshold")
+    method.load_model(
+        filepath=os.path.join(PATH_TO_PROJECT, "collaborative_filtering", "saved_models", saved_model_filename),
+        train_user_item_ratings=train_data)
+    return method
 
-def map_df_to_model_input(data_df, movies_features, user_features, indexer):
-    data = data_df[[user_column, item_column, rating_column]].values
 
-    return [(user_features[indexer.get_user_internal_id(r[0])],
-             movies_features[indexer.get_movie_internal_id(r[1])],
-             r[2]) for r in data]
+def load_movies_features_model(saved_model_filename, indexer, movies_features, user_features):
+    method = MovieFeaturesDeepLearningMethod(indexer=indexer, movies_features=movies_features,
+                                             user_features=user_features, model_name="movies_features_wo_threshold")
+    method.load_model(
+        filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", saved_model_filename))
+    return method
 
 
 def main():
-    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
-    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
-    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+    dataset_path = os.path.join(PATH_TO_DATA, "raw", "the-movies-dataset")
+    path_to_saved_features = os.path.join(PATH_TO_DATA, "generated", "movies_data_df.csv")
+    user_ratings_df = pd.read_csv(os.path.join(dataset_path, "ratings_small.csv"))
+    movies_metadata = pd.read_csv(os.path.join(dataset_path, "movies_metadata_clean.csv"))
+    movie_id_features_dict = deep_learning_utils.get_movie_id_to_feature_mapping(movies_metadata)
+
     user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
 
-    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
-    features_extractor = FeaturesExtractor(dataset_path)
-    movies_data = features_extractor.run()
+    train_ratings_df, test_ratings_df = \
+        list(user_leave_on_out(user_ratings_df, timestamp_column="timestamp", make_user_folds=False, rating_threshold=4.0))[0]
+
+    movies_features, user_features, indexer = get_movies_model_features(user_ratings_df, train_ratings_df, dataset_path,
+                                                                        path_to_saved_features)
+
+    train_data = map_df_to_model_input(train_ratings_df)
+    test_data = map_df_to_model_input(test_ratings_df)
+
+    # neucf_loader = lambda: load_neucf_model("neucf_64_wo_threshold.h5", indexer, train_data)
+    # movies_feature_model_loader = lambda: load_movies_features_model("movies_features_wo_threshold.h5", indexer,
+    #                                                                  movies_features, user_features)
+
+    neucf_loader = lambda: load_neucf_model("neucf_64_w_4_0_threshold.h5", indexer, train_data)
+    movies_feature_model_loader = lambda: load_movies_features_model("movies_features_w_4_0_threshold.h5", indexer,
+                                                                     movies_features, user_features)
+
+    models = [
+        neucf_loader(),
+        movies_feature_model_loader()
+    ]
+
+    n = 30  # length of the recommendation list used for HitRate (top-30)
+
+    results = {}
+
+    for model in models:
+        iterations = 0
+        all_hits = 0
+        for user, movie, rating in test_data:
+            print(iterations)
+            recommendations = model.get_recommendations(user_id=user)
+            user_rated_movies = train_ratings_df[train_ratings_df[user_column] == user] \
+                .sort_values(rating_column, ascending=False)[[item_column]] \
+                .values.squeeze()
+
+            recommended_movies = [movie_id for movie_id in recommendations if movie_id not in user_rated_movies][:n]
+            hits = hit_rate(gt_items_idx=[movie], predicted_items_idx=recommended_movies)
 
-    cv = CountVectorizer(min_df=3)
-    movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
-    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+            all_hits += hits
+            iterations += 1
 
-    train_df, test_df = list(user_leave_on_out(user_ratings_df, make_user_folds=False, rating_threshold=5.0))[0]
+            if hits > 0:
+                print(f"{model.__class__}: {all_hits}/{iterations}")
 
-    print(train_df)
+        results[model.__class__] = all_hits
+        print(results)
 
 
 if __name__ == '__main__':
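
`hit_rate` from `utils.evaluation.metrics` is imported and called above but not shown in this diff. A minimal implementation consistent with its call sites (a list of ground-truth item ids, a list of recommended ids, an integer count returned and summed into `all_hits`) would be as follows; this is again an assumption about the helper's internals.

```python
def hit_rate(gt_items_idx, predicted_items_idx):
    # Count ground-truth items that appear among the recommendations.
    # Under LOO there is one ground-truth item per user, so the value is
    # 0 or 1 and per-method totals read as "hits/671" in the README tables.
    predicted = set(predicted_items_idx)
    return sum(1 for item in gt_items_idx if item in predicted)
```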

+ 18
- 6
evaluation/scripts/run_experiments.py

@@ -4,6 +4,8 @@ from settings import PATH_TO_DATA
 from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from collaborative_filtering.memory_based_collaborative_filtering import MemoryBasedCollaborativeFiltering
 from collaborative_filtering.svd_collaborative_filtering import SVDCollaborativeFiltering
+from hybrid.average_hybrid_filtering import AverageHybridFiltering
+from hybrid.predicate_hybrid_filtering import PredicateHybridFiltering
 from utils.evaluation.test_train_split import user_leave_on_out
 import pandas as pd
 import os
@@ -19,27 +21,34 @@ def main():
 
     features_extractor = FeaturesExtractor(dataset_path)
     data = features_extractor.run()
-    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
+    data = data.drop_duplicates(subset=['id'])
+    movie_mapping = dict(zip(data['id'].tolist(), range(len(data))))
 
     user_ids = ratings[user_column].unique()
     movie_ids = ratings[item_column].unique()
 
     wcr_factory = lambda: WeightedRatingCbr(data['combined'], movies_mapping=movie_mapping)
+    mem_factory = lambda: MemoryBasedCollaborativeFiltering(user_ids, movie_ids)
+    svd_factory = lambda: SVDCollaborativeFiltering(user_ids, movie_ids)
 
     methods = [
-        #MemoryBasedCollaborativeFiltering(user_ids, movie_ids),
-        SVDCollaborativeFiltering(user_ids, movie_ids),
-        wcr_factory()
+        #mem_factory(),
+        #svd_factory(),
+        wcr_factory(),
+        AverageHybridFiltering([mem_factory(), wcr_factory()], 50)
     ]
 
     n = 30
 
+    results = {}
+
     for method in methods:
         iterations = 0
         all_hits = 0
-        for train_df, test_df in user_leave_on_out(ratings):
+        for train_df, test_df in user_leave_on_out(ratings, timestamp_column="timestamp", rating_threshold=4.0):
+            print(iterations)
             train_ratings = train_df.values[:, :3]
-            user_id, item_id, ratings = test_df.values[:, :3][0]
+            user_id, item_id, rating = test_df.values[:, :3][0]
             method.fit(train_ratings)
             pred_ids = method.get_recommendations(user_id, n)
             hits = hit_rate(gt_items_idx=[item_id.astype(int)], predicted_items_idx=pred_ids)
@@ -54,6 +63,9 @@ def main():
             if hits > 0:
                 print(f"{method.__class__}: {all_hits}/{iterations}")
 
+        results[method.__class__] = all_hits
+        print(results)
+
 
 if __name__ == '__main__':
     main()
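
`AverageHybridFiltering` and `PredicateHybridFiltering` are imported above, but their sources are not in this diff. As a stand-in illustration of list-level hybridisation, here is a small reciprocal-rank-fusion merger over the component recommenders; it is not the repo's class (which, per its name, averages predictions we cannot see), and the meaning of its second constructor argument is not visible here, so it is omitted.

```python
class RankFusionHybrid:
    """Sketch: merge component recommenders' ranked lists (not the repo's class)."""

    def __init__(self, methods):
        self.methods = methods

    def fit(self, train_ratings):
        for method in self.methods:
            method.fit(train_ratings)

    def get_recommendations(self, user_id, n):
        # Reciprocal-rank fusion: items ranked high by several methods win.
        scores = {}
        for method in self.methods:
            for rank, item in enumerate(method.get_recommendations(user_id, 2 * n)):
                scores[item] = scores.get(item, 0.0) + 1.0 / (rank + 1)
        return sorted(scores, key=scores.get, reverse=True)[:n]
```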

+ 1
- 1
utils/evaluation/test_train_split.py

@@ -18,7 +18,7 @@ def user_leave_on_out(user_ratings_df, timestamp_column=None, make_user_folds=Tr
      If False, returns one train dataset and one test set with one rating for each user.
     :return:
     """
-
+    random.seed(30)  # fix the random LOO choice across runs so hit counts are comparable between methods
     user_ids = user_ratings_df[user_column].unique()
     test_indices = []
     for user_id in user_ids: