
User-based KNN was implemented.

Lukasz Sus 8 months ago
parent commit 1b8beac1e2
3 changed files with 139 additions and 8 deletions
  1. README.md (+25, -7)
  2. evaluation/scripts/run_knn_experiments.py (+55, -1)
  3. knn/knn_collaborative_filtering.py (+59, -0)

README.md (+25, -7)

@@ -80,13 +80,31 @@ In sampling clusters (point 2.), every cluster probability is exponential depend
 | Newest rating          | 7.3/671  |  4.9/671 | 14/671    |
 | Newest positive rating | 5.2/671  |  1.7/671 | 9/671    |
 
-## Kmeans clustering
-
-#### Results
-|                        | Cosine | Minkowski_2 | - |
-|------------------------|----------|----------|----------|
-| Newest rating          | 5/671  |  2/671 | -    |
-| Newest positive rating | 7/671  |  6/671 | -    |
+## KNN Recommendation
+
+**Item based**
+It uses the same movie features as the deep learning method and k-means: a bag of words built from each movie's keywords and description.
+A k nearest neighbours model is fitted on the movies watched and rated by the user. KNN regression is then used to estimate a rating for every unwatched movie.
+All movies are sorted by the estimated rating and the top n movies are recommended (see the sketch below).
+The algorithm was checked with k equal to 5 and 50. If k exceeds the number of movies rated by the user, k is reduced to that number.
+Results are presented for k = 50.
+
+**User based**
+A user is described by his or her movie ratings. The top k most similar users are chosen, and all movies rated by these users are sorted by their ratings.
+If two of these users rated the same movie, the rating from the more similar user is taken. The recommendation is the top n movies from that sorted list.
+Results are presented for k = 10.
+
+#### Results - item based
+|                        | Cosine | Minkowski_2 |
+|------------------------|----------|----------|
+| Newest rating          | 5/671  |  2/671 |
+| Newest positive rating | 7/671  |  6/671 |
+
+#### Results - user based
+|                        | Cosine | Euclidean |
+|------------------------|----------|----------|
+| Newest rating          | 76/671  |  13/671 |
+| Newest positive rating | 78/671  |  9/671 |
 
 ## Content Based Recommendation
 
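The item-based regression step described above (whose actual implementation lives in `knn/knn_item_based.py`) can be illustrated with a minimal scikit-learn sketch. The function name, the toy texts, and the way k is shrunk to the number of rated movies are illustrative assumptions, not code from this commit.

```python
# Illustrative sketch of the item-based step, not the repository's KnnItemBased class.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsRegressor


def recommend_item_based(rated_texts, rated_scores, unwatched_texts, k=50, top_n=10):
    """Estimate ratings for unwatched movies with KNN regression and return top_n indices."""
    vectorizer = CountVectorizer()                         # bag of words over keywords + description
    rated_features = vectorizer.fit_transform(rated_texts)
    unwatched_features = vectorizer.transform(unwatched_texts)

    k = min(k, len(rated_scores))                          # if k > number of rated movies, shrink k
    model = KNeighborsRegressor(n_neighbors=k, metric='cosine', algorithm='brute')
    model.fit(rated_features, rated_scores)

    estimated = model.predict(unwatched_features)          # estimated rating per unwatched movie
    return np.argsort(estimated)[::-1][:top_n]             # indices of the highest estimates


# Toy example: two rated movies, three unwatched candidates.
rated = ["space adventure alien crew", "romantic comedy paris cafe"]
scores = [5.0, 2.0]
unwatched = ["alien invasion space fleet", "paris love story cafe", "space station thriller"]
print(recommend_item_based(rated, scores, unwatched, k=2, top_n=2))
```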

evaluation/scripts/run_knn_experiments.py (+55, -1)

@@ -7,8 +7,10 @@ from sklearn.feature_extraction.text import CountVectorizer
 from tqdm import tqdm
 from deep_learning.utils import get_movie_id_to_feature_mapping
 from evaluation.scripts.run_clustering_experiments import create_kmeans_item_based_input_df
+from knn.knn_collaborative_filtering import KnnCollaborativeFiltering
 from knn.knn_item_based import KnnItemBased
 from utils.evaluation.metrics import hit_rate
+from utils.evaluation.test_train_split import user_leave_on_out
 from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from utils.id_indexer import Indexer
 from settings import PATH_TO_DATA
@@ -105,5 +107,57 @@ def knn_item_based():
         print(row)
 
 
+def knn_collaborative_filtering():
+    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+    user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    movies_data = features_extractor.run()
+    movies_data = movies_data.drop_duplicates(["id"])
+
+    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+
+    results = list()
+    metric = 'cosine'   # 'euclidean'
+    # threshold None corresponds to the "newest rating" rows, 4.0 to "newest positive rating".
+    for threshold in [None, 4.0]:
+
+        method = KnnCollaborativeFiltering(indexer, n_neighbors=10, metric=metric)
+
+        print("Testing...")
+        iterations = 0
+        all_hits = 0
+
+        for train_df, test_df in user_leave_on_out(user_ratings_df, timestamp_column="timestamp", rating_threshold=threshold):
+            print(iterations)
+            train_ratings = train_df.values[:, :3]
+            user_id, item_id, rating = test_df.values[:, :3][0]
+            method.fit(train_ratings)
+            pred_ids = method.get_recommendations(user_id, top_n=30)
+
+            print("Recommended movies: ")
+            for movie_id in pred_ids:
+                print(movie_id_features_dict[indexer.get_movie_id(movie_id)])
+
+            hits = hit_rate(gt_items_idx=[item_id.astype(int)], predicted_items_idx=pred_ids)
+
+            all_hits += hits
+            iterations += 1
+
+        if all_hits > 0:
+            print(f"{method.__class__}: {all_hits}/{iterations}")
+            print(f"Percentage-wise: {all_hits / iterations}")
+        print(f"Total hits: {all_hits}")
+        print(f"Total iterations: {iterations}")
+
+        results.append([metric, threshold, all_hits])
+
+    for row in results:
+        print(row)
+
+
 if __name__ == '__main__':
-    knn_item_based()
+    # knn_item_based()
+    knn_collaborative_filtering()

knn/knn_collaborative_filtering.py (+59, -0)

@@ -0,0 +1,59 @@
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+from interfaces import RecommendationMethod
+from collaborative_filtering.utils import create_user_items_rating_matrix
+
+
+class KnnCollaborativeFiltering(RecommendationMethod):
+    def __init__(self, indexer, n_neighbors, metric='cosine'):
+        """
+        User-based k nearest neighbours collaborative filtering.
+
+        :param indexer: maps raw user and movie ids to internal matrix indices
+        :param n_neighbors: number of most similar users used for recommendations
+        :param metric: 'cosine' or 'euclidean'
+        """
+        self.indexer = indexer
+        self.n_neighbors = n_neighbors
+        self.metric = metric
+
+    def fit(self, user_items_ratings):
+        self.user_ratings = create_user_items_rating_matrix(user_items_ratings,
+                                                            self.indexer.user_id_to_internal_id_dict,
+                                                            self.indexer.movie_id_to_internal_id_dict)
+
+        if self.metric == 'cosine':
+            self.user_similarities = cosine_similarity(self.user_ratings)
+        else:
+            # Convert Euclidean distances to similarities: clamp zero distances to avoid
+            # division by zero, then invert so smaller distances mean higher similarity.
+            self.user_similarities = euclidean_distances(self.user_ratings)
+            self.user_similarities[self.user_similarities == 0] = 1e-6
+            self.user_similarities = 1 / self.user_similarities
+
+    def get_recommendations(self, user_id, top_n):
+        train_movies_internal_indices = np.nonzero(self.user_ratings[self.indexer.get_user_internal_id(user_id)])[0]
+        user_similarities = self.user_similarities[self.indexer.get_user_internal_id(user_id), :]
+        closest_users = np.argsort(user_similarities)[::-1]
+        closest_users = closest_users[1:]       # drop the query user itself
+        recommendations = list()
+
+        n_neighbors = min(self.n_neighbors, len(closest_users)) - 1
+        # Grow the neighbourhood until enough unwatched movies have been collected
+        # (or until every other user has been included).
+        while len(recommendations) < top_n and n_neighbors < len(closest_users):
+            n_neighbors += 1
+            top_users = closest_users[:n_neighbors]
+            candidates = list()
+            ratings = list()
+            for internal_user_id in top_users:
+                # Movies rated by this neighbour, ordered from highest to lowest rating.
+                similiar_user_movies = list(np.argsort(self.user_ratings[internal_user_id])[::-1])
+                non_zero_count = np.sum(self.user_ratings[internal_user_id] != 0)
+                similiar_user_movies = similiar_user_movies[:non_zero_count]
+                similiar_user_movies_ratings = list(np.sort(self.user_ratings[internal_user_id])[::-1])
+                similiar_user_movies_ratings = similiar_user_movies_ratings[:non_zero_count]
+                candidates.extend(similiar_user_movies)
+                ratings.extend(similiar_user_movies_ratings)
+            # When several neighbours rated the same movie, keep the rating from the more
+            # similar user (its first occurrence, since neighbours are visited in similarity
+            # order), then sort candidates by rating, highest first, and drop watched movies.
+            candidate_ratings = dict()
+            for movie_internal_id, rating in zip(candidates, ratings):
+                if movie_internal_id not in candidate_ratings:
+                    candidate_ratings[movie_internal_id] = rating
+            sorted_candidates = sorted(candidate_ratings.items(), key=lambda x: x[1], reverse=True)
+            recommendations = [movie_internal_id for movie_internal_id, rating in sorted_candidates
+                               if movie_internal_id not in train_movies_internal_indices]
+
+        return recommendations[:top_n]
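
A hypothetical usage sketch for the new class follows. The toy ratings are made up, and `Indexer` / `create_user_items_rating_matrix` are assumed to behave exactly as they are used in `fit()` above (mapping raw user and movie ids to dense internal indices).

```python
# Hypothetical usage sketch; the ratings below are invented, and Indexer is assumed
# to accept the keyword arguments it is called with in run_knn_experiments.py.
import numpy as np
from utils.id_indexer import Indexer
from knn.knn_collaborative_filtering import KnnCollaborativeFiltering

# (user_id, movie_id, rating) triples, the shape expected by fit().
ratings = np.array([
    [1, 10, 5.0],
    [1, 20, 4.0],
    [2, 10, 4.5],
    [2, 30, 5.0],
    [3, 20, 3.0],
    [3, 30, 4.0],
])

indexer = Indexer(user_ids=np.unique(ratings[:, 0]), movies_ids=np.unique(ratings[:, 1]))
method = KnnCollaborativeFiltering(indexer, n_neighbors=2, metric='cosine')
method.fit(ratings)

# Internal movie indices of the recommendations; map back with indexer.get_movie_id().
print(method.get_recommendations(user_id=1, top_n=2))
```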