Browse Source

Kmeans finished.

Lukasz Sus 8 months ago
parent
commit
716eec892f
3 changed files with 269 additions and 128 deletions
  1. 33
    1
      README.md
  2. 93
    17
      clustering/kmeans_item_based.py
  3. 143
    110
      evaluation/scripts/run_clustering_experiments.py

+ 33
- 1
README.md

@@ -43,4 +43,36 @@ We implemented and evaluated two deep learning methods:
 | Test set        | NeuCf (hits)       | Neural Content Based (hits)  |
 | :-------------: |:-------------:| :-----:|
 | Newest rating     | 56/671 | 26/671 |
-| Newest positive rating      | 141/671       |   51/671 |
+| Newest positive rating      | 141/671       |   51/671 |
+
+## Kmeans clustering
+
+All kmeans variants are item based. The kmeans recommendation system defines a user by the features of the movies that he or she has watched.
+It consists of an n x m matrix, where n is the number of movies rated by the user and m is the feature vector length.
+
+Task: recommend top n movies.
+
+Kmeans_1:
+1. Assign every movie rated by user to its cluster.
+2. Sample n clusters from previous step with probability distribution defined by user ratings.
+3. Sample movies from chosen clusters. Every movie with uniform distribution within a cluster.
+4. Remove duplicated recommendations and remove movies watched by user.
+5. If number of recommended movies is lower than n, return to step 3.
+
+Kmeans_2:
+When sampling clusters (step 2.), each cluster's probability depends exponentially on the user's rating for the movie.
+
+Kmeans_3:
+1. Assign every movie rated by user to its cluster.
+2. Rate every cluster by computing the mean of the user's ratings inside that cluster.
+3. Sort clusters according to their ratings. If two clusters have the same rating, the cluster with more ratings is preferred.
+4. Choose cluster with the highest rating.
+5. Get all movies from the cluster. Sort them by their popularity.
+6. Remove duplicated recommendations and remove movies watched by user.
+7. If number of recommended movies is lower than n, return to step 4 and select next cluster.
+
+#### Results
+|                        | Kmeans_1 | Kmeans_2 | Kmeans_3 |
+|------------------------|----------|----------|----------|
+| Newest rating          | 1.4/671  |  2.1/671 | 7/671    |
+| Newest positive rating | 1.7/671  |  1.7/671 | 5/671    |

+ 93
- 17
clustering/kmeans_item_based.py

@@ -6,38 +6,114 @@ from interfaces import RecommendationMethod
 
 
 class KmeansItemBased(RecommendationMethod):
-    def __init__(self):
-        pass
+    def __init__(self, cluster_selection="random", item_selection="random"):
+        """
+        :param cluster_selection: way of selecting clusters for recommendation - works only if item_selection is 'random':
+            'random' - sampling with repeating from clusters to which user's movies are assigned to
+                        probability of every cluster is proportional to user rating for a movie
+            'exponential' - sampling with repeating from clusters to which user's movies are assigned to
+                        probability of every cluster is proportional to 2^r where r is the rating for a movie
+        :param item_selection: way of selecting item from cluster for recommendation:
+            'random' - sampling uniformly
+            'popularity' - take most popular movies from cluster for recommendation
+        """
+        self.cluster_selection = cluster_selection
+        self.item_selection = item_selection
 
-    def fit(self, movies_features):
+    def fit(self, movies_features, n_clusters=10, popularity=None):
+        """
+
+    :param movies_features: features of all movies
+        :param n_clusters: number of clusters
+        :param popularity: popularity of movies; used only if item_selection = 'popularity', otherwise ignored
+        :return:
+        """
         self.movies_features = movies_features
-        self.kmeans = KMeans(n_clusters=50, init="k-means++", random_state=0, n_jobs=-1,
+        if popularity is not None:
+            self.popularity = popularity
+        self.kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=0, n_jobs=-1,
                              n_init=10, max_iter=500).fit(movies_features)
+        if self.item_selection == "popularity":
+            self._sort_by_popularity_for_clusters()
 
     def get_recommendations(self, user_features, train_movie_ids, user_ratings, top_n):
         self.recommended = list()
         self.train_movie_ids = train_movie_ids
-
         centroid_indices = self.kmeans.predict(user_features)
-        centroid_indices = self._chose_centroids(centroid_indices, user_ratings, top_n)
-        while len(self.recommended) < top_n:
-            self.recommended = list(self.recommended)
-            for centroid_id in centroid_indices:
-                self.recommended.append(self._sample_from_cluster(centroid_id))
-            self.recommended = np.unique(self.recommended)
-            for movie_id in train_movie_ids:
-                self.recommended = self.recommended[self.recommended != movie_id]
+
+        if self.item_selection == "popularity":
+            centroid_indices = self._choose_centroids(centroid_indices, user_ratings)
+            while len(self.recommended) < top_n:
+                self.recommended = list(self.recommended)
+                for centroid_id in centroid_indices:
+                    self.recommended.extend(self._get_most_popular(centroid_id))
+                self.recommended = np.unique(self.recommended)
+                for movie_id in train_movie_ids:
+                    self.recommended = self.recommended[self.recommended != movie_id]
+
+        else:
+            centroid_indices = self._sample_centroids(centroid_indices, user_ratings, top_n)
+
+            while len(self.recommended) < top_n:
+                self.recommended = list(self.recommended)
+                for centroid_id in centroid_indices:
+                    self.recommended.append(self._sample_from_cluster(centroid_id))
+                self.recommended = np.unique(self.recommended)
+                for movie_id in train_movie_ids:
+                    self.recommended = self.recommended[self.recommended != movie_id]
 
         return self.recommended[:top_n]
 
-    def _chose_centroids(self, centroid_indices, user_ratings, size):
-        probs = user_ratings / np.sum(user_ratings)
-        chosen = np.random.choice(centroid_indices, size=size, p=probs)
-        return chosen
+    def _sample_centroids(self, centroid_indices, user_ratings, size):
+        if self.cluster_selection == "exponential":
+            logits = np.power(2, 1 + 5 * np.asarray(user_ratings))
+            probs = logits / np.sum(logits)
+            chosen = np.random.choice(centroid_indices, size=size, p=probs)
+            return chosen
+        else:       # random
+            probs = user_ratings / np.sum(user_ratings)
+            chosen = np.random.choice(centroid_indices, size=size, p=probs)
+            return chosen
+
+    def _choose_centroids(self, centroid_indices, user_ratings):
+        score = dict()
+        weight = dict()
+
+        for centroid_id, user_rating in zip(centroid_indices, user_ratings):
+            if centroid_id not in score:
+                score[centroid_id] = user_rating
+                weight[centroid_id] = 1
+            else:
+                score[centroid_id] = score[centroid_id] + user_rating
+                weight[centroid_id] = weight[centroid_id] + 1
+
+        for key, value in score.items():
+            score[key] = value / weight[key]
+
+        centroids = [(key, value, weight[key]) for key, value in score.items()]
+        centroids = sorted(centroids, key=lambda x: (x[1], x[2]), reverse=True)
+        centroids = [x[0] for x in centroids]
+        
+        return centroids
 
     def _sample_from_cluster(self, centroid_id):
         indices = np.argwhere(self.kmeans.labels_ == centroid_id).squeeze()
         return np.random.choice(indices, 1)[0]
 
+    def _get_most_popular(self, centroid_id):
+        return self.best_movies_per_cluster[centroid_id]
 
+    def _sort_by_popularity_for_clusters(self):
+        labels = self.kmeans.labels_
+        cluser_indices = np.unique(labels)
+        self.best_movies_per_cluster = list()
 
+        for cluster_index in cluser_indices:
+            item_indices = np.argwhere(labels == cluster_index).flatten()
+            if len(item_indices) == 0:
+                self.best_movies_per_cluster.append(list())
+                continue
+            cluster_popularities = [self.popularity[i] for i in item_indices]
+            zipped = list(zip(list(item_indices), cluster_popularities))
+            zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
+            self.best_movies_per_cluster.append([x[0] for x in zipped])

+ 143
- 110
evaluation/scripts/run_clustering_experiments.py

@@ -8,7 +8,8 @@ from tqdm import tqdm
 
 from clustering.kmeans_item_based import KmeansItemBased
 from clustering.kmeans_user_based import KmeansUserBased
-from evaluation.scripts.run_deep_learning_experiments import get_movie_id_to_feature_mapping, map_df_to_model_input
+from deep_learning.utils import get_movie_id_to_feature_mapping
+from evaluation.scripts.run_deep_learning_experiments import map_df_to_model_input
 from utils.evaluation.metrics import hit_rate
 from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from utils.id_indexer import Indexer
@@ -20,76 +21,77 @@ item_column = 'movieId'
 rating_column = 'rating'
 
 
-def get_user_features_on_movies_features(user_ratings_df, indexer, movies_features):
-    user_features = []
-
-    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
-        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
-        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
-        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
-        user_rated_movies_features = movies_features[user_rated_movies_id, :]
-        user_movies_features = np.multiply(user_ratings, user_rated_movies_features)
-        user_movies_features[user_movies_features == 0] = np.nan
-        user_movies_features = np.nan_to_num(np.nanmean(user_movies_features, axis=0))
-        # user_movies_features[user_movies_features == 0] = 0.5
-        # user_movies_features = np.mean(user_movies_features, axis=0)
-
-        user_features.append(user_movies_features)
-
-    return np.array(user_features)
-
-
-def kmeans_user_based():
-    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
-    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
-    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
-    user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
-
-    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
-    features_extractor = FeaturesExtractor(dataset_path)
-    movies_data = features_extractor.run()
-    movies_data = movies_data.drop_duplicates(["id"])
-
-    cv = CountVectorizer(min_df=3)
-    movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
-    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
-
-    method = KmeansUserBased(movies_features=movies_features)
-
-    for train_df, test_df in user_leave_on_out(user_ratings_df, timestamp_column="timestamp"):
-        user_features = get_user_features_on_movies_features(train_df, indexer, movies_features)
-
-        # train_data = map_df_to_model_input(train_df, movies_features, user_features, indexer)
-        test_data = map_df_to_model_input(test_df, movies_features, user_features, indexer)
-
-        method.fit(user_features)
-
-        for i, (index, row) in enumerate(test_df.iterrows()):
-            user, movie, rating = test_data[i]
-            recommendations = method.get_recommendations(user.reshape(1, -1), top_n=10)
-
-            user_rated_movies = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
-                .sort_values(rating_column, ascending=False)[[item_column]] \
-                .values.squeeze()
-
-            user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
-                .sort_values(rating_column, ascending=False)[[rating_column]] \
-                .values.squeeze()
-
-            recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
-
-            print(f"Test movie: {movie_id_features_dict[row[item_column]]}, rating: {row[rating_column]}")
-
-            print("Rated movies: ")
-            for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
-                print(movie_id_features_dict[movie_id], f"rating: {rating}")
-
-            print("Recommended movies: ")
-            for movie_id in recommended_movies:
-                print(movie_id_features_dict[movie_id])
-
-
-def create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer, timestamp_column):
+# def get_user_features_on_movies_features(user_ratings_df, indexer, movies_features):
+#     user_features = []
+# 
+#     for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
+#         user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
+#         user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
+#         user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
+#         user_rated_movies_features = movies_features[user_rated_movies_id, :]
+#         user_movies_features = np.multiply(user_ratings, user_rated_movies_features)
+#         user_movies_features[user_movies_features == 0] = np.nan
+#         user_movies_features = np.nan_to_num(np.nanmean(user_movies_features, axis=0))
+#         # user_movies_features[user_movies_features == 0] = 0.5
+#         # user_movies_features = np.mean(user_movies_features, axis=0)
+# 
+#         user_features.append(user_movies_features)
+# 
+#     return np.array(user_features)
+# 
+# 
+# def kmeans_user_based():
+#     user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+#     movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+#     movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+#     user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+# 
+#     dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+#     features_extractor = FeaturesExtractor(dataset_path)
+#     movies_data = features_extractor.run()
+#     movies_data = movies_data.drop_duplicates(["id"])
+# 
+#     cv = CountVectorizer(min_df=3)
+#     movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
+#     indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+# 
+#     method = KmeansUserBased(movies_features=movies_features)
+# 
+#     for train_df, test_df in user_leave_on_out(user_ratings_df, timestamp_column="timestamp"):
+#         user_features = get_user_features_on_movies_features(train_df, indexer, movies_features)
+# 
+#         # train_data = map_df_to_model_input(train_df, movies_features, user_features, indexer)
+#         test_data = map_df_to_model_input(test_df, movies_features, user_features, indexer)
+# 
+#         method.fit(user_features)
+# 
+#         for i, (index, row) in enumerate(test_df.iterrows()):
+#             user, movie, rating = test_data[i]
+#             recommendations = method.get_recommendations(user.reshape(1, -1), top_n=10)
+# 
+#             user_rated_movies = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
+#                 .sort_values(rating_column, ascending=False)[[item_column]] \
+#                 .values.squeeze()
+# 
+#             user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
+#                 .sort_values(rating_column, ascending=False)[[rating_column]] \
+#                 .values.squeeze()
+# 
+#             recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
+# 
+#             print(f"Test movie: {movie_id_features_dict[row[item_column]]}, rating: {row[rating_column]}")
+# 
+#             print("Rated movies: ")
+#             for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
+#                 print(movie_id_features_dict[movie_id], f"rating: {rating}")
+# 
+#             print("Recommended movies: ")
+#             for movie_id in recommended_movies:
+#                 print(movie_id_features_dict[movie_id])
+
+
+def create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer, timestamp_column=None,
+                                      rating_threshold=None):
     users_features = list()
     train_movies_ids = list()
     train_ratings = list()
@@ -99,10 +101,18 @@ def create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer,
     user_ids = user_ratings_df[user_column].unique()
     test_indices = []
 
-    print("Preparing test data set...")
     for user_id in tqdm(user_ids):
         user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id]
 
+        if rating_threshold is not None:
+            thresholded_user_ratings = user_ratings[user_ratings[rating_column] >= rating_threshold]
+
+            while len(thresholded_user_ratings) < 2:
+                rating_threshold -= 0.5
+                thresholded_user_ratings = user_ratings[user_ratings[rating_column] >= rating_threshold]
+
+            user_ratings = thresholded_user_ratings
+
         if timestamp_column is not None:
             user_ratings = user_ratings.sort_values(by=timestamp_column)
             test_index = len(user_ratings) - 1
@@ -138,6 +148,15 @@ def create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer,
     return df
 
 
+def prepare_popularities(movies_metadata, movies_data):
+    popularities = list()
+    for index, row in movies_data.iterrows():
+        popularities.append(movies_metadata[movies_metadata["id"] == row["id"]]["popularity"].values[0])
+    if len(popularities) != len(movies_data):
+        raise ValueError("Len of popularities must be equal to len of movies_data")
+    return popularities
+
+
 def kmeans_item_based():
     user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
     movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
@@ -153,57 +172,71 @@ def kmeans_item_based():
     movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
     indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
 
-    method = KmeansItemBased()
-    print("Fitting kmeans...")
-    method.fit(movies_features)
+    results = list()
+    for n_centroids in [5, 10, 20, 50, 100]:
+        method = KmeansItemBased(cluster_selection="exponential",
+                                 item_selection="random")
+        print("Fitting kmeans...")
+        method.fit(movies_features, n_clusters=n_centroids,
+                   popularity=prepare_popularities(movies_metadata, movies_data))
+
+        df_dataset = create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer,
+                                                       timestamp_column="timestamp",
+                                                       rating_threshold=None)
+
+        print("Testing...")
+        iterations = 0
+        all_hits = 0
+
+        for i in range(10):
+            for index, row in tqdm(df_dataset.iterrows()):
+                user_id = row["user_id"]
+                user_matrix = row["user_matrix"]
+                train_movie_ids = row["train_movie_ids"]
+                train_ratings = row["train_ratings"]
+                test_movie_id = row["test_movie_ids"]
+                test_rating = row["test_ratings"]
 
-    df_dataset = create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer,
-                                             timestamp_column="timestamp")
+                recommendations = method.get_recommendations(user_matrix, train_movie_ids, train_ratings, top_n=10)
 
-    print("Testing...")
-    iterations = 0
-    all_hits = 0
-    for index, row in tqdm(df_dataset.iterrows()):
-        user_id = row["user_id"]
-        user_matrix = row["user_matrix"]
-        train_movie_ids = row["train_movie_ids"]
-        train_ratings = row["train_ratings"]
-        test_movie_id = row["test_movie_ids"]
-        test_rating = row["test_ratings"]
+                user_rated_movies = user_ratings_df[user_ratings_df[user_column] == user_id] \
+                    .sort_values(rating_column, ascending=False)[[item_column]] \
+                    .values.squeeze()
 
-        recommendations = method.get_recommendations(user_matrix, train_movie_ids, train_ratings, top_n=100)
+                user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == user_id] \
+                    .sort_values(rating_column, ascending=False)[[rating_column]] \
+                    .values.squeeze()
 
-        user_rated_movies = user_ratings_df[user_ratings_df[user_column] == user_id] \
-            .sort_values(rating_column, ascending=False)[[item_column]] \
-            .values.squeeze()
+                recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in
+                                      recommendations]
 
-        user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == user_id] \
-            .sort_values(rating_column, ascending=False)[[rating_column]] \
-            .values.squeeze()
+                print(f"Test movie: {movie_id_features_dict[test_movie_id]}, rating: {test_rating}")
 
-        recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
+                print("Rated movies: ")
+                for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
+                    print(movie_id_features_dict[movie_id], f"rating: {rating}")
 
-        print(f"Test movie: {movie_id_features_dict[test_movie_id]}, rating: {test_rating}")
+                print("Recommended movies: ")
+                for movie_id in recommended_movies:
+                    print(movie_id_features_dict[movie_id])
 
-        print("Rated movies: ")
-        for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
-            print(movie_id_features_dict[movie_id], f"rating: {rating}")
+                hits = hit_rate(gt_items_idx=[test_movie_id], predicted_items_idx=recommendations)
 
-        print("Recommended movies: ")
-        for movie_id in recommended_movies:
-            print(movie_id_features_dict[movie_id])
+                all_hits += hits
+                iterations += 1
 
-        hits = hit_rate(gt_items_idx=[test_movie_id], predicted_items_idx=recommendations)
+        if all_hits > 0:
+            print(f"{method.__class__}: {all_hits}/{iterations}")
+            print(f"Percentage-wise: {all_hits / iterations}")
+        print(f"Total hits: {all_hits}")
+        print(f"Total iterations: {iterations}")
 
-        all_hits += hits
-        iterations += 1
+        results.append([n_centroids, None, all_hits / 10.0])
 
-    if all_hits > 0:
-        print(f"{method.__class__}: {all_hits}/{iterations}")
-    print(f"Total hits: {all_hits}")
-    print(f"Total iterations: {iterations}")
+        for row in results:
+            print(row)
 
 
 if __name__ == '__main__':
     # kmaens_user_based()
-    kmeans_item_based()
+    kmeans_item_based()