
Merge remote-tracking branch 'origin/master' into cbrRefactor

gabirelik 8 months ago
parent
commit
42c66c6888

+ 1178
- 0
utils/notebooks/.ipynb_checkpoints/movie_lens_data_exploration-checkpoint.ipynb

+ 278
- 0
utils/notebooks/movie_lens_data_exploration.ipynb

+ 9
- 0
.dvc/.gitignore

@@ -0,0 +1,9 @@
+/config.local
+/updater
+/lock
+/updater.lock
+/tmp
+/state-journal
+/state-wal
+/state
+/cache

+ 0
- 0
.dvc/config

+ 5
- 0
.gitignore

@@ -5,6 +5,11 @@
 
 data/raw/the-movies-dataset/
 
+data/
+
 user_settings.py
 
 *.h5
+
+*/.ipynb_checkpoints
+/dataset

+ 36
- 1
README.md

@@ -43,4 +43,39 @@ We implemented and evaluated two deep learning methods:
 | Test set        | NeuCf (hits)       | Neural Content Based (hits)  |
 | :-------------: |:-------------:| :-----:|
 | Newest rating     | 56/671 | 26/671 |
-| Newest positive rating      | 141/671       |   51/671 |
+| Newest positive rating      | 141/671       |   51/671 |
+
+## Kmeans clustering
+
+All kmeans variants are item-based. The kmeans recommendation system represents a user by the features of the movies he or she has watched:
+an n x m matrix, where n is the number of movies rated by the user and m is the length of the feature vector.
+
+Task: recommend top n movies.
+
+**Kmeans_1:**
+
+1. Assign every movie rated by the user to its cluster.
+2. Sample n clusters from the previous step, with a probability distribution defined by the user's ratings.
+3. Sample movies from the chosen clusters, uniformly within each cluster.
+4. Remove duplicate recommendations and movies already watched by the user.
+5. If the number of recommended movies is lower than n, return to step 3 (see the sketch below).
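+
+A minimal sketch of this sampling procedure (illustrative only; the function and argument names are assumptions, not code from this repository):
+
+```python
+import numpy as np
+
+def sample_recommendations(labels, rated_movie_ids, ratings, watched_ids, n, exponential=False):
+    """labels: cluster label of every movie, e.g. KMeans.labels_."""
+    ratings = np.asarray(ratings, dtype=float)
+    user_clusters = labels[rated_movie_ids]        # step 1: clusters of the user's rated movies
+    weights = np.power(2.0, ratings) if exponential else ratings  # Kmeans_2 uses the exponential weights
+    probs = weights / weights.sum()
+    recommended = set()
+    while len(recommended) < n:                    # step 5: repeat until n movies are collected
+        chosen = np.random.choice(user_clusters, size=n, p=probs)  # step 2
+        for cluster in chosen:                     # step 3: uniform sampling within each cluster
+            candidates = np.flatnonzero(labels == cluster)
+            recommended.add(int(np.random.choice(candidates)))
+        recommended -= set(watched_ids)            # step 4: drop movies the user has already watched
+    return list(recommended)[:n]
+```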
+
+**Kmeans_2:**
+
+Same as Kmeans_1, except that when sampling clusters (step 2) each cluster's probability grows exponentially with the user's rating for the corresponding movie.
+
+**Kmeans_3:**
+
+1. Assign every movie rated by the user to its cluster.
+2. Score every cluster with the mean of the user's ratings inside that cluster.
+3. Sort clusters by their scores. If two clusters have the same score, the cluster with more ratings is preferred.
+4. Choose the cluster with the highest score.
+5. Take all movies from that cluster and sort them by popularity.
+6. Remove duplicate recommendations and movies already watched by the user.
+7. If the number of recommended movies is lower than n, return to step 4 and select the next cluster (see the sketch below).
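+
+A corresponding sketch of the cluster-ranking variant (again illustrative; names and signatures are assumptions):
+
+```python
+import numpy as np
+
+def rank_cluster_recommendations(labels, rated_movie_ids, ratings, popularity, watched_ids, n):
+    user_clusters = labels[rated_movie_ids]        # step 1
+    cluster_ratings = {}
+    for cluster, rating in zip(user_clusters, ratings):
+        cluster_ratings.setdefault(cluster, []).append(rating)
+    # steps 2-3: rank clusters by mean rating, ties broken by the number of ratings
+    ranked = sorted(cluster_ratings,
+                    key=lambda c: (np.mean(cluster_ratings[c]), len(cluster_ratings[c])),
+                    reverse=True)
+    recommended = []
+    for cluster in ranked:                         # steps 4 and 7: best cluster first, then the next one
+        members = np.flatnonzero(labels == cluster)
+        members = sorted(members, key=lambda m: popularity[m], reverse=True)  # step 5
+        for movie in members:
+            if movie not in watched_ids and movie not in recommended:        # step 6
+                recommended.append(int(movie))
+            if len(recommended) == n:
+                return recommended
+    return recommended
+```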
+
+#### Results
+| Test set               | Kmeans_1 | Kmeans_2 | Kmeans_3 |
+|------------------------|----------|----------|----------|
+| Newest rating          | 1.4/671  |  2.1/671 | 7/671    |
+| Newest positive rating | 1.7/671  |  1.7/671 | 5/671    |

+ 119
- 0
clustering/kmeans_item_based.py

@@ -0,0 +1,119 @@
+import numpy as np
+import pandas as pd
+from sklearn.cluster import KMeans
+
+from interfaces import RecommendationMethod
+
+
+class KmeansItemBased(RecommendationMethod):
+    def __init__(self, cluster_selection="random", item_selection="random"):
+        """
+        :param cluster_selection: way of selecting clusters for recommendation - used only if item_selection is 'random':
+            'random' - sampling with replacement from the clusters to which the user's movies are assigned;
+                        the probability of each cluster is proportional to the user's rating for the movie
+            'exponential' - sampling with replacement from the clusters to which the user's movies are assigned;
+                        the probability of each cluster is proportional to 2^r, where r is the rating for the movie
+        :param item_selection: way of selecting item from cluster for recommendation:
+            'random' - sampling uniformly
+            'popularity' - take most popular movies from cluster for recommendation
+        """
+        self.cluster_selection = cluster_selection
+        self.item_selection = item_selection
+
+    def fit(self, movies_features, n_clusters=10, popularity=None):
+        """
+
+        :param movies_features: features of all movies
+        :param n_clusters: number of clusters
+        :param popularity: popularity of movies; used only if item_selection = 'popularity', otherwise ignored
+        :return:
+        """
+        self.movies_features = movies_features
+        if popularity is not None:
+            self.popularity = popularity
+        self.kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=0, n_jobs=-1,
+                             n_init=10, max_iter=500).fit(movies_features)
+        if self.item_selection == "popularity":
+            self._sort_by_popularity_for_clusters()
+
+    def get_recommendations(self, user_features, train_movie_ids, user_ratings, top_n):
+        self.recommended = list()
+        self.train_movie_ids = train_movie_ids
+        centroid_indices = self.kmeans.predict(user_features)
+
+        if self.item_selection == "popularity":
+            centroid_indices = self._choose_centroids(centroid_indices, user_ratings)
+            while len(self.recommended) < top_n:
+                self.recommended = list(self.recommended)
+                for centroid_id in centroid_indices:
+                    self.recommended.extend(self._get_most_popular(centroid_id))
+                self.recommended = np.unique(self.recommended)
+                for movie_id in train_movie_ids:
+                    self.recommended = self.recommended[self.recommended != movie_id]
+
+        else:
+            centroid_indices = self._sample_centroids(centroid_indices, user_ratings, top_n)
+
+            while len(self.recommended) < top_n:
+                self.recommended = list(self.recommended)
+                for centroid_id in centroid_indices:
+                    self.recommended.append(self._sample_from_cluster(centroid_id))
+                self.recommended = np.unique(self.recommended)
+                for movie_id in train_movie_ids:
+                    self.recommended = self.recommended[self.recommended != movie_id]
+
+        return self.recommended[:top_n]
+
+    def _sample_centroids(self, centroid_indices, user_ratings, size):
+        if self.cluster_selection == "exponential":
+            logits = np.power(2, 1 + 5 * np.asarray(user_ratings))
+            probs = logits / np.sum(logits)
+            chosen = np.random.choice(centroid_indices, size=size, p=probs)
+            return chosen
+        else:       # random
+            probs = user_ratings / np.sum(user_ratings)
+            chosen = np.random.choice(centroid_indices, size=size, p=probs)
+            return chosen
+
+    def _choose_centroids(self, centroid_indices, user_ratings):
+        score = dict()
+        weight = dict()
+
+        for centroid_id, user_rating in zip(centroid_indices, user_ratings):
+            if centroid_id not in score:
+                score[centroid_id] = user_rating
+                weight[centroid_id] = 1
+            else:
+                score[centroid_id] = score[centroid_id] + user_rating
+                weight[centroid_id] = weight[centroid_id] + 1
+
+        for key, value in score.items():
+            score[key] = value / weight[key]
+
+        centroids = [(key, value, weight[key]) for key, value in score.items()]
+        centroids = sorted(centroids, key=lambda x: (x[1], x[2]), reverse=True)
+        centroids = [x[0] for x in centroids]
+        
+        return centroids
+
+    def _sample_from_cluster(self, centroid_id):
+        indices = np.argwhere(self.kmeans.labels_ == centroid_id).squeeze()
+        return np.random.choice(indices, 1)[0]
+
+    def _get_most_popular(self, centroid_id):
+        return self.best_movies_per_cluster[centroid_id]
+
+    def _sort_by_popularity_for_clusters(self):
+        labels = self.kmeans.labels_
+        cluster_indices = np.unique(labels)
+        self.best_movies_per_cluster = list()
+
+        for cluster_index in cluster_indices:
+            item_indices = np.argwhere(labels == cluster_index).flatten()
+            if len(item_indices) == 0:
+                self.best_movies_per_cluster.append(list())
+                continue
+            cluster_popularities = [self.popularity[i] for i in item_indices]
+            zipped = list(zip(list(item_indices), cluster_popularities))
+            zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
+            self.best_movies_per_cluster.append([x[0] for x in zipped])

+ 29
- 0
clustering/kmeans_user_based.py

@@ -0,0 +1,29 @@
+import numpy as np
+from sklearn.cluster import KMeans
+
+from interfaces import RecommendationMethod
+
+
+class KmeansUserBased(RecommendationMethod):
+    def __init__(self, movies_features):
+        self.movies_features = movies_features
+
+    def fit(self, users_features):
+        self.kmeans = KMeans(n_clusters=10, random_state=0, n_jobs=-1,
+                             n_init=100, max_iter=500).fit(users_features)
+
+    def get_recommendations(self, user_features, top_n):
+        centroid_features = self.kmeans.cluster_centers_[self.kmeans.predict(user_features)[0]]
+        dist = np.sum((self.movies_features - centroid_features) ** 2, axis=1)
+        dist = dist.argsort()
+        return dist[:top_n]
+
+    def predict(self, user_features, movie_features):
+        centroid_features = self.kmeans.cluster_centers_[self.kmeans.predict(user_features.reshape(1, -1))[0]]
+        rating = np.multiply(centroid_features, movie_features)
+        rating[rating == 0] = np.nan
+        rating = np.nanmean(rating) * 5
+        return rating
+
+
+

+ 27
- 0
create_dvc.bat

@@ -0,0 +1,27 @@
+setlocal
+set PYTHONPATH=%CD%
+dvc run^
+ -d prepare_dataset.py^
+ -o dataset^
+ -f prepare_dataset.dvc^
+ python prepare_dataset.py
+dvc run^
+ -d evaluation/scripts/run_experiments.py^
+ -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
+ -o results/run_experiments.csv^
+ -f run_experiments.dvc^
+ python evaluation/scripts/run_experiments.py
+dvc run^
+ -d evaluation/scripts/run_deep_learning_experiments.py^
+ -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
+ -o results/run_deep_learning_experiments.csv^
+ -f run_deep_learning_experiments.dvc^
+ python evaluation/scripts/run_deep_learning_experiments.py
+dvc run^
+ -d evaluation/scripts/run_clustering_experiments.py^
+ -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
+ -o results/run_clustering_experiments.csv^
+ -f run_clustering_experiments.dvc^
+ python evaluation/scripts/run_clustering_experiments.py
+endlocal
+pause

+ 239
- 0
evaluation/scripts/run_clustering_experiments.py

@@ -0,0 +1,239 @@
+import random
+
+import numpy as np
+import pandas as pd
+
+from sklearn.feature_extraction.text import CountVectorizer
+from tqdm import tqdm
+
+from clustering.kmeans_item_based import KmeansItemBased
+from deep_learning.utils import get_movie_id_to_feature_mapping
+from utils.evaluation.metrics import hit_rate
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from utils.id_indexer import Indexer
+from settings import PATH_TO_DATA
+
+user_column = 'userId'
+item_column = 'movieId'
+rating_column = 'rating'
+
+
+# def get_user_features_on_movies_features(user_ratings_df, indexer, movies_features):
+#     user_features = []
+# 
+#     for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
+#         user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
+#         user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
+#         user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
+#         user_rated_movies_features = movies_features[user_rated_movies_id, :]
+#         user_movies_features = np.multiply(user_ratings, user_rated_movies_features)
+#         user_movies_features[user_movies_features == 0] = np.nan
+#         user_movies_features = np.nan_to_num(np.nanmean(user_movies_features, axis=0))
+#         # user_movies_features[user_movies_features == 0] = 0.5
+#         # user_movies_features = np.mean(user_movies_features, axis=0)
+# 
+#         user_features.append(user_movies_features)
+# 
+#     return np.array(user_features)
+# 
+# 
+# def kmeans_user_based():
+#     user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+#     movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+#     movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+#     user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+# 
+#     dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+#     features_extractor = FeaturesExtractor(dataset_path)
+#     movies_data = features_extractor.run()
+#     movies_data = movies_data.drop_duplicates(["id"])
+# 
+#     cv = CountVectorizer(min_df=3)
+#     movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
+#     indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+# 
+#     method = KmeansUserBased(movies_features=movies_features)
+# 
+#     for train_df, test_df in user_leave_on_out(user_ratings_df, timestamp_column="timestamp"):
+#         user_features = get_user_features_on_movies_features(train_df, indexer, movies_features)
+# 
+#         # train_data = map_df_to_model_input(train_df, movies_features, user_features, indexer)
+#         test_data = map_df_to_model_input(test_df, movies_features, user_features, indexer)
+# 
+#         method.fit(user_features)
+# 
+#         for i, (index, row) in enumerate(test_df.iterrows()):
+#             user, movie, rating = test_data[i]
+#             recommendations = method.get_recommendations(user.reshape(1, -1), top_n=10)
+# 
+#             user_rated_movies = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
+#                 .sort_values(rating_column, ascending=False)[[item_column]] \
+#                 .values.squeeze()
+# 
+#             user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == row[user_column]] \
+#                 .sort_values(rating_column, ascending=False)[[rating_column]] \
+#                 .values.squeeze()
+# 
+#             recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
+# 
+#             print(f"Test movie: {movie_id_features_dict[row[item_column]]}, rating: {row[rating_column]}")
+# 
+#             print("Rated movies: ")
+#             for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
+#                 print(movie_id_features_dict[movie_id], f"rating: {rating}")
+# 
+#             print("Recommended movies: ")
+#             for movie_id in recommended_movies:
+#                 print(movie_id_features_dict[movie_id])
+
+
+def create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer, timestamp_column=None,
+                                      rating_threshold=None):
+    users_features = list()
+    train_movies_ids = list()
+    train_ratings = list()
+    test_movies_ids = list()
+    test_ratings = list()
+
+    user_ids = user_ratings_df[user_column].unique()
+    test_indices = []
+
+    for user_id in tqdm(user_ids):
+        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id]
+
+        if rating_threshold is not None:
+            thresholded_user_ratings = user_ratings[user_ratings[rating_column] >= rating_threshold]
+
+            while len(thresholded_user_ratings) < 2:
+                rating_threshold -= 0.5
+                thresholded_user_ratings = user_ratings[user_ratings[rating_column] >= rating_threshold]
+
+            user_ratings = thresholded_user_ratings
+
+        if timestamp_column is not None:
+            user_ratings = user_ratings.sort_values(by=timestamp_column)
+            test_index = len(user_ratings) - 1
+        else:
+            test_index = random.randint(0, len(user_ratings) - 1)
+
+        indices = user_ratings.index.values
+        test_indices.append(indices[test_index])
+
+        users_feature_matrix = list()
+        train_movie_id = list()
+        train_rating = list()
+        for index in indices[:-1]:
+            movie_id = user_ratings_df[item_column][index]
+            train_movie_id.append(movie_id)
+            users_feature_matrix.append(movies_features[indexer.get_movie_internal_id(movie_id)])
+            train_rating.append(user_ratings_df[rating_column][index])
+        users_feature_matrix = np.asarray(users_feature_matrix)
+        users_features.append(users_feature_matrix)
+        train_movies_ids.append(train_movie_id)
+        train_ratings.append(train_rating)
+        test_movies_ids.append(user_ratings_df[item_column][indices[test_index]])
+        test_ratings.append(user_ratings_df[rating_column][indices[test_index]])
+
+    df = pd.DataFrame()
+    df["user_id"] = user_ids
+    df["user_matrix"] = users_features
+    df["train_movie_ids"] = train_movies_ids
+    df["train_ratings"] = train_ratings
+    df["test_movie_ids"] = test_movies_ids
+    df["test_ratings"] = test_ratings
+
+    return df
+
+
+def prepare_popularities(movies_metadata, movies_data):
+    popularities = list()
+    for index, row in movies_data.iterrows():
+        popularities.append(movies_metadata[movies_metadata["id"] == row["id"]]["popularity"].values[0])
+    if len(popularities) != len(movies_data):
+        raise ValueError("Len of popularities must be equal to len of movies_data")
+    return popularities
+
+
+def kmeans_item_based():
+    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+    user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    movies_data = features_extractor.run()
+    movies_data = movies_data.drop_duplicates(["id"])
+
+    cv = CountVectorizer(min_df=3)
+    movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
+    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+
+    results = list()
+    for n_centroids in [5, 10, 20, 50, 100]:
+        method = KmeansItemBased(cluster_selection="exponential",
+                                 item_selection="random")
+        print("Fitting kmeans...")
+        method.fit(movies_features, n_clusters=n_centroids,
+                   popularity=prepare_popularities(movies_metadata, movies_data))
+
+        df_dataset = create_kmeans_item_based_input_df(user_ratings_df, movies_features, indexer,
+                                                       timestamp_column="timestamp",
+                                                       rating_threshold=None)
+
+        print("Testing...")
+        iterations = 0
+        all_hits = 0
+
+        for i in range(10):
+            for index, row in tqdm(df_dataset.iterrows()):
+                user_id = row["user_id"]
+                user_matrix = row["user_matrix"]
+                train_movie_ids = row["train_movie_ids"]
+                train_ratings = row["train_ratings"]
+                test_movie_id = row["test_movie_ids"]
+                test_rating = row["test_ratings"]
+
+                recommendations = method.get_recommendations(user_matrix, train_movie_ids, train_ratings, top_n=10)
+
+                user_rated_movies = user_ratings_df[user_ratings_df[user_column] == user_id] \
+                    .sort_values(rating_column, ascending=False)[[item_column]] \
+                    .values.squeeze()
+
+                user_rated_movies_ratings = user_ratings_df[user_ratings_df[user_column] == user_id] \
+                    .sort_values(rating_column, ascending=False)[[rating_column]] \
+                    .values.squeeze()
+
+                recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in
+                                      recommendations]
+
+                print(f"Test movie: {movie_id_features_dict[test_movie_id]}, rating: {test_rating}")
+
+                print("Rated movies: ")
+                for movie_id, rating in zip(user_rated_movies, user_rated_movies_ratings):
+                    print(movie_id_features_dict[movie_id], f"rating: {rating}")
+
+                print("Recommended movies: ")
+                for movie_id in recommended_movies:
+                    print(movie_id_features_dict[movie_id])
+
+                hits = hit_rate(gt_items_idx=[test_movie_id], predicted_items_idx=recommendations)
+
+                all_hits += hits
+                iterations += 1
+
+        if all_hits > 0:
+            print(f"{method.__class__}: {all_hits}/{iterations}")
+            print(f"Percentage-wise: {all_hits / iterations}")
+        print(f"Total hits: {all_hits}")
+        print(f"Total iterations: {iterations}")
+
+        results.append([n_centroids, None, all_hits / 10.0])
+
+        for row in results:
+            print(row)
+
+
+if __name__ == '__main__':
+    # kmeans_user_based()
+    kmeans_item_based()

+ 11
- 0
prepare_dataset.dvc

@@ -0,0 +1,11 @@
+md5: c122007d14c11c687c0143b0004f77f1
+cmd: python prepare_dataset.py
+deps:
+- md5: 86a6eb0445e40183badfe3fa3171e4f4
+  path: prepare_dataset.py
+outs:
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+  cache: true
+  metric: false
+  persist: false

+ 78
- 0
prepare_dataset.py

@@ -0,0 +1,78 @@
+import os
+import io
+import requests
+import zipfile
+import shutil
+from content_based_recomendation.scripts.movie_lens_content_based_recomendation import filter_ratings
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from settings import PATH_TO_DATA
+
+
+def download_file_from_google_drive(id, destination):
+    URL = "https://docs.google.com/uc?export=download"
+
+    session = requests.Session()
+
+    response = session.get(URL, params={'id': id}, stream=True)
+    token = get_confirm_token(response)
+
+    if token:
+        params = {'id': id, 'confirm': token}
+        response = session.get(URL, params=params, stream=True)
+
+    # save_response_content(response, destination)
+    return response
+
+
+def get_confirm_token(response):
+    for key, value in response.cookies.items():
+        if key.startswith('download_warning'):
+            return value
+
+    return None
+
+
+def save_response_content(response, destination):
+    CHUNK_SIZE = 32768
+
+    with open(destination, "wb") as f:
+        for chunk in response.iter_content(CHUNK_SIZE):
+            if chunk:  # filter out keep-alive new chunks
+                f.write(chunk)
+
+
+def unpack_starts_with(zip_file, zip_skip, save_path):
+    members = [x for x in zip_file.NameToInfo.keys() if x.startswith(zip_skip) and len(x) > len(zip_skip)]
+    for mem in members:
+        path = save_path + mem[len(zip_skip):]
+        if not path.endswith('/'):
+            read_file = zip_file.open(mem)
+            with open(path, 'wb') as write_file:
+                shutil.copyfileobj(read_file, write_file)
+        else:
+            os.makedirs(path, exist_ok=True)
+
+
+def main():
+    eas_path = './dataset/raw/the-movies-dataset/'
+    eas_zip_skip = ''
+    eas_gdrive_id = '1Qx9FAqaIG9PbMRJ6coT_NNA9Bck3-jSZ'
+
+    os.makedirs(eas_path, exist_ok=True)
+    print('Downloading...')
+    r = download_file_from_google_drive(eas_gdrive_id, None)
+    print('Unzip...')
+    z = zipfile.ZipFile(io.BytesIO(r.content))
+    unpack_starts_with(z, eas_zip_skip, eas_path)
+    print('Filtering')
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    data = features_extractor.run()
+    filter_ratings(dataset_path, data)
+    print('Done')
+
+
+if __name__ == '__main__':
+    main()
+
+

+ 3
- 0
results/.gitignore

@@ -0,0 +1,3 @@
+/run_clustering_experiments.csv
+/run_experiments.csv
+/run_deep_learning_experiments.csv

+ 29
- 0
run_clustering_experiments.dvc

@@ -0,0 +1,29 @@
+md5: d09678ee619ad958b731451f67292995
+cmd: python evaluation/scripts/run_clustering_experiments.py
+deps:
+- md5: edb4d8173181f21d5f6573742ce1de1c
+  path: evaluation/scripts/run_clustering_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_clustering_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 29
- 0
run_deep_learning_experiments.dvc

@@ -0,0 +1,29 @@
+md5: 700fd4a92b4e485f4b0f646e0c172791
+cmd: python evaluation/scripts/run_deep_learning_experiments.py
+deps:
+- md5: 9c238a242ae70f5a8633388f1120f99f
+  path: evaluation/scripts/run_deep_learning_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_deep_learning_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 29
- 0
run_experiments.dvc

@@ -0,0 +1,29 @@
+md5: cb943f590084079344ec245edcf7f3df
+cmd: python evaluation/scripts/run_experiments.py
+deps:
+- md5: 497dc67168efab85cef1aef71f8e53ae
+  path: evaluation/scripts/run_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 0
- 0
sequence/scripts/run.py

+ 2
- 0
utils/id_indexer.py

@@ -6,6 +6,8 @@ def _generate_mapping(ids):
         if i not in id_to_data_id_vocab.values():
             id_to_data_id_vocab[id] = i
             id += 1
+        else:
+            print(i)
 
     return id_to_data_id_vocab, {v: k for k, v in id_to_data_id_vocab.items()}