
Deep learning code refactored

Witold, 8 months ago
commit 12f5d512a7

+ 122
- 87
collaborative_filtering/neural_collaborative_filtering.py

@@ -4,51 +4,51 @@ from tensorflow.python.keras.callbacks import TensorBoard, ModelCheckpoint, Earl
 from tqdm import tqdm
 from settings import CHECKPOINTS_DIRECTORY
 from interfaces import RecommendationMethod
-from collaborative_filtering.utils import create_id_vocab, create_user_items_rating_matrix
+from collaborative_filtering.utils import create_id_vocab, create_user_items_rating_matrix_w_indexer
 
 MLP_DENSE_LAYERS_SIZE = [32, 16, 4]
 
 
 class NeuralCollaborativeFiltering(RecommendationMethod):
-    def __init__(self, users_ids, items_ids):
-        self.id_to_user_id_vocab, self.user_id_to_id_vocab = create_id_vocab(users_ids)
-        self.id_to_item_id_vocab, self.item_id_to_id_vocab = create_id_vocab(items_ids)
+    def __init__(self, indexer, n_factors, model_name):
         self.model = None
+        self.name = model_name
+        self.indexer = indexer
+        self.n_factors = n_factors
         self.user_ratings = None
         self.train_loss = tf.keras.metrics.Mean(name='train_loss')
         self.test_loss = tf.keras.metrics.Mean(name='test_loss')
 
     @property
     def num_users(self):
-        return len(self.id_to_user_id_vocab.keys())
+        return len(self.indexer.internal_id_to_user_id_dict.keys())
 
     @property
     def num_items(self):
-        return len(self.id_to_item_id_vocab.keys())
+        return len(self.indexer.internal_id_to_movie_id_dict.keys())
 
-    def load_model(self, filepath, train_user_item_ratings, n_factors):
+    def load_model(self, filepath, train_user_item_ratings):
         user_inputs = tf.keras.Input(shape=(1,))
         item_inputs = tf.keras.Input(shape=(1,))
 
-        self.user_ratings = create_user_items_rating_matrix(train_user_item_ratings, self.user_id_to_id_vocab,
-                                                            self.item_id_to_id_vocab)
+        self.user_ratings = create_user_items_rating_matrix_w_indexer(train_user_item_ratings, self.indexer)
 
-        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                       name="gmf_user_embedding")(user_inputs)
-        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_user_embedding)
+        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_user_embedding)
 
-        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                       name="gmf_item_embedding")(item_inputs)
-        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_item_embedding)
+        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_item_embedding)
 
-        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                        name="mlp_user_embedding")(user_inputs)
-        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_user_embedding)
+        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_user_embedding)
 
-        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                        name="mlp_item_embedding")(item_inputs)
 
-        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_item_embedding)
+        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_item_embedding)
 
         gmf_dot_product = tf.keras.layers.Dot(axes=1)([mf_user_embedding, mf_item_embedding])
         gmf_dot_product = tf.keras.layers.Flatten()(gmf_dot_product)
@@ -70,25 +70,25 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
         self.model = tf.keras.models.Model([user_inputs, item_inputs], final_dense)
         self.model.load_weights(filepath=filepath)
 
-    def _build_simple_dot_model(self, n_factors):
+    def _build_simple_dot_model(self):
         users_embedding = tf.keras.Sequential()
-        users_embedding.add(tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1))
-        users_embedding.add(tf.keras.layers.Reshape(target_shape=(n_factors,)))
+        users_embedding.add(tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1))
+        users_embedding.add(tf.keras.layers.Reshape(target_shape=(self.n_factors,)))
         items_embedding = tf.keras.models.Sequential()
-        items_embedding.add(tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1))
-        items_embedding.add(tf.keras.layers.Reshape(target_shape=(n_factors,)))
+        items_embedding.add(tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1))
+        items_embedding.add(tf.keras.layers.Reshape(target_shape=(self.n_factors,)))
         dot = tf.keras.layers.Dot(axes=1)([users_embedding.output, items_embedding.output])
         model = tf.keras.models.Model([users_embedding.input, items_embedding.input], dot)
 
         return model
 
-    def _build_concat_model(self, n_factors):
+    def _build_concat_model(self):
         users_embedding = tf.keras.Sequential()
-        users_embedding.add(tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1))
-        users_embedding.add(tf.keras.layers.Reshape(target_shape=(n_factors,)))
+        users_embedding.add(tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1))
+        users_embedding.add(tf.keras.layers.Reshape(target_shape=(self.n_factors,)))
         items_embedding = tf.keras.models.Sequential()
-        items_embedding.add(tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1))
-        items_embedding.add(tf.keras.layers.Reshape(target_shape=(n_factors,)))
+        items_embedding.add(tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1))
+        items_embedding.add(tf.keras.layers.Reshape(target_shape=(self.n_factors,)))
         concatenated = tf.keras.layers.Concatenate(axis=1)([users_embedding.output, items_embedding.output])
         flattened = tf.keras.layers.Flatten()(concatenated)
         dense_1 = tf.keras.layers.Dense(64, activation=tf.nn.relu, trainable=True)(flattened)
@@ -99,17 +99,17 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
 
         return model
 
-    def _build_gmf_model(self, n_factors):
+    def _build_gmf_model(self):
         user_inputs = tf.keras.Input(shape=(1,))
         item_inputs = tf.keras.Input(shape=(1,))
 
-        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                       name="gmf_user_embedding")(user_inputs)
-        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_user_embedding)
+        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_user_embedding)
 
-        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                       name="gmf_item_embedding")(item_inputs)
-        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_item_embedding)
+        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_item_embedding)
 
         gmf_dot_product = tf.keras.layers.Dot(axes=1)([mf_user_embedding, mf_item_embedding])
         gmf_dot_product = tf.keras.layers.Flatten()(gmf_dot_product)
@@ -121,17 +121,17 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
 
         return model
 
-    def _build_mlp_model(self, n_factors):
+    def _build_mlp_model(self):
         user_inputs = tf.keras.Input(shape=(1,))
         item_inputs = tf.keras.Input(shape=(1,))
 
-        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                        name="mlp_user_embedding")(user_inputs)
-        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_user_embedding)
+        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_user_embedding)
 
-        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                        name="mlp_item_embedding")(item_inputs)
-        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_item_embedding)
+        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_item_embedding)
 
         mlp_concatenation = tf.keras.layers.Concatenate(axis=1)(
             [mlp_user_embedding, mlp_item_embedding])
@@ -149,33 +149,33 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
 
         return model
 
-    def _build_neu_mf_model(self, n_factors, pretrained_gmf_model, pretrained_mlp_model):
+    def _build_neu_mf_model(self, pretrained_gmf_model, pretrained_mlp_model):
         user_inputs = tf.keras.Input(shape=(1,))
         item_inputs = tf.keras.Input(shape=(1,))
 
-        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mf_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                       name="gmf_user_embedding",
                                                       weights=pretrained_gmf_model.get_layer(
                                                           "gmf_user_embedding").get_weights())(user_inputs)
-        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_user_embedding)
+        mf_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_user_embedding)
 
-        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mf_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                       name="gmf_item_embedding",
                                                       weights=pretrained_gmf_model.get_layer(
                                                           "gmf_item_embedding").get_weights())(item_inputs)
-        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mf_item_embedding)
+        mf_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mf_item_embedding)
 
-        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, n_factors, input_length=1,
+        mlp_user_embedding = tf.keras.layers.Embedding(self.num_users, self.n_factors, input_length=1,
                                                        name="mlp_user_embedding",
                                                        weights=pretrained_mlp_model.get_layer(
                                                            "mlp_user_embedding").get_weights())(user_inputs)
-        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_user_embedding)
+        mlp_user_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_user_embedding)
 
-        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, n_factors, input_length=1,
+        mlp_item_embedding = tf.keras.layers.Embedding(self.num_items, self.n_factors, input_length=1,
                                                        name="mlp_item_embedding",
                                                        weights=pretrained_mlp_model.get_layer(
                                                            "mlp_item_embedding").get_weights())(item_inputs)
-        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(n_factors,))(mlp_item_embedding)
+        mlp_item_embedding = tf.keras.layers.Reshape(target_shape=(self.n_factors,))(mlp_item_embedding)
 
         gmf_dot_product = tf.keras.layers.Dot(axes=1)([mf_user_embedding, mf_item_embedding])
         gmf_dot_product = tf.keras.layers.Flatten()(gmf_dot_product)
@@ -234,16 +234,16 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
         items_ids = np.arange(0, self.user_ratings.shape[1])
 
         for u, i, r in data:
-            non_rated_movies = self.user_ratings[self.user_id_to_id_vocab[u], :] == 0
+            non_rated_movies = self.user_ratings[self.indexer.get_user_internal_id(u), :] == 0
             ratings_to_sample = np.random.choice(items_ids[non_rated_movies], count_for_one_user)
             for s in ratings_to_sample:
-                new_data.append((u, self.id_to_item_id_vocab[s], 0))
+                new_data.append((u, self.indexer.get_movie_id(s), 0))
 
         return new_data
 
     def _generate_dataset(self, data, batch_size):
-        users_ids = np.expand_dims(np.array([self.user_id_to_id_vocab[r[0]] for r in data]), axis=1)
-        items_ids = np.expand_dims(np.array([self.item_id_to_id_vocab[r[1]] for r in data]), axis=1)
+        users_ids = np.expand_dims(np.array([self.indexer.get_user_internal_id(r[0]) for r in data]), axis=1)
+        items_ids = np.expand_dims(np.array([self.indexer.get_movie_internal_id(r[1]) for r in data]), axis=1)
         ratings_ids = np.expand_dims(np.array([r[2] for r in data]), axis=1)
         return tf.data.Dataset.from_tensor_slices((users_ids, items_ids, ratings_ids)).shuffle(10000, seed=56).batch(
             batch_size)
@@ -270,91 +270,126 @@ class NeuralCollaborativeFiltering(RecommendationMethod):
 
         self.train_loss(loss)
 
-    def _pretrain_models(self, train_user_item_ratings, epochs=20, batch_size=100,
-                         n_factors=None):
+    def _pretrain_models(self, train_user_item_ratings, epochs=20, batch_size=100, error_delta=-1.0):
 
         print("Starting pretraining phase...")
 
-        gmf_model = self._build_gmf_model(n_factors=n_factors)
-        mlp_model = self._build_mlp_model(n_factors=n_factors)
+        gmf_model = self._build_gmf_model()
+        mlp_model = self._build_mlp_model()
 
         train_ds = self._generate_dataset(train_user_item_ratings, batch_size)
 
         loss_object = tf.keras.losses.MeanSquaredError()
-        optimizer = tf.keras.optimizers.Adam()
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
 
-        for e in range(epochs):
+        done_epochs = 0
+        prev_loss = float('inf')
+        delta = float('inf')
+
+        while done_epochs < epochs and (error_delta == -1.0 or delta > error_delta):
             for train_data in tqdm(train_ds, total=len(train_user_item_ratings) // batch_size):
                 user_id, item_id, rating = train_data
                 self.gmf_pretrain_step(gmf_model, optimizer, loss_object, user_id, item_id, rating)
 
             template = 'Epoch {}, Loss: {}'
+            delta = prev_loss - self.train_loss.result().numpy()
+            prev_loss = self.train_loss.result().numpy()
 
-            print(template.format(e + 1,
+            print(template.format(done_epochs + 1,
                                   self.train_loss.result().numpy()))
+            done_epochs += 1
 
         self.train_loss.reset_states()
 
         loss_object = tf.keras.losses.MeanSquaredError()
-        optimizer = tf.keras.optimizers.Adam()
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
+
+        done_epochs = 0
+        prev_loss = float('inf')
+        delta = float('inf')
 
-        for e in range(epochs):
+        while done_epochs < epochs and (error_delta == -1.0 or delta > error_delta):
             for train_data in tqdm(train_ds, total=len(train_user_item_ratings) // batch_size):
                 user_id, item_id, rating = train_data
                 self.mlp_pretrain_step(mlp_model, optimizer, loss_object, user_id, item_id, rating)
 
             template = 'Epoch {}, Loss: {}'
+            delta = prev_loss - self.train_loss.result().numpy()
+            prev_loss = self.train_loss.result().numpy()
 
-            print(template.format(e + 1,
+            print(template.format(done_epochs + 1,
                                   self.train_loss.result().numpy()))
+            done_epochs += 1
 
         self.train_loss.reset_states()
 
         return gmf_model, mlp_model
 
-    def fit(self, train_user_item_ratings, test_user_item_ratings, epochs=10, batch_size=100, n_factors=16):
-        self.user_ratings = create_user_items_rating_matrix(train_user_item_ratings, self.user_id_to_id_vocab,
-                                                            self.item_id_to_id_vocab)
+    def fit(self, train_user_items_ratings, test_user_items_ratings=None, batch_size=20, epochs=20, early_stopping=-1):
+        self.user_ratings = create_user_items_rating_matrix_w_indexer(train_user_items_ratings, self.indexer)
+
+        train_user_items_ratings.extend(self._generate_negative_samples(train_user_items_ratings, 3))
+        gmf_model, mlp_model = self._pretrain_models(train_user_items_ratings, epochs=30, error_delta=0.005)
 
-        train_user_item_ratings.extend(self._generate_negative_samples(train_user_item_ratings, 5))
-        gmf_model, mlp_model = self._pretrain_models(train_user_item_ratings, epochs=10, n_factors=n_factors)
+        self.model = self._build_neu_mf_model(pretrained_gmf_model=gmf_model, pretrained_mlp_model=mlp_model)
 
-        self.model = self._build_neu_mf_model(n_factors, pretrained_gmf_model=gmf_model, pretrained_mlp_model=mlp_model)
+        train_ds = self._generate_dataset(train_user_items_ratings, batch_size)
+        eval_test = test_user_items_ratings is not None
+
+        if eval_test:
+            test_ds = self._generate_dataset(test_user_items_ratings, batch_size)
 
         loss_object = tf.keras.losses.MeanSquaredError()
-        optimizer = tf.keras.optimizers.Adam()
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
 
-        train_ds = self._generate_dataset(train_user_item_ratings, batch_size)
-        test_ds = self._generate_dataset(test_user_item_ratings, batch_size)
+        prev_test_loss = float("inf")
+        patience = early_stopping
+        done_epochs = 0
 
-        for e in range(epochs):
-            for train_data in tqdm(train_ds, total=len(train_user_item_ratings) // batch_size):
-                user_id, item_id, rating = train_data
-                self.train_step(optimizer, loss_object, user_id, item_id, rating)
+        while done_epochs < epochs and patience != 0:
+            for train_data in tqdm(train_ds, total=len(train_user_items_ratings) // batch_size):
+                users, items, ratings = train_data
+                self.train_step(optimizer, loss_object, users, items, ratings)
+
+            if eval_test:
+                for test_data in tqdm(test_ds, total=len(test_user_items_ratings) // batch_size):
+                    users, items, ratings = test_data
+                    self.test_step(loss_object, users, items, ratings)
+
+                template = 'Epoch {}, Loss: {}, Test Loss: {}'
+                print(template.format(done_epochs + 1,
+                                      self.train_loss.result().numpy(),
+                                      self.test_loss.result().numpy()))
+
+                if self.test_loss.result().numpy() < prev_test_loss:
+                    prev_test_loss = self.test_loss.result().numpy()
+                    patience = early_stopping
+                else:
+                    patience -= 1
 
-            for test_data in tqdm(test_ds, total=len(test_user_item_ratings) // batch_size):
-                user_id, item_id, rating = test_data
-                self.test_step(loss_object, user_id, item_id, rating)
+            else:
+                template = 'Epoch {}, Loss: {}'
+                print(template.format(done_epochs + 1,
+                                      self.train_loss.result().numpy()))
 
-            template = 'Epoch {}, Loss: {}, Test Loss: {}'
+            done_epochs += 1
 
-            print(template.format(e + 1,
-                                  self.train_loss.result().numpy(),
-                                  self.test_loss.result().numpy()))
+        if patience == 0:
+            print("Early stopped")
 
-        self.model.save(f'{CHECKPOINTS_DIRECTORY}/model.h5')
+        self.model.save(f'{CHECKPOINTS_DIRECTORY}/{self.name}.h5')
 
     def predict(self, user_id, item_id):
-        user_input = np.array([self.user_id_to_id_vocab[user_id]])
-        item_input = np.array([self.item_id_to_id_vocab[item_id]])
+        user_input = np.array([self.indexer.get_user_internal_id(user_id)])
+        item_input = np.array([self.indexer.get_movie_internal_id(item_id)])
         return self.model.predict([user_input, item_input]).squeeze().tolist()
 
     def get_recommendations(self, user_id, k):
-        user_input = np.full((len(self.item_id_to_id_vocab.keys()), 1), fill_value=self.user_id_to_id_vocab[user_id])
-        item_input = np.expand_dims(np.arange(0, len(self.item_id_to_id_vocab.keys())), axis=1)
+        user_input = np.full((self.num_items, 1), fill_value=self.indexer.get_user_internal_id(user_id))
+        item_input = np.expand_dims(np.arange(0, self.num_items), axis=1)
 
-        non_rated_user_movies = self.user_ratings[self.user_id_to_id_vocab[user_id], :] == 0
+        non_rated_user_movies = self.user_ratings[self.indexer.get_user_internal_id(user_id), :] == 0
         recommendations = self.model.predict([user_input, item_input]).squeeze()
         recommendations_idx = np.argsort(recommendations)[::-1]
-        return [self.id_to_item_id_vocab[i] for i in recommendations_idx if
+        return [self.indexer.get_movie_id(i) for i in recommendations_idx if
                 non_rated_user_movies[i]][:k]
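
For reference, a minimal usage sketch of the refactored class. The `ratings_df` frame and the `train_ratings`/`test_ratings` tuple lists are placeholders, and Indexer is the one constructed in the run script further down:

    from utils.id_indexer import Indexer
    from collaborative_filtering.neural_collaborative_filtering import NeuralCollaborativeFiltering

    # placeholder inputs: ratings_df with userId/movieId columns,
    # train/test as lists of (user_id, movie_id, rating) tuples
    indexer = Indexer(user_ids=ratings_df['userId'].unique(),
                      movies_ids=ratings_df['movieId'].unique())
    method = NeuralCollaborativeFiltering(indexer=indexer, n_factors=16, model_name="neucf_16")
    method.fit(train_ratings, test_ratings, batch_size=100, epochs=20, early_stopping=3)
    print(method.get_recommendations(user_id=1, k=10))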

+ 15
- 1
collaborative_filtering/utils.py

@@ -24,4 +24,18 @@ def create_user_items_rating_matrix(data, user_id_to_id_mapping, item_id_to_id_m
 
         user_ratings[user_index, item_index] = rating
 
-    return user_ratings
+    return user_ratings
+
+
+def create_user_items_rating_matrix_w_indexer(data, indexer):
+    user_ratings = np.zeros(
+        (len(indexer.user_id_to_internal_id_dict.values()), len(indexer.movie_id_to_internal_id_dict.values())))
+
+    for i in tqdm(range(len(data))):
+        user, item, rating = data[i]
+        user_index = indexer.get_user_internal_id(user)
+        item_index = indexer.get_movie_internal_id(item)
+
+        user_ratings[user_index, item_index] = rating
+
+    return user_ratings
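
The matrix above is dense zeros over users x movies; for a larger catalog a sparse variant would cut memory substantially. A sketch of that alternative (not part of this commit; callers that slice a row and compare against 0, as the NCF class does, would need .toarray() on the row first):

    import numpy as np
    from scipy.sparse import dok_matrix

    def create_sparse_rating_matrix_w_indexer(data, indexer):
        shape = (len(indexer.user_id_to_internal_id_dict),
                 len(indexer.movie_id_to_internal_id_dict))
        user_ratings = dok_matrix(shape, dtype=np.float32)  # DOK: cheap incremental writes
        for user, item, rating in data:
            user_ratings[indexer.get_user_internal_id(user),
                         indexer.get_movie_internal_id(item)] = rating
        return user_ratings.tocsr()  # CSR: fast row slicing afterwards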

File diff suppressed because it is too large
+ 2831
- 0
data/generated/movies_data_df.csv

+ 61
- 24
deep_learning/movie_features_deep_learning_method.py

@@ -3,29 +3,40 @@ import tensorflow as tf
 import numpy as np
 from tqdm import tqdm
 from settings import PATH_TO_PROJECT
+from collaborative_filtering.utils import create_user_items_rating_matrix_w_indexer
 import os
 
 
 class MovieFeaturesDeepLearningMethod(RecommendationMethod):
-    def __init__(self):
+    def __init__(self, indexer, movies_features, user_features, model_name):
+        self.indexer = indexer
+        self.name = model_name
+        self.movies_features = movies_features
+        self.user_features = user_features
         self.model = None
         self.user_ratings = None
         self.train_loss = tf.keras.metrics.Mean(name='train_loss')
         self.test_loss = tf.keras.metrics.Mean(name='test_loss')
 
-    def fit(self, train_user_items_ratings, test_user_items_ratings=None, batch_size=20, epochs=20):
-        self.model = self._build_model(train_user_items_ratings[0][0].shape[0])
+    def fit(self, train_user_items_ratings, test_user_items_ratings=None, batch_size=20, epochs=20, early_stopping=-1):
+        self.model = self._build_model()
         eval_test = test_user_items_ratings is not None
 
+        #train_user_items_ratings.extend(self._generate_negative_samples(train_user_items_ratings, 1))
+
         train_ds = self._generate_dataset(train_user_items_ratings, batch_size)
 
         if eval_test:
             test_ds = self._generate_dataset(test_user_items_ratings, batch_size)
 
         loss_object = tf.keras.losses.MeanSquaredError()
-        optimizer = tf.keras.optimizers.Adam()
+        optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001)
+
+        prev_test_loss = float("inf")
+        patience = early_stopping
+        done_epochs = 0
 
-        for e in range(epochs):
+        while done_epochs < epochs and patience != 0:
             for train_data in tqdm(train_ds, total=len(train_user_items_ratings) // batch_size):
                 users, items, ratings = train_data
                 self.train_step(optimizer, loss_object, users, items, ratings)
@@ -36,33 +47,58 @@ class MovieFeaturesDeepLearningMethod(RecommendationMethod):
                     self.test_step(loss_object, users, items, ratings)
 
                 template = 'Epoch {}, Loss: {}, Test Loss: {}'
-                print(template.format(e + 1,
+                print(template.format(done_epochs + 1,
                                       self.train_loss.result().numpy(),
                                       self.test_loss.result().numpy()))
+
+                if self.test_loss.result().numpy() < prev_test_loss:
+                    prev_test_loss = self.test_loss.result().numpy()
+                    patience = early_stopping
+                else:
+                    patience -= 1
+
             else:
                 template = 'Epoch {}, Loss: {}'
-                print(template.format(e + 1,
+                print(template.format(done_epochs + 1,
                                       self.train_loss.result().numpy()))
 
-        self.model.save(os.path.join(PATH_TO_PROJECT, 'deep_learning', 'saved_models', 'model.h5'))
+            done_epochs += 1
+
+        if patience == 0:
+            print("Early stopped")
 
-    def load_model(self, filepath, input_size):
-        self.model = self._build_model(input_size=input_size)
+        self.model.save(os.path.join(PATH_TO_PROJECT, 'deep_learning', 'saved_models', f'{self.name}.h5'))
+
+    def load_model(self, filepath):
+        self.model = self._build_model()
         self.model.load_weights(filepath=filepath)
 
-    def get_recommendations(self, user, movies, k=10):
+    def get_recommendations(self, user, movies, k=None):
         user_input = np.repeat(np.expand_dims(user, axis=0), movies.shape[0], axis=0)
         movies_input = movies
 
         recommendations = self.model.predict([user_input, movies_input]).squeeze()
         recommendations_idx = np.argsort(recommendations)[::-1]
-        return recommendations_idx[:k]
+        return recommendations_idx[:k] if k is not None else recommendations_idx
+
+    def _generate_negative_samples(self, data, count_for_one_user):
+        new_data = []
+        items_ids = np.arange(0, len(self.indexer.internal_id_to_movie_id_dict.keys()))
+        user_ratings = create_user_items_rating_matrix_w_indexer(data, self.indexer)
+
+        for u, i, r in data:
+            non_rated_movies = user_ratings[self.indexer.get_user_internal_id(u), :] == 0
+            ratings_to_sample = np.random.choice(items_ids[non_rated_movies], count_for_one_user)
+            for s in ratings_to_sample:
+                new_data.append((u, self.indexer.get_movie_id(s), 0))
+
+        return new_data
 
     def _generate_dataset(self, data, batch_size):
-        users_ids = np.array([r[0] for r in data])
-        items_ids = np.array([r[1] for r in data])
-        ratings_ids = np.array([r[2] for r in data])
-        return tf.data.Dataset.from_tensor_slices((users_ids, items_ids, ratings_ids)).shuffle(10000, seed=56).batch(
+        users = np.array([self.user_features[self.indexer.get_user_internal_id(r[0])] for r in data])
+        items = np.array([self.movies_features[self.indexer.get_movie_internal_id(r[1])] for r in data])
+        ratings = np.array([r[2] for r in data])
+        return tf.data.Dataset.from_tensor_slices((users, items, ratings)).shuffle(10000, seed=56).batch(
             batch_size)
 
     @tf.function
@@ -83,20 +119,21 @@ class MovieFeaturesDeepLearningMethod(RecommendationMethod):
 
         self.train_loss(loss)
 
-    def _build_model(self, input_size):
-        user_inputs = tf.keras.Input(shape=(input_size,))
-        item_inputs = tf.keras.Input(shape=(input_size,))
+    def _build_model(self):
+        user_inputs = tf.keras.Input(shape=(self.movies_features.shape[1],))
+        item_inputs = tf.keras.Input(shape=(self.movies_features.shape[1],))
 
-        user_hidden = tf.keras.layers.Dense(512, activation=tf.nn.relu)(user_inputs)
+        user_hidden = tf.keras.layers.Dense(256, activation=tf.nn.tanh)(user_inputs)
 
-        item_hidden = tf.keras.layers.Dense(512, activation=tf.nn.relu)(item_inputs)
+        item_hidden = tf.keras.layers.Dense(256, activation=tf.nn.tanh)(item_inputs)
 
         concatenated = tf.keras.layers.Concatenate(axis=1)([item_hidden, user_hidden])
 
         flattened = tf.keras.layers.Flatten()(concatenated)
-        dense_1 = tf.keras.layers.Dense(64, activation=tf.nn.relu, trainable=True)(flattened)
-        dense_2 = tf.keras.layers.Dense(32, activation=tf.nn.relu, trainable=True)(dense_1)
-        dense_3 = tf.keras.layers.Dense(1, activation=tf.nn.relu, trainable=True)(dense_2)
+        dense_1 = tf.keras.layers.Dense(128, activation=tf.nn.tanh, trainable=True)(flattened)
+        dense_2 = tf.keras.layers.Dense(16, activation=tf.nn.tanh, trainable=True)(dense_1)
+        dropout = tf.keras.layers.Dropout(0.5)(dense_2)
+        dense_3 = tf.keras.layers.Dense(1, activation=tf.nn.relu, trainable=True)(dropout)
 
         model = tf.keras.models.Model([item_inputs, user_inputs], dense_3)
         return model
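
Both Input layers in _build_model are sized by movies_features.shape[1]; that only holds because the user vectors are rating-weighted sums of movie vectors (see get_weighted_movies_user_features below). A caller can make that assumption explicit, e.g. with the objects built in the run script:

    # user features inherit the movie-feature width, so both inputs line up
    assert user_features.shape[1] == movies_features.shape[1]
    method = MovieFeaturesDeepLearningMethod(indexer=indexer, movies_features=movies_features,
                                             user_features=user_features, model_name="movies_features")
    method.fit(train_data, test_data, batch_size=50, epochs=50, early_stopping=5)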

+ 39
- 61
deep_learning/scripts/run_deep_learning_method.py

@@ -1,100 +1,75 @@
 import pandas as pd
-import numpy as np
 import os
+import deep_learning.utils as deep_learning_utils
+import pickle as plk
 
 from settings import PATH_TO_DATA, PATH_TO_PROJECT
-from sklearn.model_selection import train_test_split
-from sklearn.feature_extraction.text import CountVectorizer
-from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from utils.id_indexer import Indexer
 from utils.evaluation.test_train_split import user_leave_on_out
 
 from deep_learning.movie_features_deep_learning_method import MovieFeaturesDeepLearningMethod
+from collaborative_filtering.neural_collaborative_filtering import NeuralCollaborativeFiltering
 
 user_column = 'userId'
 item_column = 'movieId'
 rating_column = 'rating'
 
 
-def get_movie_id_to_feature_mapping(movies_metadata_df):
-    mapping = {}
-    for i, row in movies_metadata_df.iterrows():
-        features = {
-            "title": row["title"],
-            "id": row["id"],
-        }
+def get_movies_model_features(all_users_ratings_df, train_ratings_df, movies_feature_data_path, saved_movies_features):
+    movies_data_df, movies_features = deep_learning_utils.get_movies_features(movies_feature_data_path,
+                                                                              saved_movies_features)
+    movies_data_df.to_csv(f"{PATH_TO_DATA}/generated/movies_data_df.csv")
+    indexer = Indexer(user_ids=all_users_ratings_df[user_column].unique(), movies_ids=movies_data_df['id'].unique())
 
-        mapping[int(row['id'])] = features
+    user_features = deep_learning_utils.get_weighted_movies_user_features(train_ratings_df, indexer, movies_features)
 
-    return mapping
+    return movies_features, user_features, indexer
 
 
-def get_weighted_movies_user_features(user_ratings_df, indexer, movies_features):
-    user_features = []
-
-    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
-        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
-        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
-        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
-        user_rated_movies_features = movies_features[user_rated_movies_id, :]
-        user_movies_features = np.sum(np.multiply(user_ratings, user_rated_movies_features), axis=0)
-        user_features.append(user_movies_features)
-
-    return np.array(user_features)
-
-
-def map_df_to_model_train_input(data_df, movies_features, user_features, indexer):
-    data = data_df[[user_column, item_column, rating_column]].values
-    return [(user_features[indexer.get_user_internal_id(r[0])],
-             movies_features[indexer.get_movie_internal_id(r[1])],
-             r[2]) for r in data]
-
-
-def map_df_to_test_input(data_df, movies_features, user_features, indexer):
+def map_df_to_model_input(data_df):
     data = data_df[[user_column, item_column, rating_column]].values
-    return [(indexer.get_user_internal_id(r[0]),
-             indexer.get_movie_internal_id(r[1]),
+    return [(int(r[0]),
+             int(r[1]),
              r[2]) for r in data]
 
 
 def main():
-    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
-    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
-    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+    dataset_path = os.path.join(os.sep, PATH_TO_DATA, "raw/the-movies-dataset")
+    path_to_saved_features = os.path.join(os.sep, PATH_TO_DATA, "generated/movies_data_df.csv")
+    user_ratings_df = pd.read_csv(os.path.join(dataset_path, "ratings_small.csv"))
+    movies_metadata = pd.read_csv(os.path.join(dataset_path, "movies_metadata_clean.csv"))
+    movie_id_features_dict = deep_learning_utils.get_movie_id_to_feature_mapping(movies_metadata)
 
     user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
 
-    # train_ratings_df, test_ratings_df = train_test_split(user_ratings_df, train_size=0.8, shuffle=True,
-    #                                                     random_state=123)
-
     train_ratings_df, test_ratings_df = \
         list(user_leave_on_out(user_ratings_df, timestamp_column='timestamp', make_user_folds=False))[0]
 
-    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
-    features_extractor = FeaturesExtractor(dataset_path)
-    movies_data = features_extractor.run()
+    movies_features, user_features, indexer = get_movies_model_features(user_ratings_df, train_ratings_df, dataset_path,
+                                                                        path_to_saved_features)
 
-    cv = CountVectorizer(min_df=10)
-    movies_features = cv.fit_transform(movies_data['keywords']).toarray().astype(float)
-    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
-    user_features = get_weighted_movies_user_features(train_ratings_df, indexer, movies_features)
+    train_data = map_df_to_model_input(train_ratings_df)
+    test_data = map_df_to_model_input(test_ratings_df)
 
-    train_data = map_df_to_model_train_input(train_ratings_df, movies_features, user_features, indexer)
-    test_data = map_df_to_test_input(test_ratings_df, movies_features, user_features, indexer)
+    #method = NeuralCollaborativeFiltering(indexer=indexer, n_factors=128, model_name="neucf_128_wo_threshold")
 
-    method = MovieFeaturesDeepLearningMethod()
-    # method.fit(train_data, test_data, epochs=20)
-    method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", "model.h5"),
-                      input_size=movies_features.shape[1])
+    method = MovieFeaturesDeepLearningMethod(indexer=indexer, movies_features=movies_features,
+                                             user_features=user_features, model_name="movies_features")
 
-    for user, movie, rating in test_data[:6]:
-        recommendations = method.get_recommendations(user_features[user, :], movies_features, k=10)
+    #method.fit(train_data, test_data, batch_size=50, epochs=50, early_stopping=-1)
 
-        user_rated_movies = user_ratings_df[user_ratings_df[user_column] == indexer.get_user_id(user)] \
-            .sort_values(rating_column, ascending=False)[[item_column]]\
+    method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", "movies_features.h5"))
+
+    for user, movie, rating in test_data[:10]:
+        recommendations = method.get_recommendations(user_features[indexer.get_user_internal_id(user), :],
+                                                     movies_features)
+
+        user_rated_movies = train_ratings_df[train_ratings_df[user_column] == user] \
+            .sort_values(rating_column, ascending=False)[[item_column]] \
             .values.squeeze()
 
-        recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
+        recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations if
+                              indexer.get_movie_id(movie_internal_id) not in user_rated_movies][:30]
 
         print("Rated movies: ")
         for movie_id in user_rated_movies:
@@ -104,6 +79,9 @@ def main():
         for movie_id in recommended_movies:
             print(movie_id_features_dict[movie_id])
 
+        print("Test movie rating: ")
+        print(movie_id_features_dict[movie], rating)
+
 
 if __name__ == '__main__':
     main()

+ 46
- 0
deep_learning/utils.py

@@ -0,0 +1,46 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from settings import USER_COLUMN, ITEM_COLUMN, RATING_COLUMN
+import numpy as np
+import pandas as pd
+
+
+def get_movies_features(dataset_path, saved_movies_features=None):
+    if saved_movies_features is not None:
+        movies_data = pd.read_csv(saved_movies_features, index_col=0)
+    else:
+        features_extractor = FeaturesExtractor(dataset_path)
+        movies_data = features_extractor.run()
+        movies_data = movies_data.drop_duplicates(subset=['id'])
+
+    cv = CountVectorizer(min_df=20)
+    movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
+
+    return movies_data, movies_features
+
+
+def get_movie_id_to_feature_mapping(movies_metadata_df):
+    mapping = {}
+    for i, row in movies_metadata_df.iterrows():
+        features = {
+            "title": row["title"],
+            "id": row["id"],
+        }
+
+        mapping[int(row['id'])] = features
+
+    return mapping
+
+
+def get_weighted_movies_user_features(user_ratings_df, indexer, movies_features):
+    user_features = []
+
+    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
+        user_ratings = user_ratings_df[user_ratings_df[USER_COLUMN] == user_id][[ITEM_COLUMN, RATING_COLUMN]].values
+        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
+        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
+        user_rated_movies_features = movies_features[user_rated_movies_id, :]
+        user_movies_features = np.sum(np.multiply(user_ratings, user_rated_movies_features), axis=0)
+        user_features.append(user_movies_features)
+
+    return np.array(user_features)
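
The per-user loop above is one matrix product in disguise; an equivalent vectorized sketch, assuming the dense matrix produced by create_user_items_rating_matrix_w_indexer (zeros for unrated movies):

    import numpy as np

    def get_weighted_movies_user_features_vectorized(user_ratings_matrix, movies_features):
        # (n_users, n_movies) @ (n_movies, n_features): each row is the
        # rating-weighted sum of that user's rated movie features
        return (user_ratings_matrix / 5.0) @ movies_features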

+ 20
- 0
settings.py

@@ -1,5 +1,16 @@
 import os
 
+DATASET_NAME = "movie_lens"
+
+
+COLUMN_DICT = None
+
+MOVIE_LENS_COLUMN_DICT = {
+    "item": "movieId",
+    "user": "userId",
+    "rating": "rating"
+}
+
 # to override in user_settings.py
 PATH_TO_DATA = None
 PATH_TO_PROJECT = None
@@ -11,3 +22,12 @@ try:
     from user_settings import *  # silence pyflakes
 except ImportError:
     pass
+
+if DATASET_NAME == "movie_lens":
+    COLUMN_DICT = MOVIE_LENS_COLUMN_DICT
+
+if COLUMN_DICT is not None:
+    USER_COLUMN = COLUMN_DICT['user']
+    ITEM_COLUMN = COLUMN_DICT['item']
+    RATING_COLUMN = COLUMN_DICT['rating']
+
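
A sketch of how a second dataset could plug into the same switch; the Netflix column names here are hypothetical, not part of this commit:

    NETFLIX_COLUMN_DICT = {  # hypothetical second dataset
        "item": "MovieID",
        "user": "CustomerID",
        "rating": "Rating",
    }

    if DATASET_NAME == "netflix":
        COLUMN_DICT = NETFLIX_COLUMN_DICT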