
Added a deep learning context-based model. Some refactoring in the evaluation and experiment code.

Witold 8 months ago
parent
commit
01b18336ef
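
A minimal usage sketch of the MovieFeaturesDeepLearningMethod introduced in this commit (toy random data; feature counts and sizes are placeholders, the real feature pipeline lives in deep_learning/scripts/run_deep_learning_method.py):

import numpy as np
from deep_learning.movie_features_deep_learning_method import MovieFeaturesDeepLearningMethod

n_features = 300                            # user and movie vectors share one feature space
movies = np.random.rand(50, n_features)     # one row of keyword features per movie
users = np.random.rand(10, n_features)      # per-user weighted sum of rated movies' features

# fit() consumes (user_features, movie_features, rating) triples
train = [(users[u], movies[m], float(np.random.randint(1, 6)))
         for u in range(10) for m in range(5)]

method = MovieFeaturesDeepLearningMethod()
method.fit(train, epochs=1, batch_size=8)                    # also writes saved_models/model.h5
top_k = method.get_recommendations(users[0], movies, k=10)   # internal movie indices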

notebooks/.ipynb_checkpoints/movie_lens_data_mining-checkpoint.ipynb → utils/notebooks/.ipynb_checkpoints/movie_lens_data_mining-checkpoint.ipynb

notebooks/movie_lens_data_mining.ipynb → utils/notebooks/movie_lens_data_mining.ipynb

BIN
collaborative_filtering/saved_models/model.h5

+ 2
- 0
collaborative_filtering/scripts/movie_lens_collaborative_filtering.py

@@ -10,6 +10,7 @@ from collaborative_filtering.neural_collaborative_filtering import NeuralCollabo
 from tqdm import tqdm
 from sklearn.preprocessing import MinMaxScaler
 
+
 def get_movie_id_to_feature_mapping(movies_metadata_df):
     mapping = {}
     for i, row in movies_metadata_df.iterrows():
@@ -107,5 +108,6 @@ def count_mse(method, test_data):
     data = scaler.transform(np.array([y_true, y_pred]).transpose())
     print(mean_squared_error(y_true=data[:, 0], y_pred=data[:, 1]))
 
+
 if __name__ == '__main__':
     main()

+ 1
- 3
content_based_recomendation/scripts/movie_lens_content_based_recomendation.py

@@ -1,8 +1,6 @@
 from settings import PATH_TO_DATA
-from content_based_recomendation.scripts.movie_lens_features_extractor import FeaturesExtractor
-from content_based_recomendation.keywords_based_cbr import KeywordsBasedCbr
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from content_based_recomendation.weigted_rating_cbr import WeightedRatingCbr
-import numpy as np
 import pandas as pd
 import os
 

+ 0
- 1
content_based_recomendation/weigted_rating_cbr.py

@@ -1,5 +1,4 @@
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from interfaces import RecommendationMethod
 from content_based_recomendation.scripts.index_mapping import IndexMapping

+ 102
- 0
deep_learning/movie_features_deep_learning_method.py

@@ -0,0 +1,102 @@
+from interfaces import RecommendationMethod
+import tensorflow as tf
+import numpy as np
+from tqdm import tqdm
+from settings import PATH_TO_PROJECT
+import os
+
+
+class MovieFeaturesDeepLearningMethod(RecommendationMethod):
+    def __init__(self):
+        self.model = None
+        self.user_ratings = None
+        self.train_loss = tf.keras.metrics.Mean(name='train_loss')
+        self.test_loss = tf.keras.metrics.Mean(name='test_loss')
+
+    def fit(self, train_user_items_ratings, test_user_items_ratings=None, batch_size=20, epochs=20):
+        self.model = self._build_model(train_user_items_ratings[0][0].shape[0])
+        eval_test = test_user_items_ratings is not None
+
+        train_ds = self._generate_dataset(train_user_items_ratings, batch_size)
+
+        if eval_test:
+            test_ds = self._generate_dataset(test_user_items_ratings, batch_size)
+
+        loss_object = tf.keras.losses.MeanSquaredError()
+        optimizer = tf.keras.optimizers.Adam()
+
+        for e in range(epochs):
+            for train_data in tqdm(train_ds, total=len(train_user_items_ratings) // batch_size):
+                users, items, ratings = train_data
+                self.train_step(optimizer, loss_object, users, items, ratings)
+
+            if eval_test:
+                for test_data in tqdm(test_ds, total=len(test_user_items_ratings) // batch_size):
+                    users, items, ratings = test_data
+                    self.test_step(loss_object, users, items, ratings)
+
+                template = 'Epoch {}, Loss: {}, Test Loss: {}'
+                print(template.format(e + 1,
+                                      self.train_loss.result().numpy(),
+                                      self.test_loss.result().numpy()))
+            else:
+                template = 'Epoch {}, Loss: {}'
+                print(template.format(e + 1,
+                                      self.train_loss.result().numpy()))
+
+        self.model.save(os.path.join(PATH_TO_PROJECT, 'deep_learning', 'saved_models', 'model.h5'))
+
+    def load_model(self, filepath, input_size):
+        self.model = self._build_model(input_size=input_size)
+        self.model.load_weights(filepath=filepath)
+
+    def get_recommendations(self, user, movies, k=10):
+        user_input = np.repeat(np.expand_dims(user, axis=0), movies.shape[0], axis=0)
+        movies_input = movies
+
+        recommendations = self.model.predict([user_input, movies_input]).squeeze()
+        recommendations_idx = np.argsort(recommendations)[::-1]
+        return recommendations_idx[:k]
+
+    def _generate_dataset(self, data, batch_size):
+        users_ids = np.array([r[0] for r in data])
+        items_ids = np.array([r[1] for r in data])
+        ratings_ids = np.array([r[2] for r in data])
+        return tf.data.Dataset.from_tensor_slices((users_ids, items_ids, ratings_ids)).shuffle(10000, seed=56).batch(
+            batch_size)
+
+    @tf.function
+    def test_step(self, loss_object, user_input, item_input, labels):
+        predictions = self.model([user_input, item_input])
+        t_loss = loss_object(labels, predictions)
+
+        self.test_loss(t_loss)
+
+    @tf.function
+    def train_step(self, optimizer, loss_object, user_input, item_input, labels):
+        with tf.GradientTape() as tape:
+            predictions = self.model([user_input, item_input], training=True)
+            loss = loss_object(labels, predictions)
+
+        gradients = tape.gradient(loss, self.model.trainable_variables)
+        optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
+
+        self.train_loss(loss)
+
+    def _build_model(self, input_size):
+        user_inputs = tf.keras.Input(shape=(input_size,))
+        item_inputs = tf.keras.Input(shape=(input_size,))
+
+        user_hidden = tf.keras.layers.Dense(512, activation=tf.nn.relu)(user_inputs)
+
+        item_hidden = tf.keras.layers.Dense(512, activation=tf.nn.relu)(item_inputs)
+
+        concatenated = tf.keras.layers.Concatenate(axis=1)([item_hidden, user_hidden])
+
+        flattened = tf.keras.layers.Flatten()(concatenated)
+        dense_1 = tf.keras.layers.Dense(64, activation=tf.nn.relu, trainable=True)(flattened)
+        dense_2 = tf.keras.layers.Dense(32, activation=tf.nn.relu, trainable=True)(dense_1)
+        dense_3 = tf.keras.layers.Dense(1, activation=tf.nn.relu, trainable=True)(dense_2)
+
+        model = tf.keras.models.Model([item_inputs, user_inputs], dense_3)
+        return model

+ 109
- 0
deep_learning/scripts/run_deep_learning_method.py

@@ -0,0 +1,109 @@
+import pandas as pd
+import numpy as np
+import os
+
+from settings import PATH_TO_DATA, PATH_TO_PROJECT
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from utils.id_indexer import Indexer
+from utils.evaluation.test_train_split import user_leave_on_out
+
+from deep_learning.movie_features_deep_learning_method import MovieFeaturesDeepLearningMethod
+
+user_column = 'userId'
+item_column = 'movieId'
+rating_column = 'rating'
+
+
+def get_movie_id_to_feature_mapping(movies_metadata_df):
+    mapping = {}
+    for i, row in movies_metadata_df.iterrows():
+        features = {
+            "title": row["title"],
+            "id": row["id"],
+        }
+
+        mapping[int(row['id'])] = features
+
+    return mapping
+
+
+def get_weighted_movies_user_features(user_ratings_df, indexer, movies_features):
+    user_features = []
+
+    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
+        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
+        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
+        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
+        user_rated_movies_features = movies_features[user_rated_movies_id, :]
+        user_movies_features = np.sum(np.multiply(user_ratings, user_rated_movies_features), axis=0)
+        user_features.append(user_movies_features)
+
+    return np.array(user_features)
+
+
+def map_df_to_model_train_input(data_df, movies_features, user_features, indexer):
+    data = data_df[[user_column, item_column, rating_column]].values
+    return [(user_features[indexer.get_user_internal_id(r[0])],
+             movies_features[indexer.get_movie_internal_id(r[1])],
+             r[2]) for r in data]
+
+
+def map_df_to_test_input(data_df, movies_features, user_features, indexer):
+    data = data_df[[user_column, item_column, rating_column]].values
+    return [(indexer.get_user_internal_id(r[0]),
+             indexer.get_movie_internal_id(r[1]),
+             r[2]) for r in data]
+
+
+def main():
+    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+
+    user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+
+    # train_ratings_df, test_ratings_df = train_test_split(user_ratings_df, train_size=0.8, shuffle=True,
+    #                                                     random_state=123)
+
+    train_ratings_df, test_ratings_df = \
+        list(user_leave_on_out(user_ratings_df, timestamp_column='timestamp', make_user_folds=False))[0]
+
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    movies_data = features_extractor.run()
+
+    cv = CountVectorizer(min_df=10)
+    movies_features = cv.fit_transform(movies_data['keywords']).toarray().astype(float)
+    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+    user_features = get_weighted_movies_user_features(train_ratings_df, indexer, movies_features)
+
+    train_data = map_df_to_model_train_input(train_ratings_df, movies_features, user_features, indexer)
+    test_data = map_df_to_test_input(test_ratings_df, movies_features, user_features, indexer)
+
+    method = MovieFeaturesDeepLearningMethod()
+    # method.fit(train_data, test_data, epochs=20)
+    method.load_model(filepath=os.path.join(PATH_TO_PROJECT, "deep_learning", "saved_models", "model.h5"),
+                      input_size=movies_features.shape[1])
+
+    for user, movie, rating in test_data[:6]:
+        recommendations = method.get_recommendations(user_features[user, :], movies_features, k=10)
+
+        user_rated_movies = user_ratings_df[user_ratings_df[user_column] == indexer.get_user_id(user)] \
+            .sort_values(rating_column, ascending=False)[[item_column]]\
+            .values.squeeze()
+
+        recommended_movies = [indexer.get_movie_id(movie_internal_id) for movie_internal_id in recommendations]
+
+        print("Rated movies: ")
+        for movie_id in user_rated_movies:
+            print(movie_id_features_dict[movie_id])
+
+        print("Recommended movies: ")
+        for movie_id in recommended_movies:
+            print(movie_id_features_dict[movie_id])
+
+
+if __name__ == '__main__':
+    main()

+ 0
- 53
evaluation/experiments.py

@@ -1,53 +0,0 @@
-from evaluation.hit_rate import HitRate
-from content_based_recomendation.weigted_rating_cbr import WeightedRatingCbr
-from hybrid.average_hybrid_filtering import AverageHybridFiltering
-from hybrid.predicate_hybrid_filtering import PredicateHybridFiltering
-from settings import PATH_TO_DATA
-from content_based_recomendation.scripts.movie_lens_features_extractor import FeaturesExtractor
-from collaborative_filtering.memory_based_collaborative_filtering import MemoryBasedCollaborativeFiltering
-from collaborative_filtering.svd_collaborative_filtering import SVDCollaborativeFiltering
-import pandas as pd
-import os
-
-
-def main():
-    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
-    user_column = 'userId'
-    item_column = 'movieId'
-
-    ratings = pd.read_csv(os.path.join(dataset_path, 'ratings_small_clean.csv'))
-    ratings_per_user = ratings.groupby('userId').sum()
-
-    features_extractor = FeaturesExtractor(dataset_path)
-    data = features_extractor.run()
-
-    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
-
-    user_ids = ratings[user_column].unique()
-    movie_ids = ratings[item_column].unique()
-
-    hit_rate_ns = [30]
-
-    mem_factory = lambda: MemoryBasedCollaborativeFiltering(ratings[user_column].unique(),
-                                                 ratings[item_column].unique())
-    wcr_factory = lambda: WeightedRatingCbr(data['combined'], movie_mapping)
-    predicate = lambda userId, itemId: 0 if userId <= 1 else 1
-
-    methods = [
-        mem_factory(),
-        SVDCollaborativeFiltering(ratings[user_column].unique(), ratings[item_column].unique()),
-        wcr_factory(),
-        AverageHybridFiltering([mem_factory(), wcr_factory()], 50),  # or len(ratings_per_user) instead of 50
-        PredicateHybridFiltering([mem_factory(), wcr_factory()], predicate, len(ratings_per_user)),
-    ]
-
-    for n in hit_rate_ns:
-        hit_rate = HitRate(n)
-        for method in methods:
-            print(type(method))
-            result = hit_rate.evaluate(method, ratings.values[:, :3])
-            print(f'Final result: {result}')
-
-
-if __name__ == '__main__':
-    main()

+ 0
- 28
evaluation/hit_rate.py

@@ -1,28 +0,0 @@
-from sklearn.model_selection import LeaveOneOut, StratifiedKFold
-import numpy as np
-
-
-class HitRate:
-    def __init__(self, n):
-        self.n = n
-        self.user_col_id = 0
-        self.movie_col_id = 1
-
-    def evaluate(self, recommendation_method, ratings):
-        hits = 0
-        cross_validation = LeaveOneOut()
-
-        for counter, (train, test) in enumerate(cross_validation.split(ratings)):
-            test_user = int(ratings[test[0], self.user_col_id])
-            recommendation_method.fit(np.delete(ratings, test, axis=0))
-            selected_items = recommendation_method.get_recommendations(test_user,
-                                                                       self.n)
-
-            test_movie = int(ratings[test[0], self.movie_col_id])
-
-            hit = 1 if test_movie in selected_items else 0
-            hits += hit
-            if hit > 0:
-                print(f'{counter} hits: {hits}')
-        return hits
-

+ 76
- 0
evaluation/scripts/run_deep_learning_experiments.py

@@ -0,0 +1,76 @@
+import pandas as pd
+import numpy as np
+
+from sklearn.feature_extraction.text import CountVectorizer
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from utils.id_indexer import Indexer
+from utils.evaluation.test_train_split import user_leave_on_out
+from settings import PATH_TO_DATA
+
+user_column = 'userId'
+item_column = 'movieId'
+rating_column = 'rating'
+
+
+def get_movie_id_to_feature_mapping(movies_metadata_df):
+    mapping = {}
+    for i, row in movies_metadata_df.iterrows():
+        features = {
+            "title": row["title"],
+            "id": row["id"],
+        }
+
+        mapping[int(row['id'])] = features
+
+    return mapping
+
+
+def get_weighted_movies_user_features(user_ratings_df, indexer, movies_features):
+    user_features = []
+
+    for user_internal_id, user_id in indexer.internal_id_to_user_id_dict.items():
+        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id][[item_column, rating_column]].values
+        user_rated_movies_id = [indexer.get_movie_internal_id(i) for i in user_ratings[:, 0].astype(int)]
+        user_ratings = np.expand_dims(user_ratings[:, 1] / 5.0, axis=1)
+        user_rated_movies_features = movies_features[user_rated_movies_id, :]
+        user_movies_features = np.sum(np.multiply(user_ratings, user_rated_movies_features), axis=0)
+        user_features.append(user_movies_features)
+
+    return np.array(user_features)
+
+
+def map_df_to_model_input(data_df, movies_features, user_features, indexer):
+    data = data_df[[user_column, item_column, rating_column]].values
+
+    return [(user_features[indexer.get_user_internal_id(r[0])],
+             movies_features[indexer.get_movie_internal_id(r[1])],
+             r[2]) for r in data]
+
+
+def main():
+    user_ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")
+    movies_metadata = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/movies_metadata_clean.csv")
+    movie_id_features_dict = get_movie_id_to_feature_mapping(movies_metadata)
+    user_ratings_df = user_ratings_df[user_ratings_df[item_column].isin(movie_id_features_dict.keys())]
+
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    features_extractor = FeaturesExtractor(dataset_path)
+    movies_data = features_extractor.run()
+
+    cv = CountVectorizer(min_df=3)
+    movies_features = cv.fit_transform(movies_data['combined']).toarray().astype(float)
+    indexer = Indexer(user_ids=user_ratings_df[user_column].unique(), movies_ids=movies_data['id'])
+
+    for train_df, test_df in user_leave_on_out(user_ratings_df, timestamp_column="timestamp"):
+        user_features = get_weighted_movies_user_features(train_df, indexer, movies_features)
+
+        train_data = map_df_to_model_input(train_df, movies_features, user_features, indexer)
+        test_data = map_df_to_model_input(test_df, movies_features, user_features, indexer)
+
+    train_df, test_df = list(user_leave_on_out(user_ratings_df, make_user_folds=False))[0]
+
+    print(train_df)
+
+
+if __name__ == '__main__':
+    main()

+ 59
- 0
evaluation/scripts/run_experiments.py

@@ -0,0 +1,59 @@
+from utils.evaluation.metrics import hit_rate
+from content_based_recomendation.weigted_rating_cbr import WeightedRatingCbr
+from settings import PATH_TO_DATA
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from collaborative_filtering.memory_based_collaborative_filtering import MemoryBasedCollaborativeFiltering
+from collaborative_filtering.svd_collaborative_filtering import SVDCollaborativeFiltering
+from utils.evaluation.test_train_split import user_leave_on_out
+import pandas as pd
+import os
+
+
+def main():
+    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
+    user_column = 'userId'
+    item_column = 'movieId'
+
+    ratings = pd.read_csv(os.path.join(dataset_path, 'ratings_small_clean.csv'))
+    ratings_per_user = ratings.groupby('userId').sum()
+
+    features_extractor = FeaturesExtractor(dataset_path)
+    data = features_extractor.run()
+    movie_mapping = dict(zip(data['id'].tolist(), data.index.astype(int)))
+
+    user_ids = ratings[user_column].unique()
+    movie_ids = ratings[item_column].unique()
+
+    wcr_factory = lambda: WeightedRatingCbr(data['combined'], movies_mapping=movie_mapping)
+
+    methods = [
+        #MemoryBasedCollaborativeFiltering(user_ids, movie_ids),
+        SVDCollaborativeFiltering(user_ids, movie_ids),
+        wcr_factory()
+    ]
+
+    n = 30
+
+    for method in methods:
+        iterations = 0
+        all_hits = 0
+        for train_df, test_df in user_leave_on_out(ratings):
+            train_ratings = train_df.values[:, :3]
+            user_id, item_id, ratings = test_df.values[:, :3][0]
+            method.fit(train_ratings)
+            pred_ids = method.get_recommendations(user_id, n)
+            hits = hit_rate(gt_items_idx=[item_id.astype(int)], predicted_items_idx=pred_ids)
+
+            ###
+            # Placeholder: additional evaluation metrics could be computed here
+            ###
+
+            all_hits += hits
+            iterations += 1
+
+            if hits > 0:
+                print(f"{method.__class__}: {all_hits}/{iterations}")
+
+
+if __name__ == '__main__':
+    main()

+ 1
- 2
hybrid/scripts/movie_lens_hybrid_filtering.py

@@ -2,10 +2,9 @@ import os
 
 from sklearn.model_selection import train_test_split
 from collaborative_filtering.memory_based_collaborative_filtering import MemoryBasedCollaborativeFiltering
-from content_based_recomendation.scripts.movie_lens_features_extractor import FeaturesExtractor
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
 from content_based_recomendation.weigted_rating_cbr import WeightedRatingCbr
 from hybrid.average_hybrid_filtering import AverageHybridFiltering
-from hybrid.predicate_hybrid_filtering import PredicateHybridFiltering
 from settings import PATH_TO_DATA
 import pandas as pd
 

+ 6
- 0
utils/evaluation/metrics.py

@@ -0,0 +1,6 @@
+from sklearn.model_selection import LeaveOneOut, StratifiedKFold
+import numpy as np
+
+
+def hit_rate(gt_items_idx, predicted_items_idx):
+    return len(set(gt_items_idx).intersection(set(predicted_items_idx)))

+ 39
- 0
utils/evaluation/test_train_split.py

@@ -0,0 +1,39 @@
+import random
+import numpy as np
+import pandas as pd
+
+user_column = 'userId'
+item_column = 'movieId'
+rating_column = 'rating'
+
+
+def user_leave_on_out(user_ratings_df, timestamp_column=None, make_user_folds=True):
+    """
+    Make leave one out folds for each user
+    :param user_ratings_df: default user items ratings pandas dataframe
+    :param timestamp_column: timestamp column name in user_ratings_df for setting last reting as test rating
+     If None setting random user rating as test rating
+    :param make_user_folds: if True makes fold generator for each user leaving last or random rating as test rating.
+     If False returning one train dataset and one test set with oe rating for each user
+    :return:
+    """
+
+    user_ids = user_ratings_df[user_column].unique()
+    test_indices = []
+    for user_id in user_ids:
+        user_ratings = user_ratings_df[user_ratings_df[user_column] == user_id]
+
+        if timestamp_column is not None:
+            user_ratings = user_ratings.sort_values(by=timestamp_column)
+            test_index = len(user_ratings) - 1
+        else:
+            test_index = random.randint(0, len(user_ratings) - 1)
+
+        indices = user_ratings.index.values
+        test_indices.append(indices[test_index])
+
+    if make_user_folds:
+        for i in test_indices:
+            yield user_ratings_df.loc[~user_ratings_df.index.isin([i])], user_ratings_df.loc[[i]]
+    else:
+        yield user_ratings_df.loc[~user_ratings_df.index.isin(test_indices)], user_ratings_df.loc[test_indices]
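
A hedged sketch of how the new split helper is meant to be consumed (the dataset path is taken from the other scripts in this commit):

import pandas as pd
from settings import PATH_TO_DATA
from utils.evaluation.test_train_split import user_leave_on_out

ratings_df = pd.read_csv(f"{PATH_TO_DATA}/raw/the-movies-dataset/ratings_small.csv")

# one fold per user: the full dataframe minus that user's held-out (latest) rating
for train_df, test_df in user_leave_on_out(ratings_df, timestamp_column="timestamp"):
    assert len(test_df) == 1        # exactly one test rating per fold

# single split: one held-out rating for every user at once (chosen at random when no timestamp column is given)
train_df, test_df = next(user_leave_on_out(ratings_df, make_user_folds=False))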

content_based_recomendation/scripts/movie_lens_features_extractor.py → utils/features_extraction/movie_lens_features_extractor.py

+ 45
- 0
utils/id_indexer.py

@@ -0,0 +1,45 @@
+def _generate_mapping(ids):
+    id_to_data_id_vocab = {}
+
+    id = 0
+    for i in ids:
+        if i not in id_to_data_id_vocab.values():
+            id_to_data_id_vocab[id] = i
+            id += 1
+
+    return id_to_data_id_vocab, {v: k for k, v in id_to_data_id_vocab.items()}
+
+
+class Indexer:
+    def __init__(self, user_ids=None, movies_ids=None):
+        if user_ids is not None:
+            self.internal_id_to_user_id_dict, self.user_id_to_internal_id_dict = _generate_mapping(ids=user_ids)
+        else:
+            self.internal_id_to_user_id_dict = None
+            self.user_id_to_internal_id_dict = None
+
+        if movies_ids is not None:
+            self.internal_id_to_movie_id_dict, self.movie_id_to_internal_id_dict = _generate_mapping(
+                ids=movies_ids)
+        else:
+            self.internal_id_to_movie_id_dict = None
+            self.movie_id_to_internal_id_dict = None
+
+    def set_user_ids(self, user_ids):
+        self.internal_id_to_user_id_dict, self.user_id_to_internal_id_dict = _generate_mapping(ids=user_ids)
+
+    def set_movies_id(self, movies_ids):
+        self.internal_id_to_movie_id_dict, self.movie_id_to_internal_id_dict = _generate_mapping(
+            ids=movies_ids)
+
+    def get_user_id(self, internal_id):
+        return self.internal_id_to_user_id_dict[internal_id]
+
+    def get_user_internal_id(self, user_id):
+        return self.user_id_to_internal_id_dict[user_id]
+
+    def get_movie_id(self, internal_id):
+        return self.internal_id_to_movie_id_dict[internal_id]
+
+    def get_movie_internal_id(self, movie_id):
+        return self.movie_id_to_internal_id_dict[movie_id]