PiotrSajewiczP52 8 months ago
parent
commit
db2da7503d

+ 9
- 0
.dvc/.gitignore

@@ -0,0 +1,9 @@
+/config.local
+/updater
+/lock
+/updater.lock
+/tmp
+/state-journal
+/state-wal
+/state
+/cache

+ 0
- 0
.dvc/config

+ 1
- 0
.gitignore

@@ -12,3 +12,4 @@ user_settings.py
 *.h5
 
 */.ipynb_checkpoints
+/dataset

+ 27
- 0
create_dvc.bat

@@ -0,0 +1,27 @@
rem Register the DVC pipeline: one stage to build the dataset, then three
rem experiment stages that each depend on the dataset and the project packages.
setlocal
rem Make project-root imports (settings, utils, ...) resolvable for the scripts.
set PYTHONPATH=%CD%
rem Stage 1: download/unpack/filter the raw dataset into ./dataset.
dvc run^
 -d prepare_dataset.py^
 -o dataset^
 -f prepare_dataset.dvc^
 python prepare_dataset.py
rem Stage 2: classic recommender experiments -> results/run_experiments.csv.
dvc run^
 -d evaluation/scripts/run_experiments.py^
 -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
 -o results/run_experiments.csv^
 -f run_experiments.dvc^
 python evaluation/scripts/run_experiments.py
rem Stage 3: deep-learning experiments -> results/run_deep_learning_experiments.csv.
dvc run^
 -d evaluation/scripts/run_deep_learning_experiments.py^
 -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
 -o results/run_deep_learning_experiments.csv^
 -f run_deep_learning_experiments.dvc^
 python evaluation/scripts/run_deep_learning_experiments.py
rem Stage 4: clustering experiments -> results/run_clustering_experiments.csv.
dvc run^
 -d evaluation/scripts/run_clustering_experiments.py^
 -d dataset -d clustering -d collaborative_filtering -d content_based_recomendation -d hybrid -d deep_learning -d data -d utils -d sequence^
 -o results/run_clustering_experiments.csv^
 -f run_clustering_experiments.dvc^
 python evaluation/scripts/run_clustering_experiments.py
endlocal
pause

+ 11
- 0
prepare_dataset.dvc

@@ -0,0 +1,11 @@
+md5: c122007d14c11c687c0143b0004f77f1
+cmd: python prepare_dataset.py
+deps:
+- md5: 86a6eb0445e40183badfe3fa3171e4f4
+  path: prepare_dataset.py
+outs:
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+  cache: true
+  metric: false
+  persist: false

+ 78
- 0
prepare_dataset.py

@@ -0,0 +1,78 @@
+import os
+import io
+import requests
+import zipfile
+import shutil
+from content_based_recomendation.scripts.movie_lens_content_based_recomendation import filter_ratings
+from utils.features_extraction.movie_lens_features_extractor import FeaturesExtractor
+from settings import PATH_TO_DATA
+
+
def download_file_from_google_drive(id, destination):
    """Fetch a Google Drive file by its share id and return the HTTP response.

    Performs the two-step Drive download: an initial GET, then — if Drive
    answered with a virus-scan confirmation cookie — a second GET carrying
    the confirmation token.

    :param id: Google Drive file id (the long token from the share URL).
    :param destination: unused; kept for backward compatibility.  The caller
        unzips the in-memory response instead of saving it to disk —
        pass ``None``.
    :return: the (streamed) ``requests.Response`` for the actual file.
    """
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)

    if token:
        # Drive wants an explicit confirmation for files it cannot virus-scan;
        # repeat the request with the token it handed back in a cookie.
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)

    return response
+
+
def get_confirm_token(response):
    """Return Google Drive's download-warning cookie value, or ``None``.

    Drive signals "file too large to virus-scan, please confirm" by setting
    a cookie whose name starts with ``download_warning``; its value is the
    confirmation token needed for the follow-up request.
    """
    return next(
        (value
         for key, value in response.cookies.items()
         if key.startswith('download_warning')),
        None,
    )
+
+
def save_response_content(response, destination):
    """Stream the body of *response* to the file at *destination*.

    Writes in 32 KiB chunks so arbitrarily large downloads never have to
    fit in memory.
    """
    chunk_size = 32768

    with open(destination, "wb") as out:
        for block in response.iter_content(chunk_size):
            # Keep-alive chunks arrive as empty bytes — skip them.
            if block:
                out.write(block)
+
+
def unpack_starts_with(zip_file, zip_skip, save_path):
    """Extract the members of *zip_file* under the *zip_skip* prefix.

    Each matching member is written to ``save_path + member_name`` with the
    *zip_skip* prefix stripped; directory entries (names ending in ``/``)
    are created as directories.

    :param zip_file: an open ``zipfile.ZipFile``.
    :param zip_skip: archive-path prefix to strip (may be ``''``).
    :param save_path: destination prefix; expected to end with a separator.
    """
    # Use the public namelist() API rather than the private NameToInfo dict
    # (same names, same order); skip the bare prefix entry itself.
    members = [
        name for name in zip_file.namelist()
        if name.startswith(zip_skip) and len(name) > len(zip_skip)
    ]
    for mem in members:
        path = save_path + mem[len(zip_skip):]
        if path.endswith('/'):
            os.makedirs(path, exist_ok=True)
        else:
            # Some archives omit directory entries — make sure the parent
            # directory exists before writing the file.
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            # Context managers close both handles (the member handle was
            # previously leaked); copyfileobj streams without loading the
            # whole member into memory.
            with zip_file.open(mem) as read_file, open(path, 'wb') as write_file:
                shutil.copyfileobj(read_file, write_file)
+
+
def main():
    """Prepare the movies dataset: download, unzip, extract features, filter.

    Downloads the archived "the-movies-dataset" from Google Drive into
    memory, unpacks it under ``./dataset/raw/the-movies-dataset/``, then
    runs feature extraction and rating filtering over the unpacked data.
    """
    raw_dir = './dataset/raw/the-movies-dataset/'
    zip_prefix = ''
    gdrive_file_id = '1Qx9FAqaIG9PbMRJ6coT_NNA9Bck3-jSZ'

    os.makedirs(raw_dir, exist_ok=True)

    print('Downloading...')
    # destination is None: the archive is held in memory, never saved as-is.
    response = download_file_from_google_drive(gdrive_file_id, None)

    print('Unzip...')
    archive = zipfile.ZipFile(io.BytesIO(response.content))
    unpack_starts_with(archive, zip_prefix, raw_dir)

    print('Filtering')
    dataset_path = f'{PATH_TO_DATA}/raw/the-movies-dataset'
    extractor = FeaturesExtractor(dataset_path)
    features = extractor.run()
    filter_ratings(dataset_path, features)

    print('Done')
+
+
# Script entry point: run the full dataset-preparation pipeline.
if __name__ == '__main__':
    main()
+
+

+ 3
- 0
results/.gitignore

@@ -0,0 +1,3 @@
+/run_clustering_experiments.csv
+/run_experiments.csv
+/run_deep_learning_experiments.csv

+ 29
- 0
run_clustering_experiments.dvc

@@ -0,0 +1,29 @@
+md5: d09678ee619ad958b731451f67292995
+cmd: python evaluation/scripts/run_clustering_experiments.py
+deps:
+- md5: edb4d8173181f21d5f6573742ce1de1c
+  path: evaluation/scripts/run_clustering_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_clustering_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 29
- 0
run_deep_learning_experiments.dvc

@@ -0,0 +1,29 @@
+md5: 700fd4a92b4e485f4b0f646e0c172791
+cmd: python evaluation/scripts/run_deep_learning_experiments.py
+deps:
+- md5: 9c238a242ae70f5a8633388f1120f99f
+  path: evaluation/scripts/run_deep_learning_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_deep_learning_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 29
- 0
run_experiments.dvc

@@ -0,0 +1,29 @@
+md5: cb943f590084079344ec245edcf7f3df
+cmd: python evaluation/scripts/run_experiments.py
+deps:
+- md5: 497dc67168efab85cef1aef71f8e53ae
+  path: evaluation/scripts/run_experiments.py
+- md5: 355acab208f5585fd279c9147f8f0350.dir
+  path: dataset
+- md5: eea90dffb0b63a8a5cabdff2cb84fe47.dir
+  path: clustering
+- md5: 0d922879691335de5079cecf52a52df8.dir
+  path: collaborative_filtering
+- md5: ef78c70597b682960c666a7f9734232b.dir
+  path: content_based_recomendation
+- md5: ed456204f499ee14691dc86baa154f07.dir
+  path: hybrid
+- md5: 3f0b3450bd454a4b2f28b99b41fcb8cb.dir
+  path: deep_learning
+- md5: 979a2ee82d026442f432f4254a27b052.dir
+  path: data
+- md5: 4cfc2400550789f59eccee40c53a78c0.dir
+  path: utils
+- md5: 7156a3423242d73daad6a9c637a5bd20.dir
+  path: sequence
+outs:
+- md5: 8d777f385d3dfec8815d20f7496026dc
+  path: results/run_experiments.csv
+  cache: true
+  metric: false
+  persist: false

+ 0
- 0
sequence/scripts/run.py