Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

make_dataset.py 2.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2021. Jeffrey J. Nirschl. All rights reserved.
  3. #
  4. # Licensed under the MIT license. See the LICENSE.md file in the project
  5. # root directory for full license information.
  6. #
  7. # Time-stamp: <>
  8. # ======================================================================
  9. import argparse
  10. import os
  11. from pathlib import Path
  12. from kaggle.api.kaggle_api_extended import KaggleApi
  13. from src.data import data_dictionary
  def download_data(competition, train_data, test_data,
                    output_dir="./data/raw",
                    credentials=".kaggle/kaggle.json"):
      """Download the train and test files of a Kaggle competition.

      Args:
          competition: Kaggle competition identifier passed to the API.
          train_data: Name of the training file to download.
          test_data: Name of the test file to download.
          output_dir: Existing directory that receives the downloaded files.
          credentials: Location of the Kaggle API token, joined onto the
              user's home directory (an absolute path is used as-is). Only
              its existence is checked here; ``KaggleApi.authenticate()`` is
              called without arguments.

      Raises:
          FileNotFoundError: If the credentials file does not exist.
          NotADirectoryError: If ``output_dir`` is not an existing directory.
      """
      credentials = Path.home().joinpath(credentials)
      output_dir = Path(output_dir).resolve()

      # Raise explicitly instead of using ``assert``: assertions are removed
      # under ``python -O`` and would surface as AssertionError rather than
      # the exception types named here.
      if not credentials.is_file():
          raise FileNotFoundError(credentials)
      if not output_dir.is_dir():
          raise NotADirectoryError(output_dir)

      api = KaggleApi()
      api.authenticate()

      # Fetch both requested files from the competition into output_dir.
      for file_name in (train_data, test_data):
          api.competition_download_file(competition, file_name,
                                        path=output_dir)
  if __name__ == '__main__':
      # Command-line entry point: parse the options, download the two
      # competition files, then build the data dictionary from the train file.
      cli = argparse.ArgumentParser()

      # The three mandatory options share the same shape, so declare them
      # from a table: (short flag, long flag, destination, help text).
      for short_flag, long_flag, target, description in (
          ("-c", "--competition", "competition",
           "Kaggle competition to download"),
          ("-tr", "--train_data", "train_data", "Train CSV file"),
          ("-te", "--test_data", "test_data", "Test CSV file"),
      ):
          cli.add_argument(short_flag, long_flag, dest=target,
                           required=True, help=description)

      # The output directory defaults to the directory holding this script.
      cli.add_argument("-o", "--out-dir", dest="output_dir",
                       default=str(Path(__file__).resolve().parent),
                       required=False, help="output directory")

      args = cli.parse_args()

      # Normalise the output directory and derive the expected file paths.
      args.output_dir = Path(args.output_dir).resolve()
      train_path = args.output_dir / args.train_data
      test_path = args.output_dir / args.test_data  # not used further below

      # Download the dataset from Kaggle.
      download_data(args.competition, args.train_data, args.test_data,
                    output_dir=args.output_dir)

      data_dictionary.create(train_path)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...