Initial commit

kuparez 4 months ago
commit b6087043cf

+ 8 - 0
.dvc/.gitignore

@@ -0,0 +1,8 @@
+/state
+/lock
+/config.local
+/updater
+/updater.lock
+/state-journal
+/state-wal
+/cache

+ 0 - 0
.dvc/config
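
The committed `.dvc/config` is empty, so no default remote is configured yet and `make push` / `make pull` have nowhere to send or fetch the cache. A minimal sketch of adding one, assuming an S3 bucket (the bucket name is hypothetical; any DVC-supported remote works):

    dvc remote add -d storage s3://my-bucket/dvc-storage   # writes the default remote into .dvc/config
    git add .dvc/config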


+ 1 - 0
.gitattributes

@@ -0,0 +1 @@
+* text=auto

+ 90 - 0
.gitignore

@@ -0,0 +1,90 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+/env/
+/build/
+/develop-eggs/
+/dist/
+/downloads/
+/eggs/
+/.eggs/
+/lib/
+/lib64/
+/parts/
+/sdist/
+/var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+/docs/_build/
+
+# PyBuilder
+/target/
+
+# DotEnv configuration
+.env
+
+# Database
+*.db
+*.rdb
+
+# Pycharm
+.idea
+
+# VS Code
+.vscode/
+
+# Spyder
+.spyproject/
+
+# Jupyter NB Checkpoints
+.ipynb_checkpoints/
+
+# exclude data and trained models from source control by default
+/data/
+/models/
+
+# Mac OS-specific storage files
+.DS_Store
+
+# vim
+*.swp
+*.swo
+
+# Mypy cache
+.mypy_cache/

+ 94 - 0
Makefile

@@ -0,0 +1,94 @@
+.PHONY: clean dirs virtualenv lint requirements push pull reproduce
+
+#################################################################################
+# GLOBALS                                                                       #
+#################################################################################
+
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+PYTHON_INTERPRETER = python
+
+#################################################################################
+# COMMANDS                                                                      #
+#################################################################################
+
+## Create virtualenv.
+## Activate with the command:
+## source env/bin/activate
+virtualenv:
+	virtualenv -p $(PYTHON_INTERPRETER) env
+
+## Install Python Dependencies.
+## Make sure you activate the virtualenv first!
+requirements: 
+	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
+	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
+
+## Create directories that are ignored by git but required for the project
+dirs:
+	mkdir -p data/raw data/processed models
+
+## Delete all compiled Python files
+clean:
+	find . -type f -name "*.py[co]" -delete
+	find . -type d -name "__pycache__" -delete
+
+## Lint using flake8
+lint:
+	flake8 src
+
+## Upload Data to default DVC remote
+push:
+	dvc push
+
+## Download Data from default DVC remote
+pull:
+	dvc pull
+
+## Reproduce the DVC pipeline - recompute any modified outputs such as processed data or trained models
+reproduce:
+	dvc repro eval.dvc
+
+#################################################################################
+# PROJECT RULES                                                                 #
+#################################################################################
+
+
+
+#################################################################################
+# Self Documenting Commands                                                     #
+#################################################################################
+
+.DEFAULT_GOAL := help
+
+# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+# sed script explained:
+# /^##/:
+# 	* save line in hold space
+# 	* purge line
+# 	* Loop:
+# 		* append newline + line to hold space
+# 		* go to next line
+# 		* if line starts with doc comment, strip comment character off and loop
+# 	* remove target prerequisites
+# 	* append hold space (+ newline) to line
+# 	* replace newline plus comments by `---`
+# 	* print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see <http://stackoverflow.com/a/11799865/1968>
+.PHONY: help
+help:
+	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
+	@echo
+	@sed -n -e "/^## /{h;s/.*//;:doc" \
+		-e "H;n;s/^## //;t doc" \
+		-e "s/:.*//;G;s/\\n## /---/;s/\\n/ /g;p;}" ${MAKEFILE_LIST} \
+	| LC_ALL='C' sort --ignore-case \
+	| awk -F '---' \
+		-v ncol=$$(tput cols) \
+		-v indent=19 \
+		-v col_on="$$(tput setaf 6)" \
+		-v col_off="$$(tput sgr0)" \
+	'{ \
+		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+		n = split($$2, words, " "); \
+		line_length = ncol - indent; \
+		for (i = 1; i <= n; i++) { \
+			line_length -= length(words[i]) + 1; \
+			if (line_length <= 0) { \
+				line_length = ncol - indent - length(words[i]) - 1; \
+				printf "\n%*s ", -indent, " "; \
+			} \
+			printf "%s ", words[i]; \
+		} \
+		printf "\n"; \
+	}' \
+	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
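
On a fresh clone the setup targets above are typically chained before any DVC commands are run; a short sketch assuming the plain-virtualenv flow (conda or other environment managers would skip the first two lines):

    make virtualenv
    source env/bin/activate    # activate the environment the Makefile just created
    make requirements          # pip install -U pip setuptools wheel && pip install -r requirements.txt
    make dirs                  # create the git-ignored data/raw, data/processed and models directories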

+ 71 - 0
README.md

@@ -0,0 +1,71 @@
+asdasd
+==============================
+
+
+
+Instructions
+------------
+1. Clone the repo.
+1. Run `make dirs` to create the missing parts of the directory structure described below. 
+1. *Optional:* Run `make virtualenv` to create a python virtual environment. Skip if using conda or some other env manager.
+    1. Run `source env/bin/activate` to activate the virtualenv. 
+1. Run `make requirements` to install required python packages.
+1. Put the raw data in `data/raw`.
+1. To save the raw data to the DVC cache, run `dvc commit raw_data.dvc`.
+1. Edit the code files to your heart's desire.
+1. Process your data, train and evaluate your model using `dvc repro eval.dvc` or `make reproduce`.
+1. When you're happy with the result, commit files (including .dvc files) to git.
+ 
+Project Organization
+------------
+
+    ├── LICENSE
+    ├── Makefile           <- Makefile with commands like `make dirs` or `make clean`
+    ├── README.md          <- The top-level README for developers using this project.
+    ├── data
+    │   ├── processed      <- The final, canonical data sets for modeling.
+    │   └── raw            <- The original, immutable data dump.
+    │
+    ├── eval.dvc           <- The end of the data pipeline - evaluates the trained model on the test dataset.
+    │
+    ├── models             <- Trained and serialized models, model predictions, or model summaries
+    │
+    ├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
+    │                         the creator's initials, and a short `-` delimited description, e.g.
+    │                         `1.0-jqp-initial-data-exploration`.
+    │
+    ├── process_data.dvc   <- Process the raw data and prepare it for training.
+    ├── raw_data.dvc       <- Keeps the raw data versioned.
+    │
+    ├── references         <- Data dictionaries, manuals, and all other explanatory materials.
+    │
+    ├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
+    │   ├── figures        <- Generated graphics and figures to be used in reporting
+    │   ├── metrics.txt    <- Relevant metrics after evaluating the model.
+    │   └── training_metrics.txt    <- Relevant metrics from training the model.
+    │
+    ├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
+    │                         generated with `pip freeze > requirements.txt`
+    │
+    ├── setup.py           <- makes project pip installable (pip install -e .) so src can be imported
+    ├── src                <- Source code for use in this project.
+    │   ├── __init__.py    <- Makes src a Python module
+    │   │
+    │   ├── data           <- Scripts to download or generate data
+    │   │   └── make_dataset.py
+    │   │
+    │   ├── models         <- Scripts to train models and then use trained models to make
+    │   │   │                 predictions
+    │   │   ├── predict_model.py
+    │   │   └── train_model.py
+    │   │
+    │   └── visualization  <- Scripts to create exploratory and results oriented visualizations
+    │       └── visualize.py
+    │
+    ├── tox.ini            <- tox file with settings for running tox; see tox.testrun.org
+    └── train.dvc          <- Trains a model on the processed data.
+
+
+--------
+
+<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
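
Taken together, the instructions above reduce to a short loop once the environment is set up; a sketch of one iteration (the source path for the raw data is hypothetical):

    cp -r /path/to/original/data/* data/raw/   # drop the immutable raw data in place
    dvc commit raw_data.dvc                    # snapshot data/raw into the DVC cache
    dvc repro eval.dvc                         # or `make reproduce`: process, train, evaluate
    git add *.dvc reports/metrics.txt reports/training_metrics.txt
    git commit -m "Reproduce pipeline"
    dvc push                                   # or `make push`, once a DVC remote is configured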

+ 16 - 0
eval.dvc

@@ -0,0 +1,16 @@
+cmd: python src/models/predict_model.py
+deps:
+- md5: d751713988987e9331980363e24189ce.dir
+  path: models
+- md5: d41d8cd98f00b204e9800998ecf8427e
+  path: src/models/predict_model.py
+- md5: d751713988987e9331980363e24189ce.dir
+  path: data/processed
+md5: df501fa51f8f3bffdefc852440b2c4bf
+outs:
+- cache: false
+  md5: d41d8cd98f00b204e9800998ecf8427e
+  metric: true
+  path: reports/metrics.txt
+  persist: false
+wdir: .
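
Stage files in this single-file format (pre-1.0 DVC) are normally generated rather than written by hand; a hedged sketch of the `dvc run` call that would produce roughly the fields above, where `-M` marks reports/metrics.txt as an uncached metric (matching `cache: false`, `metric: true`):

    dvc run -f eval.dvc \
        -d models -d src/models/predict_model.py -d data/processed \
        -M reports/metrics.txt \
        python src/models/predict_model.py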

+ 0 - 0
notebooks/.gitkeep


+ 14 - 0
process_data.dvc

@@ -0,0 +1,14 @@
+cmd: python src/data/make_dataset.py
+deps:
+- md5: d751713988987e9331980363e24189ce.dir
+  path: data/raw
+- md5: d41d8cd98f00b204e9800998ecf8427e
+  path: src/data/make_dataset.py
+md5: b6846fadaff7e5481be03e7d1e38a6bd
+outs:
+- cache: true
+  md5: d751713988987e9331980363e24189ce.dir
+  metric: false
+  path: data/processed
+  persist: true
+wdir: .
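
process_data.dvc differs from eval.dvc in that its output is cached and persisted (`cache: true`, `persist: true`), so data/processed is not removed before the stage re-runs. With the same 0.x-style `dvc run` syntax that corresponds to `--outs-persist` instead of a plain `-o`; a sketch under that assumption:

    dvc run -f process_data.dvc \
        -d data/raw -d src/data/make_dataset.py \
        --outs-persist data/processed \
        python src/data/make_dataset.py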

+ 8 - 0
raw_data.dvc

@@ -0,0 +1,8 @@
+md5: a77d208c8b62fbf7b40e1cda06fd2e1b
+outs:
+- cache: true
+  md5: d751713988987e9331980363e24189ce.dir
+  metric: false
+  path: data/raw
+  persist: false
+wdir: .
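
Unlike the pipeline stages, raw_data.dvc has no `cmd` or `deps`; it only tracks data/raw as a cached output, which is the shape `dvc add` produces. A sketch, assuming the `--file` option of the DVC version in use to control the stage-file name:

    dvc add data/raw --file raw_data.dvc   # afterwards, `dvc commit raw_data.dvc` refreshes the snapshot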

+ 0 - 0
references/.gitkeep


+ 0 - 0
reports/.gitkeep


+ 0 - 0
reports/figures/.gitkeep


+ 0 - 0
reports/metrics.txt


+ 0 - 0
reports/training_metrics.txt


+ 9 - 0
requirements.txt

@@ -0,0 +1,9 @@
+# local package
+-e .
+
+# external requirements
+click
+coverage
+awscli
+flake8
+python-dotenv>=0.5.1

+ 7 - 0
setup.py

@@ -0,0 +1,7 @@
+from setuptools import find_packages, setup
+
+setup(
+    name='src',
+    packages=find_packages(),
+    version='0.1.0',
+)

+ 0 - 0
src/__init__.py


+ 0 - 0
src/data/.gitkeep


+ 0 - 0
src/data/__init__.py


+ 0 - 0
src/data/make_dataset.py


+ 0 - 0
src/models/.gitkeep


+ 0 - 0
src/models/__init__.py


+ 0 - 0
src/models/predict_model.py


+ 0 - 0
src/models/train_model.py


+ 0 - 0
src/visualization/.gitkeep


+ 0 - 0
src/visualization/__init__.py


+ 0 - 0
src/visualization/visualize.py


+ 3 - 0
tox.ini

@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 79
+max-complexity = 10
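
This `[flake8]` section is what `make lint` picks up: flake8 reads tox.ini automatically when invoked from the project root, so the limits apply without extra flags; for example:

    flake8 src    # flags lines longer than 79 characters and functions with McCabe complexity above 10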

+ 19 - 0
train.dvc

@@ -0,0 +1,19 @@
+cmd: python src/models/train_model.py
+deps:
+- md5: d41d8cd98f00b204e9800998ecf8427e
+  path: src/models/train_model.py
+- md5: d751713988987e9331980363e24189ce.dir
+  path: data/processed
+md5: aae5283d45ea683bcce139f2f98beabb
+outs:
+- cache: true
+  md5: d751713988987e9331980363e24189ce.dir
+  metric: false
+  path: models
+  persist: true
+- cache: false
+  md5: d41d8cd98f00b204e9800998ecf8427e
+  metric: true
+  path: reports/training_metrics.txt
+  persist: false
+wdir: .
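
With all four stage files in place the graph runs raw_data.dvc -> process_data.dvc -> train.dvc -> eval.dvc, and the outputs flagged `metric: true` can be inspected with DVC's metrics command; a short sketch:

    dvc repro eval.dvc    # re-runs only the stages whose dependencies changed
    dvc metrics show      # prints reports/metrics.txt and reports/training_metrics.txt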