preprocess.py

import logging
import os
import pathlib
import pickle
import glob
import warnings
from pathlib import Path

from joblib import Parallel, delayed
from tqdm import tqdm

import catboost as cb
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator
import xgboost as xgb
from sklearn.compose import *
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import *
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import *
from sklearn.pipeline import *
from sklearn.preprocessing import *
from sklearn.tree import *
from skimage.transform import resize

warnings.filterwarnings("ignore")

logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%d-%b-%y %H:%M:%S", level=logging.INFO
)

SEED = 1121218


def resize_image(path, target_size=(224, 224), save_path="data/processed/train"):
    """Resize a single image and save it under `save_path`, keeping its filename."""
    root_path = Path(path)
    img = plt.imread(root_path)
    img = resize(img, target_size)
    target_path = (Path(save_path) / root_path.stem).with_suffix(root_path.suffix)
    plt.imsave(target_path, img)


def execute_parallel(func, iterator):
    """Apply `func` to every item of `iterator` in parallel, with a progress bar."""
    Parallel(n_jobs=3, backend="multiprocessing")(
        delayed(func)(item) for item in tqdm(iterator)
    )


def load_tf_datasets():
    """Create training and validation image generators from the raw train CSV."""
    train_df = pd.read_csv("data/raw/train.csv")
    # Image files are named after the Id column.
    train_df["filename"] = train_df["Id"] + ".jpg"

    img_size = (224, 224)
    rescale = 1.0 / 255.0
    data_generator = ImageDataGenerator(rescale=rescale, validation_split=0.2)

    gen_kwargs = dict(
        dataframe=train_df,
        directory="data/raw/train",
        x_col="filename",
        y_col="Pawpularity",
        batch_size=32,
        seed=SEED,
        shuffle=True,
        class_mode="raw",
        target_size=img_size,
    )

    train_generator = data_generator.flow_from_dataframe(**gen_kwargs, subset="training")
    validation_generator = data_generator.flow_from_dataframe(
        **gen_kwargs, subset="validation"
    )

    return train_generator, validation_generator


if __name__ == "__main__":
    pass