Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.py 1.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. # Consts
  4. CLASS_LABEL = 'MachineLearning'
  5. raw_df_path = 'data/raw.csv'
  6. train_df_path = 'data/train.csv'
  7. test_df_path = 'data/test.csv'
  8. def feature_engineering(raw_df):
  9. df = raw_df.copy()
  10. df['CreationDate'] = pd.to_datetime(df['CreationDate'])
  11. df['CreationDate_Epoch'] = df['CreationDate'].astype('int64') // 10 ** 9
  12. df = df.drop(columns=['Id', 'Tags'])
  13. df['Title_Len'] = df.Title.str.len()
  14. df['Body_Len'] = df.Body.str.len()
  15. # Drop the correlated features
  16. df = df.drop(columns=['FavoriteCount'])
  17. df['Text'] = df['Title'].fillna('') + ' ' + df['Body'].fillna('')
  18. return df
  19. def split(random_state=42):
  20. print('Loading data...')
  21. df = pd.read_csv(raw_df_path)
  22. df[CLASS_LABEL] = df['Tags'].str.contains('machine-learning').fillna(False)
  23. train_df, test_df = train_test_split(df, random_state=random_state, stratify=df[CLASS_LABEL])
  24. print('Engineering features...')
  25. train_df = feature_engineering(train_df)
  26. test_df = feature_engineering(test_df)
  27. print('Saving split data...')
  28. train_df.to_csv(train_df_path)
  29. test_df.to_csv(test_df_path)
  30. if __name__ == '__main__':
  31. split()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...