pre_process.py
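This script streams the raw Reddit posts CSV in fixed-size chunks, engineers a few simple features (text lengths, a thumbnail flag), one-hot encodes post flairs, filters out deleted posts, and appends a stratified train/test split of each chunk to the processed CSVs.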

import os

import pandas as pd
from sklearn.model_selection import train_test_split

import src.reddit_utils as r_utils
from src.utilities import read_yaml

params = read_yaml("params.yaml", "pre_process")

CHUNK_SIZE = params["chunk_size"]
TARGET_LABEL = params["target_col"]

UNIQUE_FLAIRS = [
    "Discussion",
    "Project",
    "Research",
    "None",
    "News",
    "Shameless Self Promo",
    "Inaccurate",
    "Misleading",
    "Clickbait",
]


def load_and_process_data(random_state=42):
    print("Loading data in chunks...")
    raw_data = os.path.join("data/raw", r_utils.RAW_DF_PATH)
    processed_train = os.path.join("data/processed", r_utils.TRAIN_DF_PATH)
    processed_test = os.path.join("data/processed", r_utils.TEST_DF_PATH)

    # Stream the raw CSV in chunks so the full dataset never has to fit in memory.
    for i, chunk in enumerate(pd.read_csv(raw_data, chunksize=CHUNK_SIZE)):
        print(f"Processing chunk {i + 1}...")
        processed_data = process(chunk)

        print("Splitting into train and test data...")
        # Stratify on the target so each chunk's split preserves the label balance.
        # No test_size is passed, so sklearn's default 25% test split applies.
        train_chunk, test_chunk = train_test_split(
            processed_data,
            random_state=random_state,
            stratify=processed_data[TARGET_LABEL],
        )

        print("Saving to cloud...")
        save_data(train_chunk, processed_train, test_chunk, processed_test, i)


def process(chunk):
    df = chunk.copy()
    df = df.drop(columns=["id", "author"])
    df = df.rename(columns={"selftext": "body", "link_flair_text": "flair"})

    # Simple length features for the title and body text.
    df["title_len"] = df.title.str.len()
    df["body_len"] = df.body.str.len()

    # Reddit uses "self" and "default" as placeholder thumbnails.
    df["has_thumbnail"] = [
        0 if (x == "self" or x == "default") else 1 for x in df["thumbnail"]
    ]

    df = df.fillna({"body": "", "flair": "None", "body_len": 0})

    # Fold the misspelled "Discusssion" flair in the raw data into "Discussion".
    df["flair"] = ["Discussion" if (x == "Discusssion") else x for x in df["flair"]]

    # One-hot encode the flair column, then drop the original.
    df = pd.concat([df, pd.get_dummies(df["flair"], prefix="flair")], axis=1).drop(
        ["flair"], axis=1
    )

    # A chunk may be missing some flairs entirely; add those columns as all-zero
    # so every chunk is written with the same schema.
    for flair in UNIQUE_FLAIRS:
        flair_with_prefix = "flair_" + flair
        if flair_with_prefix not in df.columns:
            df[flair_with_prefix] = 0

    # Drop posts whose content was deleted or removed.
    df = df[df["title"] != "[deleted by user]"]
    df = df[df["body"] != "[deleted]"]
    df = df[df["body"] != "[removed]"]

    df["title_and_body"] = (df["title"] + " " + df["body"]).astype(str)
    return df


def save_data(train_chunk, train_f, test_chunk, test_f, i):
    # Append each chunk to the output CSVs, writing the header only once.
    header = i == 0
    train_chunk.to_csv(train_f, header=header, mode="a")
    test_chunk.to_csv(test_f, header=header, mode="a")


if __name__ == "__main__":
    load_and_process_data()
    print("Loading and processing done!")
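The configuration values come from the pre_process section of params.yaml via the project's read_yaml helper in src/utilities, whose implementation is not shown on this page. A minimal sketch of how such a helper could work, assuming PyYAML (both the implementation and the sample config values below are assumptions, not the project's actual code):

import yaml

# Hypothetical stand-in for src.utilities.read_yaml: load a YAML file and
# return one named top-level section as a plain dict.
def read_yaml(file_path, section):
    with open(file_path) as f:
        return yaml.safe_load(f)[section]

# An illustrative params.yaml section (the key names chunk_size and
# target_col come from the script above; the values are made up):
#
# pre_process:
#   chunk_size: 50000
#   target_col: flair_Project

Note that save_data opens both output CSVs in append mode, so leftover files in data/processed from an earlier run should be deleted before rerunning the script; otherwise the new chunks are appended onto the old ones.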