Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

make_train.py 1.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
  1. import numpy as np
  2. import pandas as pd
  3. import utils.preprocessing as preprocessing
  4. import utils.feature_generation as feature_generation
  5. PARTITION_NAME_TEMPLATE = "./data_parts/labeled_comments_partition{}.npy"
  6. PARTITION_COUNT = 2
  7. def preprocess(code_blocks):
  8. prep_pipeline = [
  9. preprocessing.trim_symbols,
  10. preprocessing.single_lines,
  11. preprocessing.multiple_lines,
  12. preprocessing.extract_comments,
  13. ]
  14. for prep_func in prep_pipeline:
  15. code_blocks = code_blocks.apply(prep_func, axis=1)
  16. comments = []
  17. for block_comments in code_blocks["comments"]:
  18. for comment_data in block_comments:
  19. comments.append(comment_data[1])
  20. comments = np.array(comments)
  21. return pd.DataFrame(data=comments.reshape((-1, 1)), columns=["comment"])
  22. def load_code_blocks():
  23. all_blocks = pd.read_csv("../data/code_blocks_clean.csv")
  24. all_blocks = all_blocks["code_block"].to_frame()
  25. comment_blocks_idx = (
  26. all_blocks["code_block"].str.contains("#") |
  27. (all_blocks["code_block"].str.contains("'''") &
  28. (all_blocks["code_block"].str.count("'''") % 2 == 0)) |
  29. (all_blocks["code_block"].str.contains('"""') &
  30. (all_blocks["code_block"].str.count('"""') % 2 == 0))
  31. )
  32. return preprocess(all_blocks[comment_blocks_idx].reset_index())
  33. target = None
  34. for part_id in range(PARTITION_COUNT):
  35. part_data = np.load(PARTITION_NAME_TEMPLATE.format(part_id))
  36. if target is None:
  37. target = part_data
  38. continue
  39. labeled_idx = part_data >= 0
  40. target[labeled_idx] = part_data[labeled_idx]
  41. train_mask = target >= 0
  42. determined_target = target[train_mask]
  43. all_comments = load_code_blocks()
  44. comment_df = feature_generation.preprocess_comments(all_comments)
  45. train_df = comment_df[train_mask]
  46. train_df["is_good_comment"] = determined_target
  47. train_df.to_csv("train.csv")
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...