
piplines_creation.py

# -*- coding: utf-8 -*-
"""piplines_creation.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1k0KJokr1_jAQsAimwlt3W7zRVsuL2yDr
"""
import argparse

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

parser = argparse.ArgumentParser()
parser.add_argument("MURKUP_DATA_PATH", help="path to your input CSV", type=str)
parser.add_argument("GRAPH_ID2NAME", help="path to graph id2name data", type=str)
parser.add_argument("PIPELINES_PATH", help="path to save pipelines", type=str)
args = parser.parse_args()

DATASET_PATH = args.MURKUP_DATA_PATH
GRAPH_ID2NAME_PATH = args.GRAPH_ID2NAME
PIPELINES_PATH = args.PIPELINES_PATH

df = pd.read_csv(DATASET_PATH, sep=';')
graph = pd.read_csv(GRAPH_ID2NAME_PATH)
graph.rename(columns={'id': 'graph_vertex_id'}, inplace=True)
df = pd.merge(df, graph)

# Check whether any code blocks were assigned to different graph vertices by different annotators:
duplicated_blocks = df[df["code_block_id"].duplicated(keep=False)]
conflicted_blocks = duplicated_blocks.groupby("code_block_id")["graph_vertex_id"].nunique() != 1
print(conflicted_blocks[conflicted_blocks])  # blocks whose annotations disagree

# If no block has conflicting annotations, keep a single row per code block.
if conflicted_blocks.sum() == 0:
    df = df.drop_duplicates('code_block_id')


# Build a pipeline (the sequence of graph vertices) for each notebook.
def group_by_notebooks(data, vertex_col='graph_vertex_subclass', len_col='len', notebook_id_col='kaggle_id',
                       competition_id_col='competition_id', competition_name_col='comp_name',
                       code_block_col='code_block_id') -> pd.DataFrame:
    notebook_cols = [notebook_id_col, vertex_col, len_col, competition_id_col, competition_name_col, code_block_col]
    df = pd.DataFrame(columns=notebook_cols)
    for i, notebook_id in enumerate(data[notebook_id_col].unique()):
        if not pd.isnull(notebook_id):
            notebook = data[data[notebook_id_col] == notebook_id].reset_index(drop=True).sort_values(code_block_col)
            vertices_seq = " ".join(notebook[vertex_col])
            code_block_ids = " ".join(str(x) for x in notebook[code_block_col])
            length = len(notebook[code_block_col])
            competition_id = notebook[competition_id_col].unique()[0]
            competition_name = notebook[competition_name_col].unique()[0]
            row = [notebook_id, vertices_seq, length, competition_id, competition_name, code_block_ids]
            df.loc[i] = row
            print('notebook #{} done'.format(notebook_id))
    return df


piplines = group_by_notebooks(df)
print(piplines['len'].describe())  # summary statistics of pipeline lengths
piplines.to_csv(PIPELINES_PATH)
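
For reference, here is a minimal, hypothetical smoke test of the script. The toy input files and their column names (code_block_id, kaggle_id, competition_id, comp_name, graph_vertex_id, id, graph_vertex_subclass) are only inferred from what the code reads; the real markup and id2name files may carry additional columns or split them differently.

# Minimal smoke test (assumed schema, not the official dataset layout).
import subprocess
import pandas as pd

# Markup data: one annotated code block per row, semicolon-separated.
pd.DataFrame({
    "code_block_id": [1, 2, 3],
    "kaggle_id": ["nb_a", "nb_a", "nb_b"],
    "competition_id": [10, 10, 11],
    "comp_name": ["titanic", "titanic", "house-prices"],
    "graph_vertex_id": [100, 101, 100],
}).to_csv("murkup_data.csv", sep=";", index=False)

# Graph id -> name mapping; 'id' is renamed to 'graph_vertex_id' before the merge.
pd.DataFrame({
    "id": [100, 101],
    "graph_vertex_subclass": ["load_data", "train_model"],
}).to_csv("graph_id2name.csv", index=False)

# Positional arguments: markup CSV, id2name CSV, output path for the pipelines.
subprocess.run(
    ["python", "piplines_creation.py", "murkup_data.csv", "graph_id2name.csv", "pipelines.csv"],
    check=True,
)
print(pd.read_csv("pipelines.csv"))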