prepare_data.py

# prepare_data.py
# Build a language-modelling text corpus from scraped GitHub repo content.

import os
from operator import itemgetter

import pandas as pd
import tqdm

from github_search import text_preprocessing

# Load the raw scraped repositories (one JSON record per line).
input_file_path = 'github_repos.json'
raw_df = pd.read_json(os.path.join('data', input_file_path), lines=True)
raw_df.head()  # preview of the raw frame (notebook leftover, no effect in a script)

# Each record's 'language' field is a list of {'name': ...} dicts;
# flatten it to a plain list of language names.
raw_df['languages'] = raw_df['language'].apply(lambda ds: [d['name'] for d in ds])

# Keep only repos that use at least one of the selected languages.
selected_langs = ['python', 'r', 'matlab', 'julia', 'c++', 'java', 'scala']
df = raw_df[raw_df['languages'].apply(
    lambda langs: any(lang.lower() in selected_langs for lang in langs))]
df = df.drop(['language'], axis=1)

# Drop repos with missing or very short content (25 tokens or fewer).
df = df[~df['content'].isna()]
df = df[df['content'].str.split().apply(len) > 25]

# Drop content that starts with '<' or ends with '>' (likely raw HTML/markup).
df = df[(df['content'].apply(itemgetter(0)) != '<')
        & (df['content'].apply(itemgetter(-1)) != '>')]

n_examples = 10000
print('selected_n_examples: {}'.format(n_examples))

# Take the first n_examples rows and save the raw-text subset.
lm_df = df[['repo_name', 'languages', 'content']][:n_examples]
lm_df = lm_df.dropna()
lm_df[['content']].to_csv('github_repos_lm_text_small.csv')
lm_df.index = pd.RangeIndex(len(lm_df))

# Tokenize the markdown content and join the tokens back into plain text.
extracted_content = pd.Series(
    [text_preprocessing.tokenize_markdown(md_string)
     for md_string in tqdm.tqdm(lm_df['content'])])
lm_df['text'] = extracted_content.apply(' '.join)

# Drop rows where tokenization produced nothing.
lm_df = lm_df[(~lm_df['text'].isna()) & (lm_df['text'].apply(len) > 0)]
print('filtered_n_examples: {}'.format(lm_df.shape[0]))

out_file = 'github_repos_lm_text.csv'
print('saving results to: {}'.format(out_file))
lm_df[['text']].to_csv(out_file)

# ### Load to fastai API (left disabled; fastai v1-style sketch)
bs = 64
#import fastai
#import nltk
#
#def tok_fn(lang):
#    # Wrap the markdown tokenizer in fastai's BaseTokenizer interface.
#    tok = fastai.text.transform.BaseTokenizer('none')
#    tok.tokenizer = text_preprocessing.tokenize_markdown
#    return tok
#
#markdown_tokenizer = fastai.text.Tokenizer(
#    tok_func=tok_fn,
#    pre_rules=[],
#    post_rules=[],
#    special_cases=[],
#    n_cpus=12)
#
#data_lm = load_data('', 'data_lm_export.pkl', bs=bs, bptt=50)
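
For reference, here is a minimal sketch of the shape of one line of data/github_repos.json, inferred only from the columns the script reads (repo_name, language, content); the values below are hypothetical and real records may carry additional fields.

# A hypothetical single JSONL record, matching what prepare_data.py consumes.
# Field values are illustrative, not taken from the actual dataset.
example_record = {
    "repo_name": "someuser/somerepo",                    # assumed example value
    "language": [{"name": "Python"}, {"name": "C++"}],   # list of {'name': ...} dicts
    "content": "# somerepo\nA README body with more than twenty-five tokens ...",
}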