Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.py 3.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
  1. ##############################################################################################
  2. ################ Cleaning and Preprocessing the publications related to COVID-19 #############
  3. ##############################################################################################
  4. ########################################################################
  5. # Importing the required libraries.
  6. import csv, pandas as pd, sys
  7. from abc import ABC, abstractmethod
  8. ########################################################################
  9. class Preprocess(ABC):
  10. # Datasets enumeration.
  11. ARXIV = "arxiv"
  12. BIORXIV = "biorxiv"
  13. PUBMED = "pubmed"
  14. SCOPUS = "scopus"
  15. FINAL = "final"
  16. # Method that perform the specific process of cleaning and preprocessing.
  17. @abstractmethod
  18. def _preprocess(self):
  19. pass
  20. # Method that perform the generic process of cleaning and preprocessing.
  21. def process_raw_data(self, raw_data, preprocessed_data, dtypes_raw_data = None):
  22. # Creating a dataframe from the raw data.
  23. self._dataframe = pd.read_csv(raw_data, header=0, dtype=dtypes_raw_data)
  24. # Cleaning and preprocessing the raw data.
  25. self._preprocess()
  26. # Exporting the data to CSV file.
  27. self._dataframe.to_csv(preprocessed_data, index=False, quoting=csv.QUOTE_ALL)
  28. # Method that generate a specific process of cleaning and preprocessing.
  29. @staticmethod
  30. def factory_process(dataset):
  31. if dataset == Preprocess.ARXIV:
  32. from preprocess_arxiv import ProcessArxiv
  33. return ProcessArxiv()
  34. elif dataset == Preprocess.BIORXIV:
  35. from preprocess_biorxiv import ProcessBiorxiv
  36. return ProcessBiorxiv()
  37. elif dataset == Preprocess.PUBMED:
  38. from preprocess_pubmed import ProcessPubmed
  39. return ProcessPubmed()
  40. elif dataset == Preprocess.SCOPUS:
  41. from preprocess_scopus import ProcessScopus
  42. return ProcessScopus()
  43. elif dataset == Preprocess.FINAL:
  44. from preprocess_final import ProcessFinal
  45. return ProcessFinal()
  46. else:
  47. raise FileNotFoundError("This dataset does not exist.")
  48. # Executing the cleaning and preprocessing process of raw data.
  49. if __name__ == "__main__":
  50. if len(sys.argv) == 1:
  51. Preprocess.factory_process(Preprocess.ARXIV).process_raw_data(
  52. "data/raw/arxiv_raw.csv", "data/prepared/arxiv_covid_19.csv")
  53. Preprocess.factory_process(Preprocess.BIORXIV).process_raw_data(
  54. "data/raw/biorxiv_raw.csv", "data/prepared/biorxiv_covid_19.csv")
  55. Preprocess.factory_process(Preprocess.PUBMED).process_raw_data(
  56. "data/raw/pubmed_raw.csv", "data/prepared/pubmed_covid_19.csv",
  57. {"pubmed_id": "str"})
  58. Preprocess.factory_process(Preprocess.SCOPUS).process_raw_data(
  59. "data/raw/scopus_raw.csv", "data/prepared/scopus_covid_19.csv",
  60. {"id": "str", "eid": "str", "pii": "str", "pubmed_id": "str"})
  61. elif sys.argv[1] == "final":
  62. Preprocess.factory_process(Preprocess.FINAL).process_raw_data(
  63. "data/raw/final_raw.csv", "data/prepared/final_covid_19.csv",
  64. {"id": "str", "pubmed_id": "str"})
Tip!

Press p to see the previous file or n to see the next file

Comments

Loading...