Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess_scopus.py 5.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
  1. ##############################################################################################
  2. ############ Cleaning and Preprocessing the Scopus publications related to COVID-19 ##########
  3. ##############################################################################################
  4. # For collecting the Scopus publications related to COVID-19, we used the "pybliometrics"
  5. # library. It is available at https://pypi.org/project/pybliometrics/.
  6. ########################################################################
  7. # Importing the required libraries.
  8. import re, pandas as pd, numpy as np
  9. from preprocess import Preprocess
  10. ########################################################################
  class ProcessScopus(Preprocess):
      """Clean and preprocess the Scopus publications related to COVID-19.

      All work is done on ``self._dataframe``. That attribute is not created in
      this class, so it is presumably supplied by ``Preprocess`` (TODO confirm).
      """

      @staticmethod
      def _clean_affiliation_name(name):
          """Strip zero-width (U+200B) and narrow no-break (U+202F) spaces, then trim.

          ``None`` is passed through unchanged so that a record without an
          affiliation name does not raise ``AttributeError``.
          """
          if name is None:
              return None
          return name.replace("\u200b", "").replace("\u202f", "").strip()

      @staticmethod
      def _clean_terms(terms):
          """Return ``terms`` as a tuple with U+FEFF removed and each item trimmed."""
          return tuple(term.replace("\ufeff", "").strip() for term in terms)

      def _apply_to_non_null(self, column, func):
          """Replace every non-null value of ``column`` with ``func(value)``.

          The write goes through a single ``.loc[rows, column]`` call instead of
          chained indexing (``frame.column.loc[rows] = ...``), which pandas does
          not guarantee to update the frame.
          """
          present = self._dataframe[column].notnull()
          self._dataframe.loc[present, column] = (
              self._dataframe.loc[present, column].apply(func))

      # Cleaning and preprocessing the dataframe.
      def _preprocess(self):
          """Clean ``self._dataframe`` in place; returns ``None``.

          Steps, in order: drop rows without ``id``/``eid``, turn NaN into
          ``None``, default missing citation/reference counts to zero, fill
          ``abstract`` from ``description``, overwrite ``vehicle_name`` with
          ``conference_name`` where both are set, drop unused columns, convert
          column types, and strip stray characters from text fields.
          """
          # Removing the invalid articles. The explicit copy makes the filtered
          # frame independent of the original, so the in-place edits below are
          # not applied to a slice.
          valid = self._dataframe["id"].notnull() & self._dataframe["eid"].notnull()
          self._dataframe = self._dataframe.loc[valid].copy()
          df = self._dataframe  # Alias only; every edit below mutates this frame.

          # Defining the "None" value for the "NaN" values.
          df.replace({np.nan: None}, inplace=True)

          # Defining the "zero" value for the articles without numbers of
          # citation and references.
          for column in ("citation_num", "ref_count"):
              df.loc[df[column].isnull(), column] = 0

          # Normalizing the feature "abstract": use the description when the
          # abstract is missing.
          use_description = df["abstract"].isnull() & df["description"].notnull()
          df.loc[use_description, "abstract"] = df.loc[use_description, "description"]

          # Normalizing the feature "vehicle_name": the conference name replaces
          # the vehicle name on rows where both are present.
          use_conference = df["conference_name"].notnull() & df["vehicle_name"].notnull()
          df.loc[use_conference, "vehicle_name"] = df.loc[use_conference, "conference_name"]

          # Removing unnecessary columns.
          columns_drop = ["eid", "pii", "description", "isbn", "conf_location",
                          "conference_name", "vehicle_address", "title_edition"]
          df.drop(columns=columns_drop, inplace=True)

          # Changing the type of features. The columns are reassigned (rather
          # than written through ``.loc[:, cols]``) so the integer dtype is
          # actually adopted.
          count_columns = ["citation_num", "ref_count"]
          df[count_columns] = df[count_columns].astype("int")

          # NOTE(review): eval() executes arbitrary expressions. If these cells
          # can come from untrusted input, consider ast.literal_eval instead;
          # left unchanged because the stored format is not visible here.
          for column in ("auth_keywords", "index_terms", "affiliations",
                         "subject_areas", "authors", "author_affil", "references"):
              self._apply_to_non_null(column, eval)
          df["publication_date"] = pd.to_datetime(df["publication_date"])

          # Normalizing the feature "abstract": remove escaped sequences left
          # in the text.
          self._apply_to_non_null(
              "abstract",
              lambda text: text.replace("\\u0019", "").replace("\\%", "%").replace(
                  "\\s", "s").strip())

          # Normalizing the items contained in the features "auth_keywords" and
          # "index_terms".
          for column in ("auth_keywords", "index_terms"):
              self._apply_to_non_null(column, self._clean_terms)

          # Normalizing the affiliations contained in the features
          # "affiliations" and "author_affil".
          clean_name = self._clean_affiliation_name
          self._apply_to_non_null(
              "affiliations",
              lambda affils: tuple(
                  {"id": affil["id"],
                   "affiliation": clean_name(affil["affiliation"]),
                   "country": affil["country"]}
                  for affil in affils))
          self._apply_to_non_null(
              "author_affil",
              lambda items: tuple(
                  {"id": item["id"], "name": item["name"],
                   "affil_id": item["affil_id"],
                   # Empty names are stored as None, as before.
                   "affiliation": (clean_name(item["affiliation"])
                                   if item["affiliation"] else None),
                   "country": item["country"]}
                  for item in items))

          # NOTE(review): the page's line gutter lists 91 lines but only 79 are
          # visible, so any later steps of this method are not reproduced here.
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...