snambi
/
dib-covid-dataset
forked from breno-madruga/dib-covid-dataset


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
            ##############################################################################################
############ Cleaning and Preprocessing the Scopus publications related to COVID-19 ##########
##############################################################################################

# To collect the Scopus publications related to COVID-19, we used the "pybliometrics"
# library. It is available at https://pypi.org/project/pybliometrics/.

########################################################################
# Importing the required libraries.
import re, pandas as pd, numpy as np
from preprocess import Preprocess
########################################################################

class ProcessScopus(Preprocess):
    """Clean and preprocess the Scopus publications related to COVID-19.

    The only behaviour defined here is ``_preprocess``, which rewrites
    ``self._dataframe``. How that attribute gets populated and when
    ``_preprocess`` is invoked is decided by the ``Preprocess`` base class,
    which is not visible in this file.
    """

    # Cleaning and preprocessing the dataframe.
    def _preprocess(self):
        """Normalize ``self._dataframe`` and store the cleaned frame back.

        Steps, in order: drop rows without ``id``/``eid``; map NaN to None;
        zero-fill the citation/reference counters; fill missing abstracts from
        ``description``; overwrite ``vehicle_name`` with ``conference_name``;
        drop helper columns; cast counters to int; parse the stringified
        container columns; parse ``publication_date``; strip stray characters
        from abstracts, keywords, index terms and affiliation names.

        All writes use a single ``df.loc[mask, column] = ...`` step on an
        explicit copy. Chained assignment (``df.column.loc[mask] = ...``)
        raises ``SettingWithCopyWarning`` and does not modify the frame at all
        when pandas Copy-on-Write is enabled.
        """
        # Removing the invalid articles. The explicit copy makes the result an
        # independent frame, so the in-place operations below are well defined.
        source = self._dataframe
        df = source.loc[source["id"].notnull() & source["eid"].notnull()].copy()
        self._dataframe = df

        # Defining the "None" value for the "NaN" values.
        df.replace({np.nan: None}, inplace=True)

        # Defining the "zero" value for the articles without numbers of citation and references.
        count_columns = ["citation_num", "ref_count"]
        for column in count_columns:
            df.loc[df[column].isnull(), column] = 0

        # Normalizing the feature "abstract": use "description" when "abstract" is missing.
        use_description = df["abstract"].isnull() & df["description"].notnull()
        df.loc[use_description, "abstract"] = df.loc[use_description, "description"]

        # Normalizing the feature "vehicle_name".
        # NOTE(review): this only overwrites rows where "vehicle_name" is already
        # set; if the intent was to fill *missing* names, the second condition
        # should be isnull(). Behaviour kept as-is -- confirm with the data owner.
        use_conference = df["conference_name"].notnull() & df["vehicle_name"].notnull()
        df.loc[use_conference, "vehicle_name"] = df.loc[use_conference, "conference_name"]

        # Removing unnecessary columns.
        columns_drop = ["eid", "pii", "description", "isbn", "conf_location", "conference_name",
            "vehicle_address", "title_edition"]
        df.drop(columns=columns_drop, inplace=True)

        # Changing the type of features.
        # Whole-column assignment replaces the columns (and their dtype);
        # "df.loc[:, cols] = ..." writes into the existing columns in place on
        # recent pandas and would keep the previous dtype.
        df[count_columns] = df[count_columns].astype("int")

        # SECURITY: eval() executes arbitrary Python found in these cells. This is
        # only acceptable for trusted, self-generated input. If the cells hold
        # plain literals, ast.literal_eval is the safe drop-in -- TODO confirm the
        # stored format before switching.
        literal_columns = ["auth_keywords", "index_terms", "affiliations", "subject_areas",
            "authors", "author_affil", "references"]
        for column in literal_columns:
            filled = df[column].notnull()
            df.loc[filled, column] = df.loc[filled, column].apply(eval)

        df["publication_date"] = pd.to_datetime(df["publication_date"])

        # Normalizing the feature "abstract": removing escaped artifacts.
        filled = df["abstract"].notnull()
        df.loc[filled, "abstract"] = df.loc[filled, "abstract"].apply(
            lambda text: text.replace("\\u0019", "").replace("\\%", "%").replace(
                "\\s", "s").strip())

        # Normalizing the items contained in the features "auth_keywords" and "index_terms".
        for column in ("auth_keywords", "index_terms"):
            filled = df[column].notnull()
            df.loc[filled, column] = df.loc[filled, column].apply(
                lambda terms: tuple(term.replace("\ufeff", "").strip() for term in terms))

        # Removes the zero-width space and narrow no-break space, then trims.
        def strip_invisible(name):
            return name.replace("\u200b", "").replace("\u202f", "").strip()

        # Normalizing the affiliations contained in the features "affiliations" and "author_affil".
        # A None affiliation name is passed through instead of raising, in line
        # with the guard already applied to "author_affil" below.
        filled = df["affiliations"].notnull()
        df.loc[filled, "affiliations"] = df.loc[filled, "affiliations"].apply(
            lambda affils: tuple(
                {"id": affil["id"],
                 "affiliation": (strip_invisible(affil["affiliation"])
                                 if affil["affiliation"] is not None else None),
                 "country": affil["country"]}
                for affil in affils))

        filled = df["author_affil"].notnull()
        df.loc[filled, "author_affil"] = df.loc[filled, "author_affil"].apply(
            lambda entries: tuple(
                {"id": entry["id"], "name": entry["name"],
                 "affil_id": entry["affil_id"],
                 "affiliation": (strip_invisible(entry["affiliation"])
                                 if entry["affiliation"] else None),
                 "country": entry["country"]}
                for entry in entries))