Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

make_dataset.py 2.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
  1. import sys
  2. import os
  3. import pandas as pd
  4. import re
  5. from scipy.sparse import save_npz
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.feature_extraction.text import TfidfVectorizer
  8. from nltk.stem import PorterStemmer
  9. def preprocessor(text):
  10. """Return a cleaned version of text"""
  11. # Remove HTML markup
  12. text = re.sub("<[^>]*>", "", text)
  13. # Save emoticons for later appending
  14. emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
  15. # Remove any non-word character and append the emoticons,
  16. # removing the nose character for standarization. Convert to lower case
  17. text = (
  18. re.sub("[\W]+", " ", text.lower()) + " " + " ".join(emoticons).replace("-", "")
  19. )
  20. return text
  21. def tokenizer_porter(text):
  22. porter = PorterStemmer()
  23. token = []
  24. for word in text.split():
  25. token.append(porter.stem(word))
  26. return token
  27. def preproc(data_path, output_folder):
  28. df = pd.read_csv(data_path)
  29. df["sentiment"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)
  30. X = df["review"]
  31. y = df["sentiment"]
  32. # split the dataset in train and test
  33. X_train, X_test, y_train, y_test = train_test_split(
  34. X, y, test_size=0.3, random_state=42
  35. )
  36. tfidf = TfidfVectorizer(
  37. tokenizer=tokenizer_porter, preprocessor=preprocessor, max_features=25000
  38. )
  39. X_train = tfidf.fit_transform(X_train)
  40. X_test = tfidf.transform(X_test)
  41. save_npz(output_folder + "X_train", X_train)
  42. save_npz(output_folder + "X_test", X_test)
  43. pd.DataFrame(y_train).to_csv(output_folder + "y_train.csv", index=False)
  44. pd.DataFrame(y_test).to_csv(output_folder + "y_test.csv", index=False)
  45. return X_train, X_test, y_train, y_test
  46. if __name__ == "__main__":
  47. if not (2 <= len(sys.argv) <= 3):
  48. print(
  49. "usage: %s <raw_data_file> <out_folder> (out_folder is optional)"
  50. % sys.argv[0],
  51. file=sys.stderr,
  52. )
  53. sys.exit(0)
  54. out_folder = sys.argv[2] if len(sys.argv) == 3 else "data/processed/"
  55. if not os.path.exists(out_folder):
  56. os.makedirs(out_folder)
  57. preproc(sys.argv[1], out_folder)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...