Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurization.py 2.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
  1. import os
  2. import sys
  3. import errno
  4. import pandas as pd
  5. import numpy as np
  6. import scipy.sparse as sparse
  7. from sklearn.feature_extraction.text import CountVectorizer
  8. from sklearn.feature_extraction.text import TfidfTransformer
  9. try:
  10. import cPickle as pickle
  11. except ImportError:
  12. import pickle
  13. np.set_printoptions(suppress=True)
  14. if len(sys.argv) != 3 and len(sys.argv) != 5:
  15. sys.stderr.write('Arguments error. Usage:\n')
  16. sys.stderr.write('\tpython featurization.py data-dir-path features-dir-path\n')
  17. sys.exit(1)
  18. train_input = os.path.join(sys.argv[1], 'train.tsv')
  19. test_input = os.path.join(sys.argv[1], 'test.tsv')
  20. train_output = os.path.join(sys.argv[2], 'train.pkl')
  21. test_output = os.path.join(sys.argv[2], 'test.pkl')
  22. try:
  23. reload(sys)
  24. sys.setdefaultencoding('utf-8')
  25. except NameError:
  26. pass
  27. def mkdir_p(path):
  28. try:
  29. os.makedirs(path)
  30. except OSError as exc: # Python >2.5
  31. if exc.errno == errno.EEXIST and os.path.isdir(path):
  32. pass
  33. else:
  34. raise
  35. def get_df(data):
  36. df = pd.read_csv(
  37. data,
  38. encoding='utf-8',
  39. header=None,
  40. delimiter='\t',
  41. names=['id', 'label', 'text']
  42. )
  43. sys.stderr.write('The input data frame {} size is {}\n'.format(data, df.shape))
  44. return df
  45. def save_matrix(df, matrix, output):
  46. id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
  47. label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
  48. result = sparse.hstack([id_matrix, label_matrix, matrix], format='csr')
  49. msg = 'The output matrix {} size is {} and data type is {}\n'
  50. sys.stderr.write(msg.format(output, result.shape, result.dtype))
  51. with open(output, 'wb') as fd:
  52. pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
  53. pass
  54. mkdir_p(sys.argv[2])
  55. # Generate train feature matrix
  56. df_train = get_df(train_input)
  57. train_words = np.array(df_train.text.str.lower().values.astype('U'))
  58. bag_of_words = CountVectorizer(stop_words='english',
  59. max_features=5000)
  60. bag_of_words.fit(train_words)
  61. train_words_binary_matrix = bag_of_words.transform(train_words)
  62. tfidf = TfidfTransformer(smooth_idf=False)
  63. tfidf.fit(train_words_binary_matrix)
  64. train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
  65. save_matrix(df_train, train_words_tfidf_matrix, train_output)
  66. # Generate test feature matrix
  67. df_test = get_df(test_input)
  68. test_words = np.array(df_test.text.str.lower().values.astype('U'))
  69. test_words_binary_matrix = bag_of_words.transform(test_words)
  70. test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
  71. save_matrix(df_test, test_words_tfidf_matrix, test_output)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...