Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

raw_code.py 5.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  1. # import modules
  2. import numpy as np
  3. import pandas as pd
  4. from collections import Counter
  5. from sklearn.feature_extraction.text import CountVectorizer
  6. from sklearn.metrics.pairwise import cosine_similarity
  7. import matplotlib.pyplot as plt
  8. import networkx as nx
  9. import plotly.offline as py
  10. import plotly.io as pio
  11. import sys
  12. import argparse
  13. from plot import plot_network_graph
  def compute_similarities(df, THRESHOLD):
      """Score every pair of recipes and list the pairs above a threshold.

      Each recipe is described by two bag-of-words views of its ingredient
      list, whose count matrices are placed side by side before the cosine
      similarity is computed:

      * one where every ingredient is collapsed into a single token
        (internal whitespace removed), so whole ingredients must match;
      * one where every word of every ingredient is its own token.

      Args:
          df: DataFrame with an ``ingredients`` column; each entry is an
              iterable of ingredient strings.
          THRESHOLD: Similarity value a pair must exceed to be reported.

      Returns:
          A tuple ``(similarity_csr, sim_recipes)``: the full pairwise
          similarity matrix, and an array of ``[row, col]`` positions whose
          similarity is greater than ``THRESHOLD``, excluding the diagonal.
      """
      ingredient_lists = list(df['ingredients'])
      vectorizer = CountVectorizer(lowercase=True, min_df=1, analyzer='word', stop_words=None)

      # View 1: "olive oil" -> "oliveoil", so an ingredient is one token.
      collapsed_docs = [
          " ".join("".join(ingredient.split()) for ingredient in recipe)
          for recipe in ingredient_lists
      ]
      dtm_whole_ingredients = vectorizer.fit_transform(collapsed_docs)

      # View 2: plain words. fit_transform refits the vectorizer, so the
      # vocabulary learned for view 1 is not reused here.
      word_docs = [" ".join(recipe) for recipe in ingredient_lists]
      dtm_single_words = vectorizer.fit_transform(word_docs)

      # Put the two feature spaces side by side (one row per recipe).
      dtm = np.hstack((dtm_whole_ingredients.toarray(), dtm_single_words.toarray()))

      # NOTE(review): `dtm` is a dense ndarray, so `dense_output=False`
      # presumably has no effect and the result is dense despite the `_csr`
      # name -- confirm against the scikit-learn documentation.
      similarity_csr = cosine_similarity(dtm, dense_output=False)

      # Positions above the threshold, minus each recipe paired with itself.
      pairs = np.argwhere(similarity_csr > THRESHOLD)
      off_diagonal = pairs[:, 0] != pairs[:, 1]
      return similarity_csr, pairs[off_diagonal]
  def build_graph(sim_recipes, RECIPE_INDEX, similarity_csr, THRESHOLD):
      """Collect recommendations up to three hops away and build their graph.

      Starting from ``RECIPE_INDEX``, recipes are grouped by the number of
      "similar" links needed to reach them: first order (directly similar),
      second order (similar to a first-order recipe) and third order. A
      recipe belongs only to the closest level at which it is found, and the
      starting recipe itself is never listed as a recommendation.

      Args:
          sim_recipes: Iterable of ``[source, target]`` index pairs that are
              considered similar.
          RECIPE_INDEX: Row index of the recipe to recommend for.
          similarity_csr: Square pairwise similarity matrix indexed by the
              same positions used in ``sim_recipes``.
          THRESHOLD: Similarity value a pair must exceed to get an edge.

      Returns:
          A tuple ``(all_recommendations, original_recipe_idx, first, second,
          third, G)`` where ``all_recommendations`` is the sorted list of
          original indices (starting recipe included), ``original_recipe_idx``
          is the starting recipe's position inside that list, ``first`` /
          ``second`` / ``third`` are the positions inside that list of each
          recommendation level, and ``G`` is the networkx graph built from
          the thresholded sub-matrix (its nodes are positions in
          ``all_recommendations``, not original indices).
      """
      # Sets give O(1) membership tests; the original scanned lists (and
      # rebuilt `first_order + second_order`) for every candidate.
      first_order = {pair[1] for pair in sim_recipes if pair[0] == RECIPE_INDEX}

      second_order = {pair[1] for pair in sim_recipes if pair[0] in first_order}
      second_order -= first_order
      second_order.discard(RECIPE_INDEX)

      third_order = {pair[1] for pair in sim_recipes if pair[0] in second_order}
      third_order -= first_order | second_order
      third_order.discard(RECIPE_INDEX)

      # Every recipe of interest, by original index, in ascending order.
      all_recommendations = sorted(
          {RECIPE_INDEX} | first_order | second_order | third_order
      )

      # Keep only the rows/columns of interest. The narrowed matrix is
      # re-indexed from 0, so positions below refer to `all_recommendations`.
      keep = np.array(all_recommendations)
      recommendation_csr = similarity_csr[keep[:, None], keep]

      # Connect only pairs whose similarity exceeds the threshold.
      direct_recommendation_csr = recommendation_csr > THRESHOLD

      # Translate original indices into positions within the narrowed matrix.
      original_recipe_idx = all_recommendations.index(RECIPE_INDEX)
      first = [pos for pos, recipe in enumerate(all_recommendations) if recipe in first_order]
      second = [pos for pos, recipe in enumerate(all_recommendations) if recipe in second_order]
      third = [pos for pos, recipe in enumerate(all_recommendations) if recipe in third_order]

      # `nx.from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array`
      # is its replacement and has been available since NetworkX 2.0.
      # NOTE(review): the diagonal (each recipe vs. itself) is kept, so the
      # graph gets self-loops wherever self-similarity exceeds THRESHOLD, as
      # before -- confirm the plotting code expects that.
      G = nx.from_numpy_array(np.asarray(direct_recommendation_csr))
      return all_recommendations, original_recipe_idx, first, second, third, G
  72. # map a color to the recommendation level
  def create_visualization(original_recipe_idx, first, second, third, df, all_recommendations, G, THRESHOLD):
      """Colour the recommendation graph by distance, save it and display it.

      Args:
          original_recipe_idx: Graph node of the recipe the user asked about.
          first: Graph nodes of the first-order recommendations.
          second: Graph nodes of the second-order recommendations.
          third: Graph nodes of the third-order recommendations.
          df: DataFrame holding the recipes; its ``id`` column labels nodes.
          all_recommendations: Index labels of ``df`` for the graph's nodes.
          G: Graph handed to ``plot_network_graph``.
          THRESHOLD: Similarity threshold, shown in the figure title.

      Returns:
          Whatever ``plotly.offline.iplot`` returns for the figure.

      Side effects:
          Writes the figure to ``../figures/raw_code_graph_output.html``.
      """
      # Colour code per node: 0 = requested recipe, 1/2/3 = recommendation
      # level. Later levels overwrite earlier ones if a node appears twice.
      level_by_node = {original_recipe_idx: 0}
      for level, nodes in enumerate((first, second, third), start=1):
          for node in nodes:
              level_by_node[node] = level

      # Colours listed in ascending node order.
      node_colors_by_position = [level_by_node[node] for node in sorted(level_by_node)]

      # Hover text: the recipe id of every recommended row.
      node_text_by_position = list(df.loc[all_recommendations]['id'].values)

      title = "Recommended recipes by distance with threshold of {}".format(THRESHOLD)
      fig = plot_network_graph(
          G,
          TITLE=title,
          list_of_colors_by_order_of_nodes=node_colors_by_position,
          list_of_text_by_order_of_nodes=node_text_by_position,
      )

      pio.write_html(fig, '../figures/raw_code_graph_output.html')
      return py.iplot(fig)
  84. # main
  def recommend_recipes(args):
      """Run the whole pipeline: load recipes, score them, build and plot the graph.

      Args:
          args: Parsed command-line arguments exposing ``threshold`` (the
              similarity cut-off) and ``recipe`` (the id of the recipe to
              recommend for).

      Returns:
          The value returned by ``create_visualization``.

      Raises:
          IndexError: If no loaded row has ``id`` equal to ``args.recipe``.
      """
      # Only the first 1000 recipes are used.
      df = pd.read_json('../data/train.json').head(1000)
      THRESHOLD = args.threshold
      RECIPE = args.recipe

      # Translate the recipe id into its row index. Fail with a clear message
      # instead of the opaque "index 0 is out of bounds" the bare lookup gave;
      # the exception type is kept so existing handlers still match.
      matching_rows = df[df['id'] == RECIPE].index.values
      if matching_rows.size == 0:
          raise IndexError(
              "Recipe id {} was not found among the {} loaded recipes".format(RECIPE, len(df))
          )
      RECIPE_INDEX = matching_rows[0]

      similarity_csr, sim_recipes = compute_similarities(df, THRESHOLD)
      all_recommendations, original_recipe_idx, first, second, third, G = build_graph(
          sim_recipes, RECIPE_INDEX, similarity_csr, THRESHOLD
      )
      fig = create_visualization(
          original_recipe_idx, first, second, third, df, all_recommendations, G, THRESHOLD
      )
      return fig
  95. # Add argument parser
  def parse_arguments(argv=None):
      """Parse the command-line arguments of the recommender script.

      Args:
          argv: List of argument strings, without the program name. When
              omitted (``None``), argparse reads ``sys.argv[1:]`` itself.

      Returns:
          An ``argparse.Namespace`` with ``threshold`` (float) and ``recipe``
          (int).

      Raises:
          SystemExit: Raised by argparse when the arguments are missing or
              cannot be converted.
      """
      parser = argparse.ArgumentParser(
          description='Plot a graph of the recipes most similar to a given recipe.'
      )
      parser.add_argument('threshold', type=float, help='Threshold to compute similarity.')
      parser.add_argument('recipe', type=int, help='Recipe ID.')
      return parser.parse_args(argv)
  if __name__ == '__main__':
      # Script entry point: everything after the program name is handed to
      # the argument parser, and the parsed values drive the pipeline.
      cli_args = parse_arguments(sys.argv[1:])
      recommend_recipes(cli_args)
  103. #### Parameters
  104. # 1. threshold
  105. # 2. recipe
  106. # Run it like this: python raw_code.py .5 41995
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...