Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

dvc.lock 12 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
  1. etl_instacart:
  2. cmd: python src/etl_instacart.py data/raw/order_products__prior.csv data/raw/products.csv
  3. deps:
  4. - path: data/raw/order_products__prior.csv
  5. md5: 8bca7ad1968ee8e07760b94db74461fb
  6. - path: data/raw/products.csv
  7. md5: dfcfded9d4af77d8b747913a4cbe6ffc
  8. - path: src/etl_instacart.py
  9. md5: 22e3b742c1fd6891b8c0ca48a950fbc7
  10. - path: src/tools/etl_tools.py
  11. md5: 8f4d02a9f5f12bc1afb93b24ca9f9482
  12. outs:
  13. - path: data/input/instacart_transformed.csv
  14. md5: bd8ffc219e463a7cc05ac1d1c8d10a1c
  15. etl_online_retail_ii:
  16. cmd: python src/etl_online_retail_ii.py data/raw/online_retail_II.xlsx
  17. deps:
  18. - path: data/raw/online_retail_II.xlsx
  19. md5: ed54ccfc5d358481c399cc11d0a244be
  20. - path: src/etl_online_retail_ii.py
  21. md5: b45477ef56c2f1907a5707e43a7ac431
  22. - path: src/tools/etl_tools.py
  23. md5: 8f4d02a9f5f12bc1afb93b24ca9f9482
  24. outs:
  25. - path: data/input/online_retail_transformed.csv
  26. md5: d816f3e9174cab9e66f849b353195ea1
  27. product_name_normalization:
  28. cmd: python src/names_cleaner.py data/input/
  29. deps:
  30. - path: data/input/
  31. md5: 03c3034ee49009d97c07a806864bf6d1.dir
  32. - path: src/names_cleaner.py
  33. md5: 8d26e21b8114b566f5194dd9b3d2535d
  34. - path: src/tools/preprocess_tools.py
  35. md5: cb3cff250564f34299890fa0fd25a95b
  36. outs:
  37. - path: data/input_names_normalized/
  38. md5: 38158811565ea03ea29516a96347583f.dir
  39. prepare_embeddings_sets:
  40. cmd: python src/prepare_embeddings_sets.py -o data/computed_embeddings_sets -p params.yaml
  41. deps:
  42. - path: src/prepare_embeddings_sets.py
  43. md5: 86b3bae7557d279f38b5a0ade62af219
  44. params:
  45. params.yaml:
  46. eimp.embeddings.dataset_size: 2000, 5000
  47. eimp.embeddings.train_size: 0.9
  48. eimp.embeddings.vector_size: 50,100,200, 300
  49. outs:
  50. - path: data/computed_embeddings_sets/test/
  51. md5: 6e4fe22bde4de43f4087f5833201dade.dir
  52. - path: data/computed_embeddings_sets/train/
  53. md5: 3788f0454bb06f2e2f3cb5a63dd30c53.dir
  54. prepare_eipt_params:
  55. cmd: python src/prepare_eipt_params.py -p params.yaml -o data/eipt/testing_params
  56. deps:
  57. - path: src/prepare_eipt_params.py
  58. md5: 2e6bef09101906509358c14fd8997c04
  59. params:
  60. params.yaml:
  61. eimp.model:
  62. faiss:
  63. venc:
  64. - 8
  65. - 16
  66. - 32
  67. - 64
  68. indexes:
  69. - Flat
  70. - HNSW32,Flat
  71. - IVF65536_HNSW32,Flat
  72. - HNSW32,SQ8
  73. - IVF65536_HNSW32,SQ8
  74. nprobe:
  75. - 1
  76. - 5
  77. - 10
  78. - 20
  79. - 40
  80. - 80
  81. - 100
  82. nlist:
  83. - 1
  84. - 5
  85. - 10
  86. - 20
  87. - 40
  88. - 80
  89. - 100
  90. M:
  91. - 1
  92. - 10
  93. - 100
  94. - 1000
  95. annoy:
  96. n_trees:
  97. - 10
  98. - 50
  99. - 100
  100. - 200
  101. - 500
  102. - 1000
  103. postgre:
  104. indexes:
  105. - gist
  106. - spgist
  107. KDTree:
  108. leaf_size:
  109. - 10
  110. - 50
  111. - 100
  112. - 200
  113. - 500
  114. - 1000
  115. outs:
  116. - path: data/eipt/testing_params
  117. md5: cb41b8d7955e681212ce07e2d6cbbd9e.dir
  118. fullscan_eipt_index_search:
  119. cmd: python src/fullscan_eipt_index_searcher.py
  120. deps:
  121. - path: data/computed_embeddings_sets/
  122. md5: 0f9055632dfcdd3cf10dc9db07a0682b.dir
  123. - path: src/fullscan_eipt_index_searcher.py
  124. md5: d1f1ccb9fcb6f4ffede93373ff28e36d
  125. params:
  126. params.yaml:
  127. eimp.search_params:
  128. k:
  129. - 1
  130. - 10
  131. outs:
  132. - path: data/eipt/metrics/by_model/fullscan/
  133. md5: fa278f48078e42a81fe61d76418d7c5c.dir
  134. - path: data/eipt/recommendations/fullscan/
  135. md5: 6ae0fff71a0405c1e8c2344cb58009d6.dir
  136. annoy_eipt_index_embeddings:
  137. cmd: python src/eipt_annoy_embeddings_indexer.py
  138. deps:
  139. - path: data/computed_embeddings_sets/train
  140. md5: 3788f0454bb06f2e2f3cb5a63dd30c53.dir
  141. - path: data/eipt/testing_params/annoy_grid.csv
  142. md5: 8a6b6372d0e84607a150972e9764fa88
  143. - path: src/eipt_annoy_embeddings_indexer.py
  144. md5: de64a03ae0f18540cc51666d83b1defd
  145. params:
  146. params.yaml:
  147. eimp.model.annoy:
  148. n_trees:
  149. - 10
  150. - 50
  151. - 100
  152. - 200
  153. - 500
  154. - 1000
  155. eimp.search_params:
  156. k:
  157. - 1
  158. - 10
  159. outs:
  160. - path: data/eipt/indexer_models/annoy/
  161. md5: 7d53ac1e5c2303ed594db682e6273a27.dir
  162. - path: data/eipt/metrics/by_model/annoy/train.csv
  163. md5: ed8f00e6452540e03d185798d2247f2d
  164. annoy_eipt_index_search:
  165. cmd: python src/annoy_eipt_index_searcher.py
  166. deps:
  167. - path: data/computed_embeddings_sets/test/
  168. md5: 6e4fe22bde4de43f4087f5833201dade.dir
  169. - path: data/eipt/indexer_models/annoy/
  170. md5: 7d53ac1e5c2303ed594db682e6273a27.dir
  171. - path: src/annoy_eipt_index_searcher.py
  172. md5: f2ce518d413dc9d7936fba646d12b63d
  173. params:
  174. params.yaml:
  175. eimp.model.annoy:
  176. n_trees:
  177. - 10
  178. - 50
  179. - 100
  180. - 200
  181. - 500
  182. - 1000
  183. eimp.search_params:
  184. k:
  185. - 1
  186. - 10
  187. outs:
  188. - path: data/eipt/metrics/by_model/annoy/test.csv
  189. md5: 8b417c14cc877b97357b011be9ecafc7
  190. - path: data/eipt/recommendations/annoy/
  191. md5: 3e82a6335b93b27a0a178433cee241c0.dir
  192. eipt_analysis:
  193. cmd: python src/eipt_analysis.py -p params.yaml -o data/eipt/metrics/ -m data/eipt/metrics/by_model/
  194. -r data/eipt/recommendations -g data/eipt/testing_params/ -t data/eipt/template.md
  195. -d embedding_indexer_performance_testing.md
  196. deps:
  197. - path: data/eipt/metrics/by_model/
  198. md5: f0c9c692650facf7083c8845cc2efdd0.dir
  199. - path: data/eipt/template.md
  200. md5: 08744d5b03ce51cb0213315d8a8a5df5
  201. - path: data/eipt/testing_params/
  202. md5: 7a450cd4c20113f7d25f116c5b70eb54.dir
  203. - path: src/eipt_analysis.py
  204. md5: e3884737e9a1a9e03d6e75807d0d00fa
  205. params:
  206. params.yaml:
  207. eimp:
  208. embeddings:
  209. vector_size: 50,100,200, 300
  210. dataset_size: 2000, 5000
  211. train_size: 0.9
  212. all_models_params:
  213. metric:
  214. - euclidean
  215. - manhattan
  216. - chebyshev
  217. - cosine
  218. - angular
  219. search_params:
  220. k:
  221. - 1
  222. - 10
  223. model:
  224. faiss:
  225. venc:
  226. - 8
  227. - 16
  228. - 32
  229. - 64
  230. indexes:
  231. - Flat
  232. - HNSW32,Flat
  233. - IVF65536_HNSW32,Flat
  234. - HNSW32,SQ8
  235. - IVF65536_HNSW32,SQ8
  236. nprobe:
  237. - 1
  238. - 5
  239. - 10
  240. - 20
  241. - 40
  242. - 80
  243. - 100
  244. nlist:
  245. - 1
  246. - 5
  247. - 10
  248. - 20
  249. - 40
  250. - 80
  251. - 100
  252. M:
  253. - 1
  254. - 10
  255. - 100
  256. - 1000
  257. annoy:
  258. n_trees:
  259. - 10
  260. - 50
  261. - 100
  262. - 200
  263. - 500
  264. - 1000
  265. postgre:
  266. indexes:
  267. - gist
  268. - spgist
  269. KDTree:
  270. leaf_size:
  271. - 10
  272. - 50
  273. - 100
  274. - 200
  275. - 500
  276. - 1000
  277. estimation:
  278. aimed_param_values:
  279. dataset_size: 5000
  280. metric: euclidean
  281. vector_size: 300
  282. k: 10
  283. x:
  284. - k
  285. - dataset_size
  286. - vector_size
  287. y:
  288. train:
  289. - training_time
  290. - saving_time
  291. - model_size
  292. test:
  293. - search_time
  294. - loading_time
  295. lines:
  296. - k
  297. - metric
  298. - model
  299. facet:
  300. - k
  301. - dataset_size
  302. - vector_size
  303. - metric
  304. topn: 3
  305. relative_graphs: false
  306. log10_graphs: true
  307. outs:
  308. - path: data/eipt/metrics/graphs/
  309. md5: 4340b476d009c0be56badba8406dec53.dir
  310. - path: data/eipt/metrics/test_metrics_summary.csv
  311. md5: d61de8d8d93de378b13af2b10820622e
  312. - path: data/eipt/metrics/train_metrics_summary.csv
  313. md5: 7df6f8addfc6cb2aed99077c4aaf044f
  314. - path: embedding_indexer_performance_testing.md
  315. md5: f9cd222fc5f00dffd37ca79f7135bd32
  316. drop_products_by_name:
  317. cmd: python3 src/products_removers.py data/input_names_normalized/
  318. deps:
  319. - path: data/input_names_normalized/
  320. md5: 38158811565ea03ea29516a96347583f.dir
  321. - path: src/products_removers.py
  322. md5: 1415f5df76d7a9caf988362c9258042c
  323. - path: src/tools/preprocess_tools.py
  324. md5: cb3cff250564f34299890fa0fd25a95b
  325. outs:
  326. - path: data/input_preprocessed/
  327. md5: 0cb51a4fdb39730877c0ddb00e3668fa.dir
  328. split_into_subsets_all_data:
  329. cmd: python3 src/train_test_split.py 0.8 data/input_preprocessed/
  330. deps:
  331. - path: data/input_preprocessed/
  332. md5: 0cb51a4fdb39730877c0ddb00e3668fa.dir
  333. - path: src/tools/preprocess_tools.py
  334. md5: cb3cff250564f34299890fa0fd25a95b
  335. - path: src/train_test_split.py
  336. md5: 825cf8cbe9aebb1a6d7b11d54818b64b
  337. outs:
  338. - path: data/test_data/
  339. md5: 19c6886bb90dc9a64a06aa37fe6eab07.dir
  340. - path: data/train_data/
  341. md5: 6a3a0bf946e6208c00ab0d71f26bcb31.dir
  342. preprocess_test_add_dummy_orders:
  343. cmd: python src/prepare_dummy_orders.py data/test_data data/test_final
  344. deps:
  345. - path: data/test_data
  346. md5: 19c6886bb90dc9a64a06aa37fe6eab07.dir
  347. - path: src/prepare_dummy_orders.py
  348. md5: 42bcb5d8efc555f39ab2904ee26c1c9e
  349. - path: src/tools/preprocess_tools.py
  350. md5: cb3cff250564f34299890fa0fd25a95b
  351. outs:
  352. - path: data/test_final
  353. md5: ecd8c089e4fcbba2f96355ff78987f74.dir
  354. preprocess_train_drop_by_size:
  355. cmd: python src/drop_by_order_size.py data/train_data/ data/train_final/
  356. deps:
  357. - path: data/train_data/
  358. md5: 6a3a0bf946e6208c00ab0d71f26bcb31.dir
  359. - path: src/tools/preprocess_tools.py
  360. md5: cb3cff250564f34299890fa0fd25a95b
  361. params:
  362. params.yaml:
  363. order_size_train.max: 10
  364. order_size_train.min: 2
  365. outs:
  366. - path: data/train_final/
  367. md5: 0e1b6c7dfb48399bbf49d17bcd55b469.dir
  368. input_data_for_random_model:
  369. cmd: python src/prepare_train_for_random_model.py data/input_preprocessed/ data/train_for_random/
  370. deps:
  371. - path: data/input_preprocessed/
  372. md5: 0cb51a4fdb39730877c0ddb00e3668fa.dir
  373. - path: src/prepare_train_for_random_model.py
  374. md5: daec1265a186a396254b13aa43a628f7
  375. - path: src/tools/preprocess_tools.py
  376. md5: cb3cff250564f34299890fa0fd25a95b
  377. outs:
  378. - path: data/train_for_random/
  379. md5: d034fc57afd69ef66a7302749a5ed032.dir
  380. input_data_for_embedding_model:
  381. cmd: python src/prepare_train_for_embedding_model.py data/train_final/ data/train_for_embeddings/
  382. deps:
  383. - path: data/train_final/
  384. md5: 0e1b6c7dfb48399bbf49d17bcd55b469.dir
  385. - path: src/prepare_train_for_embedding_model.py
  386. md5: e3c408f0f144c4e1803289c8f2fef84c
  387. - path: src/tools/preprocess_tools.py
  388. md5: cb3cff250564f34299890fa0fd25a95b
  389. outs:
  390. - path: data/train_for_embeddings/
  391. md5: b1016c20e6f47671fb74da7de646e1aa.dir
  392. train_basket_tfidf_perceptron:
  393. cmd: python src/models/trainer_basket_tfidf_perceptron_embedding.py data/train_for_embeddings/
  394. data/embedding_models/basket_tfidf_perceptron/
  395. deps:
  396. - path: data/train_for_embeddings/
  397. md5: b1016c20e6f47671fb74da7de646e1aa.dir
  398. - path: src/models/dataset_iterators.py
  399. md5: 6d64fead780b23af1dcefc0ee0d6f588
  400. - path: src/models/neural_network_embedding.py
  401. md5: ef0d898dd8dc943413ef7ad1ed29fd9b
  402. - path: src/models/trainer_basket_tfidf_perceptron_embedding.py
  403. md5: cc8859f7aad1f6854a1f22395db2b831
  404. params:
  405. params.yaml:
  406. basket_tfidf_perceptron_model.params_grid:
  407. batch_size: 512
  408. epoch_count: 20
  409. lr: 0.001
  410. momentum: 0.8
  411. basket_tfidf_perceptron_model.random_seed: 10027
  412. outs:
  413. - path: data/embedding_models/basket_tfidf_perceptron/
  414. md5: b8da71ff2a147db74c0d10ad59496519.dir
  415. train_random_model:
  416. cmd: python src/trainer_random_model.py data/train_for_random/ data/random_models/
  417. deps:
  418. - path: data/train_for_random/
  419. md5: d034fc57afd69ef66a7302749a5ed032.dir
  420. - path: src/models/trainer_random_model.py
  421. md5: b9b85136088582dc23d43542a4334098
  422. - path: src/trainer_random_model.py
  423. md5: 1ef638f544fb7b82f71d8bd6932eb259
  424. params:
  425. params.yaml:
  426. random_model.random_seed: 2019
  427. outs:
  428. - path: data/random_models/
  429. md5: dbb04670a0e829621d9d485538049f71.dir
  430. build_products_registry:
  431. cmd: python src/generate_products_registry.py data/input_preprocessed/ data/products_registry/
  432. deps:
  433. - path: data/input_preprocessed/
  434. md5: 0cb51a4fdb39730877c0ddb00e3668fa.dir
  435. - path: src/generate_products_registry.py
  436. md5: 6ebe92490a602ba8c749e97cc5fed9fb
  437. outs:
  438. - path: data/products_registry/
  439. md5: c06a85e1a802bd1b02a185e9f2bbd07b.dir
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...