dmitry.shilov
/
22_graph_test_2


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
eimp:
  # Settings under `embeddings`: vector sizes, dataset sizes and the train split.
  embeddings:
    # Comma-separated list of values, or a range, e.g. 50:500:10.
    vector_size: '50,100,200, 300'
    # NOTE(review): presumably accepts the same list/range format as
    # vector_size - confirm against the consumer.
    dataset_size: '2000, 5000'
    # Proportion of train data in the dataset.
    train_size: 0.9
  # Parameters listed under `all_models_params`.
  # NOTE(review): presumably applied to every model - confirm with the consumer.
  all_models_params:
    # Metric names.
    metric:
      - 'euclidean'
      - 'manhattan'
      - 'chebyshev'
      - 'cosine'
      - 'angular'
  # Parameters listed under `search_params`.
  search_params:
    # k-neighbors values.
    k:
      - 1
      - 10
  # Per-model parameter lists, keyed by model name.
  model:
    faiss:
      # Vectors encoding.
      venc: [8, 16, 32, 64]
      # Index strings; each entry is a single string (commas are part of it).
      indexes:
        - 'Flat'
        - 'HNSW32,Flat'
        - 'IVF65536_HNSW32,Flat'
        - 'HNSW32,SQ8'
        - 'IVF65536_HNSW32,SQ8'
      # The number of cells (out of nlist).
      nprobe: [1, 5, 10, 20, 40, 80, 100]
      # The number of cells.
      nlist: [1, 5, 10, 20, 40, 80, 100]
      # The number of neighbors used in the graph.
      M: [1, 10, 100, 1000]
    annoy:
      n_trees: [10, 50, 100, 200, 500, 1000]
    postgre:
      indexes:
        - 'gist'
        - 'spgist'
    KDTree:
      leaf_size: [10, 50, 100, 200, 500, 1000]
  # Different views of the graphs to build.
  estimation:
    # If variations of the following parameters are not shown in the graph,
    # only the part of the dataframe matching these restrictions is considered.
    aimed_param_values:
      dataset_size: 5000
      metric: 'euclidean'
      vector_size: 300
      k: 10
    # Parameters to be used on the x axis.
    x: ['k', 'dataset_size', 'vector_size']
    # Parameters to be used on the y axis.
    y:
      train: ['training_time', 'saving_time', 'model_size']
      test: ['search_time', 'loading_time']
    # Parameters to be used as different lines on the graph (hue/color/lines).
    lines: ['k', 'metric', 'model']
    # Parameters to be used in facet construction.
    facet: ['k', 'dataset_size', 'vector_size', 'metric']
    # Show the best topn models in the report file.
    topn: 3
    # Show relative data on graphs
    # (e.g. model search time / fullscan search time).
    relative_graphs: false
    # Show results in semi-log scale.
    log10_graphs: true
# Upper (`max`) and lower (`min`) values for `order_size_train`.
# NOTE(review): presumably bounds on the order size used for training -
# confirm against the consumer.
order_size_train:
  max: 10
  min: 2
# Settings for `random_model`.
# NOTE(review): random_seed is presumably the RNG seed for reproducibility -
# confirm against the consumer.
random_model:
  random_seed: 2019
# Settings for `recommendations`.
# NOTE(review): n_items is presumably the number of items returned per
# recommendation - confirm against the consumer.
recommendations:
  n_items: 10
# Settings for `basket_tfidf_perceptron_model`: a seed and a mapping of
# parameter values.
basket_tfidf_perceptron_model:
  random_seed: 10027
  params_grid:
    batch_size: 512
    epoch_count: 20
    lr: 0.001
    momentum: 0.8