Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

select_val.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  1. import argparse
  2. import json
  3. import os
  4. import random
  5. def read_dups(similarity):
  6. with open(similarity) as f:
  7. # read in the list of lists
  8. data = json.load(f)
  9. # flatten list of lists into a single list
  10. data = sum(data, [])
  11. return set(data)
  12. def gen_val(train, percent, avoid):
  13. num = int(len(train) * percent)
  14. valid_files = list(set(train).difference(set(avoid)))
  15. if num > len(valid_files):
  16. return valid_files
  17. return random.sample(valid_files, num)
  18. def main():
  19. parser = argparse.ArgumentParser('Selects a validation dataset, taking into account which images are duplicates. This prevents leakage from the training set to the validation set.')
  20. parser.add_argument('--similarity', required=True, help='similarity.json file')
  21. parser.add_argument('--train', required=True, help='training directory')
  22. parser.add_argument('--val', required=True, help='validation directory')
  23. parser.add_argument('--percent', type=float, default=0.2, help='percent of the training set to use for validation')
  24. args = parser.parse_args()
  25. dups = read_dups(args.similarity)
  26. train = list(os.listdir(args.train))
  27. val = gen_val(train, args.percent, dups)
  28. val.sort()
  29. with open('validation.txt', mode='w') as f:
  30. f.write('\n'.join(val))
  31. f.write('\n')
  32. os.makedirs(args.val, exist_ok=True)
  33. for img in val:
  34. inpath = os.path.join(args.train, img)
  35. outpath = os.path.join(args.val, img)
  36. os.rename(inpath, outpath)
  37. if __name__ == '__main__':
  38. main()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...