Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

download_ddg.py 2.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  1. import argparse
  2. import json
  3. import os
  4. import re
  5. import requests
  6. import tqdm
  7. def search(keywords, start=None, license='Public'):
  8. url = 'https://duckduckgo.com/'
  9. params = {
  10. 'q': keywords
  11. }
  12. print("Hitting DuckDuckGo for Token")
  13. # First make a request to above URL, and parse out the 'vqd'
  14. # This is a special token, which should be used in the subsequent request
  15. res = requests.post(url, data=params)
  16. searchObj = re.search(r'vqd=([\d-]+)\&', res.text, re.M | re.I)
  17. if not searchObj:
  18. print("Token Parsing Failed !")
  19. return -1
  20. print("Obtained Token")
  21. headers = {
  22. 'dnt': '1',
  23. 'accept-encoding': 'gzip, deflate, sdch, br',
  24. 'x-requested-with': 'XMLHttpRequest',
  25. 'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6,ms;q=0.4',
  26. 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
  27. 'accept': 'application/json, text/javascript, */*; q=0.01',
  28. 'referer': 'https://duckduckgo.com/',
  29. 'authority': 'duckduckgo.com',
  30. }
  31. params = [
  32. ('l', 'us-en'),
  33. ('o', 'json'),
  34. ('q', keywords),
  35. ('vqd', searchObj.group(1)),
  36. ('f', f',,,,,license:{license}'),
  37. ('p', '1'),
  38. ]
  39. if start is not None:
  40. params.append(('s', str(start)))
  41. requestUrl = url + "i.js"
  42. try:
  43. res = requests.get(requestUrl, headers=headers, params=params)
  44. data = json.loads(res.text)
  45. return data
  46. except ValueError as e:
  47. print('Please try later.')
  48. def save_images(objs, keyword, start=0):
  49. path = f'downloads/{keyword}'
  50. os.makedirs(path, exist_ok=True)
  51. for i, obj in tqdm.tqdm(list(enumerate(objs))):
  52. num = str(i + start)
  53. num = num.zfill(6)
  54. img_link = obj['image']
  55. try:
  56. res = requests.get(img_link)
  57. except:
  58. pass
  59. if res.status_code != 200:
  60. continue
  61. img_data = res.content
  62. ext = os.path.splitext(img_link)[-1]
  63. ext = ext.split('?')[0].split('&')[0].split('!')[0]
  64. filename = os.path.join(path, f'{num}{ext}')
  65. with open(filename, 'wb+') as f:
  66. f.write(img_data)
  67. def main():
  68. parser = argparse.ArgumentParser('Downloads up to 1,000 images from DuckDuckGo based on keyword/phrase, filter by license')
  69. parser.add_argument('keywords', nargs='+')
  70. args = parser.parse_args()
  71. keywords = ' '.join(args.keywords)
  72. start = 0
  73. for page in range(0, 10):
  74. start = page * 100
  75. data = search(keywords, start=start, license='ShareCommercially')
  76. save_images(data['results'], keywords.replace(' ', '_'), start=start)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Tip!

Press p to see the previous file or n to see the next file

Comments

Loading...