1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
- import argparse
- import json
- import os
- import re
- import requests
- import tqdm
def search(keywords, start=None, license='Public', timeout=10):
    """Query DuckDuckGo's image endpoint and return the decoded JSON reply.

    Two requests are made: a POST to the DuckDuckGo front page, whose body
    is scanned for the ``vqd`` token, followed by a GET to ``i.js`` that
    carries that token together with the search parameters.

    Args:
        keywords: Search phrase, sent as the ``q`` parameter of both
            requests.
        start: Optional result offset; sent as the ``s`` parameter when it
            is not ``None``.
        license: Value interpolated into the ``f`` filter parameter as
            ``license:<value>``.
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            stalled connection would block the script forever.

    Returns:
        The decoded JSON payload on success, ``-1`` when no ``vqd`` token
        is found in the front-page response, or ``None`` when the ``i.js``
        response body is not valid JSON.

    Raises:
        requests.RequestException: Network failures (including timeouts)
            are not handled here and propagate to the caller.
    """
    url = 'https://duckduckgo.com/'

    print("Hitting DuckDuckGo for Token")
    # The front page embeds a 'vqd' token that must accompany the
    # subsequent image request.
    res = requests.post(url, data={'q': keywords}, timeout=timeout)
    token_match = re.search(r'vqd=([\d-]+)\&', res.text, re.M | re.I)
    if not token_match:
        print("Token Parsing Failed !")
        return -1
    print("Obtained Token")

    headers = {
        'dnt': '1',
        # Only advertise encodings requests can always decode. Brotli
        # ('br') is decoded only when an optional Brotli package is
        # installed, and an undecodable body would fail JSON parsing below.
        'accept-encoding': 'gzip, deflate',
        'x-requested-with': 'XMLHttpRequest',
        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6,ms;q=0.4',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://duckduckgo.com/',
        'authority': 'duckduckgo.com',
    }
    params = [
        ('l', 'us-en'),
        ('o', 'json'),
        ('q', keywords),
        ('vqd', token_match.group(1)),
        ('f', f',,,,,license:{license}'),
        ('p', '1'),
    ]
    if start is not None:
        params.append(('s', str(start)))

    res = requests.get(
        url + "i.js", headers=headers, params=params, timeout=timeout
    )
    try:
        return json.loads(res.text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: the body was not
        # JSON, so report it and let the caller decide what to do.
        print('Please try later.')
        return None
def save_images(objs, keyword, start=0, timeout=10):
    """Download each image referenced in *objs* into ``downloads/<keyword>/``.

    Every file is named with a zero-padded six-digit number (the item's
    position in *objs* plus *start*) followed by the extension taken from
    the image URL. Items that cannot be fetched, or whose response status
    is not 200, are skipped.

    Args:
        objs: Iterable of mappings; each must provide the image URL under
            the ``'image'`` key.
        keyword: Name of the sub-directory created under ``downloads``.
        start: Number added to each item's position when building the
            file name.
        timeout: Seconds to wait on each download request, so one stalled
            host cannot block the whole run.
    """
    from urllib.parse import urlsplit

    path = f'downloads/{keyword}'
    os.makedirs(path, exist_ok=True)

    # list(...) hands tqdm a sized sequence so the bar can show a total.
    for i, obj in tqdm.tqdm(list(enumerate(objs))):
        img_link = obj['image']
        try:
            res = requests.get(img_link, timeout=timeout)
        except (requests.RequestException, ValueError):
            # Skip this item. Falling through would either hit an unbound
            # `res` or reuse the previous item's response and save the
            # wrong image under this number.
            continue

        if res.status_code != 200:
            continue

        # Read the extension from the URL path only, so dots inside the
        # query string (e.g. "?ref=site.com") are not mistaken for it.
        ext = os.path.splitext(urlsplit(img_link).path)[-1]
        # Some URLs append size markers or parameters to the path itself
        # (e.g. "photo.jpg!large"); drop those suffixes.
        ext = ext.split('&')[0].split('!')[0]

        num = str(i + start).zfill(6)
        filename = os.path.join(path, f'{num}{ext}')
        with open(filename, 'wb') as f:
            f.write(res.content)
def main():
    """Parse the command line and download up to ten pages of image results.

    The positional ``keywords`` arguments are joined with spaces to form
    the search phrase. ``search`` is called for offsets 0, 100, ..., 900
    with the ``ShareCommercially`` license filter, and each page's
    ``results`` list is passed to ``save_images``. The download folder
    name is the phrase with spaces replaced by underscores.

    The loop stops at the first page whose search fails or returns no
    results.
    """
    parser = argparse.ArgumentParser(
        # Must be passed by keyword: the first positional parameter of
        # ArgumentParser is `prog`, which would make this text show up as
        # the program name in the usage line.
        description='Downloads up to 1,000 images from DuckDuckGo based on '
                    'keyword/phrase, filter by license'
    )
    parser.add_argument('keywords', nargs='+')
    args = parser.parse_args()

    keywords = ' '.join(args.keywords)
    folder = keywords.replace(' ', '_')

    for page in range(10):
        start = page * 100
        data = search(keywords, start=start, license='ShareCommercially')

        # search() reports failure by returning -1 or None instead of
        # raising; indexing either value would raise TypeError.
        if not isinstance(data, dict):
            break

        results = data.get('results')
        if not results:
            break

        save_images(results, folder, start=start)


if __name__ == '__main__':
    main()
|