# USAGE
# python search_bing_api.py --query "alan grant" --output dataset/alan_grant
# python search_bing_api.py --query "ian malcolm" --output dataset/ian_malcolm
# python search_bing_api.py --query "ellie sattler" --output dataset/ellie_sattler
# python search_bing_api.py --query "john hammond jurassic park" --output dataset/john_hammond
# python search_bing_api.py --query "owen grady jurassic world" --output dataset/owen_grady
# python search_bing_api.py --query "claire dearing jurassic world" --output dataset/claire_dearing

# import the necessary packages
import argparse
import os
from urllib.parse import urlparse

import cv2
import requests
from requests import exceptions

# build the command line interface: both the search query and the
# output directory are mandatory
ap = argparse.ArgumentParser()
ap.add_argument(
    "-q",
    "--query",
    required=True,
    help="search query to search Bing Image API for",
)
ap.add_argument(
    "-o",
    "--output",
    required=True,
    help="path to output directory of images",
)

# expose the parsed arguments as a plain dictionary keyed by the long
# option name ("query", "output")
args = vars(ap.parse_args())

# Microsoft Cognitive Services API key used to authenticate requests
API_KEY = "INSERT_YOUR_API_KEY_HERE"

# paging configuration: (1) the maximum number of results to consider
# for a given search and (2) how many results are requested per call
# (maximum of 50 per request)
MAX_RESULTS = 100
GROUP_SIZE = 50

# endpoint of the image search API
URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"

# when attempting to download images from the web, both Python itself
# and the requests library can raise a number of exceptions; collect
# the ones we want to filter on in a single set
EXCEPTIONS = {
    IOError,
    FileNotFoundError,
    exceptions.RequestException,
    exceptions.HTTPError,
    exceptions.ConnectionError,
    exceptions.Timeout,
}

# store the search term in a convenience variable, then set the
# request headers (API key) and the search parameters (query, paging)
term = args["query"]
headers = {"Ocp-Apim-Subscription-Key": API_KEY}
params = {"q": term, "offset": 0, "count": GROUP_SIZE}

# make the search; requests applies no timeout unless one is given,
# so pass one explicitly to avoid blocking forever on a silent server
# (the image downloads further down use the same 30 second limit)
print("[INFO] searching Bing API for '{}'".format(term))
search = requests.get(URL, headers=headers, params=params, timeout=30)

# abort with an HTTPError if the API answered with an error status
search.raise_for_status()

# grab the results from the search and cap the estimated number of
# matches reported by the API at `MAX_RESULTS`
results = search.json()
estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
print("[INFO] {} total results for '{}'".format(estNumResults, term))

# initialize the total number of images downloaded thus far; it also
# provides the zero-padded file name of the next image
total = 0

# make sure the output directory exists, otherwise every write below
# fails with FileNotFoundError and each image is silently skipped
os.makedirs(args["output"], exist_ok=True)

# loop over the estimated number of results in `GROUP_SIZE` groups
for offset in range(0, estNumResults, GROUP_SIZE):
    # the last group can be smaller than `GROUP_SIZE`, so clamp the
    # upper bound reported in the log messages
    end = min(offset + GROUP_SIZE, estNumResults)

    # update the search parameters using the current offset, then
    # make the request to fetch the results (with a timeout so a
    # silent server cannot block the script forever)
    print("[INFO] making request for group {}-{} of {}...".format(
        offset, end, estNumResults))
    params["offset"] = offset
    search = requests.get(URL, headers=headers, params=params,
        timeout=30)
    search.raise_for_status()
    results = search.json()
    print("[INFO] saving images for group {}-{} of {}...".format(
        offset, end, estNumResults))

    # loop over the results
    for v in results["value"]:
        url = v["contentUrl"]

        # try to download the image
        try:
            # make a request to download the image; raise on HTTP
            # error statuses so an error page is never saved as an
            # image
            print("[INFO] fetching: {}".format(url))
            r = requests.get(url, timeout=30)
            r.raise_for_status()

            # build the path to the output image; the extension is
            # taken from the URL *path* only so query strings and
            # fragments do not leak into the file name, and falls
            # back to ".jpg" when the path has no extension
            ext = os.path.splitext(urlparse(url).path)[1] or ".jpg"
            p = os.path.join(args["output"], "{}{}".format(
                str(total).zfill(8), ext))

            # write the image to disk; the context manager closes the
            # file even if the write fails
            with open(p, "wb") as f:
                f.write(r.content)

        # skip the image on any of the expected download/write errors;
        # `except` matches through the class hierarchy, so subclasses
        # of the listed exceptions are covered too, and anything
        # unexpected propagates instead of being silently ignored
        except tuple(EXCEPTIONS):
            print("[INFO] skipping: {}".format(url))
            continue

        # try to load the image from disk
        image = cv2.imread(p)

        # if the image is `None` then we could not properly load the
        # image from disk (so it should be ignored)
        if image is None:
            print("[INFO] deleting: {}".format(p))
            os.remove(p)
            continue

        # update the counter
        total += 1