import requests
from requests.exceptions import HTTPError
from urllib.parse import urlparse
import pandas as pd

API_URL = "https://data.similarweb.com/api/v1/data?domain="
HEADERS = {
    # TODO: update the User-Agent with your current agent, else you will get blocked
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0'
}


def similarweb_get(domain):
    domain = urlparse(domain).netloc  # Similarweb's input is only the domain's netloc
    resp = requests.get(API_URL + domain, headers=HEADERS)
    if resp.status_code == 200:
        return resp.json()
    resp.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    return False


if __name__ == "__main__":
    domains = [
        'https://github.com/',
        'https://measuretheweb.org'  # not in the dataset (yet :)), response is valid but empty
    ]
    responses = []
    success_n = 0
    consecutive_errors_n = 0
    for domain in domains:
        # if you are often getting errors, you might try adding a sleep here
        try:
            resp = similarweb_get(domain)
            responses.append(resp)
            success_n += 1
            consecutive_errors_n = 0
        except HTTPError:
            responses.append({})  # empty placeholder row; a bare string would break json_normalize
            consecutive_errors_n += 1
            if consecutive_errors_n >= 5:  # give up after five failures in a row
                break
    print(f'Succeeded on {success_n} out of {len(domains)} domains.')
    df = pd.json_normalize(responses)
    print(df)
    df.to_csv('similarweb.csv')