import requests
from requests.exceptions import HTTPError
from urllib.parse import urlparse
import pandas as pd

API_URL = "https://data.similarweb.com/api/v1/data?domain="
HEADERS = {
    # TODO: update the User-Agent with your current agent, else you will get blocked
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0'
}


def similarweb_get(domain):
    domain = urlparse(domain).netloc  # Similarweb's input is only the domain's netloc
    resp = requests.get(API_URL + domain, headers=HEADERS)
    if resp.status_code == 200:
        return resp.json()
    resp.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    return False


if __name__ == "__main__":
    domains = [
        'https://github.com/',
        'https://measuretheweb.org'  # not in the dataset (yet :)), response is valid but empty
    ]
    responses = []
    success_n = 0
    consecutive_errors_n = 0
    for domain in domains:
        # if you are often getting errors, you might try adding a sleep here
        try:
            resp = similarweb_get(domain)
            responses.append(resp)
            success_n += 1
            consecutive_errors_n = 0
        except HTTPError:
            responses.append({})  # empty placeholder row; a bare string would break json_normalize
            consecutive_errors_n += 1
            if consecutive_errors_n >= 5:  # give up after five failures in a row
                break
    print(f'Succeeded on {success_n} out of {len(domains)} domains.')
    df = pd.json_normalize(responses)
    print(df)
    df.to_csv('similarweb.csv')