"""Download one month of Chrome UX Report (CrUX) country-level popularity
data from BigQuery, bucket origins by (country, rank), and write a random
sample of up to SITES_N origins per bucket to a text file.

Requires GOOGLE_APPLICATION_CREDENTIALS to point at a service-account key.
"""
from random import sample  # fix: `sample` was used below but never imported

import pandas as pd  # to load it into Pandas DataFrame
from google.cloud import bigquery
from google.oauth2 import service_account  # kept from original; needed when building explicit credentials

YYYYMM = 202411  # let's download only November 2024 data
# CrUX popularity rank buckets (top-1k ... top-5M) -- all ranks
RANKS = (1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000)
COUNTRIES = (
    # EU states
    'at', 'be', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee', 'fi', 'fr', 'de', 'gr',
    'hu', 'ie', 'it', 'lv', 'lt', 'lu', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk',
    'si', 'es', 'se',
    # EFTA states
    'is', 'li', 'no', 'ch',
    # Other states
    'gb',
)
SITES_N = 500  # maximum number of origins sampled per (country, rank) bucket

# this will fail if you have not set GOOGLE_APPLICATION_CREDENTIALS
client = bigquery.Client()

query = f"SELECT DISTINCT country_code, origin, experimental.popularity.rank as rank FROM `chrome-ux-report.experimental.country` WHERE yyyymm = {YYYYMM}"
df = client.query(query).to_dataframe()
df.to_csv(f'crux_all_{YYYYMM}.csv')  # store crux original data

# Bucket origins by country and rank; this takes a while to process.
websites = {
    country: {
        rank: set(df[(df['country_code'] == country) & (df['rank'] == rank)]['origin'].values)
        for rank in RANKS
    }
    for country in COUNTRIES
}

# Draw at most SITES_N origins (without replacement) from every bucket and
# merge them into one global set, deduplicating origins shared across buckets.
sampled_websites = set()
for rank in RANKS:
    for country in COUNTRIES:
        source = websites[country][rank]
        sampled = set(sample(tuple(source), min(SITES_N, len(source))))
        sampled_websites |= sampled

with open(f'sampled_urls_{YYYYMM}.txt', 'w') as wf:
    for url in sampled_websites:
        wf.write(url + '\n')