"""Download one month of Chrome UX Report (CrUX) country-level popularity
data from BigQuery, bucket origins by (country, rank), and write a random
sample of up to SITES_N origins per bucket to a text file.

Requires GOOGLE_APPLICATION_CREDENTIALS to point at a service-account key.
"""
from random import sample  # fix: `sample` was used below but never imported

import pandas as pd  # to load it into Pandas DataFrame
from google.cloud import bigquery
from google.oauth2 import service_account  # kept from original; needed when building explicit credentials

YYYYMM = 202411  # let's download only November 2024 data
# CrUX popularity rank buckets (top-1k ... top-5M) -- all ranks
RANKS = (1000, 5000, 10000, 50000, 100000, 500000, 1000000, 5000000)
COUNTRIES = (
    # EU states
    'at', 'be', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee', 'fi', 'fr', 'de', 'gr',
    'hu', 'ie', 'it', 'lv', 'lt', 'lu', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk',
    'si', 'es', 'se',
    # EFTA states
    'is', 'li', 'no', 'ch',
    # Other states
    'gb',
)
SITES_N = 500  # maximum number of origins sampled per (country, rank) bucket

# this will fail if you have not set GOOGLE_APPLICATION_CREDENTIALS
client = bigquery.Client()

query = f"SELECT DISTINCT country_code, origin, experimental.popularity.rank as rank FROM `chrome-ux-report.experimental.country` WHERE yyyymm = {YYYYMM}"
df = client.query(query).to_dataframe()
df.to_csv(f'crux_all_{YYYYMM}.csv')  # store crux original data

# Bucket origins by country and rank; this takes a while to process.
websites = {
    country: {
        rank: set(df[(df['country_code'] == country) & (df['rank'] == rank)]['origin'].values)
        for rank in RANKS
    }
    for country in COUNTRIES
}

# Draw at most SITES_N origins (without replacement) from every bucket and
# merge them into one global set, deduplicating origins shared across buckets.
sampled_websites = set()
for rank in RANKS:
    for country in COUNTRIES:
        source = websites[country][rank]
        sampled = set(sample(tuple(source), min(SITES_N, len(source))))
        sampled_websites |= sampled

with open(f'sampled_urls_{YYYYMM}.txt', 'w') as wf:
    for url in sampled_websites:
        wf.write(url + '\n')