programming:similarweb
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revision | |||
| programming:similarweb [2025/01/03 17:41] – Added discussion karelkubicek | programming:similarweb [2026/03/24 09:34] (current) – Updated API against blocking karelkubicek | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| - | While official API use is paid, Similarweb offers a [[https:// | + | While official API use is paid, Similarweb offers a [[https:// |
| + | |||
| + | ==== Methodology for Code Reproduction ==== | ||
| + | |||
| + | To update | ||
| + | |||
| + | - | ||
| + | - Go to '' | ||
| + | - | ||
| + | - | ||
| + | - Go to the **Network | ||
| + | - | ||
| + | - Look for the request to '' | ||
| + | - | ||
+ | - Give that as context + the source code of the extension to your favorite LLM and ask it to generate | ||
| + | |||
| + | ==== Implementation ==== | ||
| + | |||
| + | This is what LLM generated for me: | ||
| + | |||
| + | Modern web protections (like Cloudflare) used by Similarweb block standard Python '' | ||
| <file python main.py> | <file python main.py> | ||
| - | import requests | + | from curl_cffi |
| from urllib.parse import urlparse | from urllib.parse import urlparse | ||
| - | |||
| import pandas as pd | import pandas as pd | ||
| + | import time | ||
| - | API_URL | + | # Configuration copied from extension inspection |
| - | HEADERS = { # TODO: update the User-Agent with your current agent, else you will get blocked | + | EXTENSION_VER = " |
| - | | + | BASE_URL |
| + | |||
| + | HEADERS = { | ||
| + | " | ||
| + | | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| + | " | ||
| } | } | ||
| - | def similarweb_get(domain): | + | class SimilarWebScraper: |
| - | domain = urlparse(domain).netloc | + | |
| - | resp = requests.get(API_URL + domain, headers = HEADERS) | + | self.session = requests.Session() |
| - | if resp.status_code == 200: | + | self.has_identity = False |
| - | return resp.json() | + | |
| - | else: | + | |
| - | resp.raise_for_status() | + | """ |
| - | return | + | if not self.has_identity: |
| + | self.session.get(f" | ||
| + | self.has_identity = True | ||
| + | |||
| + | def get_data(self, | ||
| + | self._ensure_identity() | ||
| + | | ||
| + | |||
| + | | ||
| + | f" | ||
| + | | ||
| + | impersonate=" | ||
| + | | ||
| + | |||
| + | | ||
| + | return resp.json() | ||
| + | | ||
| + | print(f" | ||
| + | return | ||
| if __name__ == " | if __name__ == " | ||
| + | scraper = SimilarWebScraper() | ||
| domains = [ | domains = [ | ||
| ' | ' | ||
| - | ' | + | |
| + | | ||
| ] | ] | ||
| - | | + | |
| - | | + | |
| - | | + | |
| + | data = scraper.get_data(d) | ||
| + | if data: | ||
| + | results.append(data) | ||
| + | time.sleep(2) # Be kind to avoid rate limiting | ||
| - | | + | |
| - | | + | |
| - | try: | + | print(f"Succeeded on {len(results)} out of {len(domains)} domains.") |
| - | resp = similarweb_get(domain) | + | df.to_csv(' |
| - | responses.append(resp) | + | |
| - | success_n += 1 | + | |
| - | consecutive_errors_n = 0 | + | |
| - | | + | |
| - | responses.append("" | + | |
| - | consecutive_errors_n += 1 | + | |
| - | if consecutive_errors_n >= 5: | + | |
| - | break | + | |
| - | + | ||
| - | | + | |
| - | df = pd.json_normalize(responses) | + | |
| - | print(df) | + | |
| - | | + | |
| </ | </ | ||
| - | Output: | + | ==== Response Structure ==== |
| - | + | ||
| - | < | + | |
| - | Succeeded on 2 out of 2 domains. | + | |
| - | | + | |
| - | 0 1 | + | |
| - | 1 1 measuretheweb.org | + | |
| - | [2 rows x 39 columns] | + | The internal API returns a comprehensive JSON object. When flattened using '' |
| - | </ | + | |
| - | Columns: | + | ^ Column Group ^ Field ^ |
| - | * '' | + | | **Basic Info** | '' |
| - | * '' | + | | **Ranking** | '' |
| - | * '' | + | | **Engagement** | '' |
| - | * '' | + | | **Traffic Sources** | '' |
| - | * '' | + | | **Geography** | '' |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | | + | |
| - | | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | * '' | + | |
| - | | + | |
| - | | + | |
| - | | + | |
| - | * '' | + | |
| - | * '' | + | |
| + | **Note on Verification: | ||
| ~~DISCUSSION~~ | ~~DISCUSSION~~ | ||
programming/similarweb.txt · Last modified: by karelkubicek
