Measure The Web

This is an old revision of the document!
% Tranco top-sites ranking paper (NDSS 2019).
% month uses the predefined macro; the accent is a brace-wrapped BibTeX
% special character so the name sorts and labels correctly.
@InProceedings{LePochat2019_tranco,
  author        = {{Le Pochat}, Victor and {Van Goethem}, Tom and Tajalizadehkhoob, Samaneh and Korczy{\'n}ski, Maciej and Joosen, Wouter},
  title         = {Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation},
  booktitle     = {Proceedings of the 26th Annual Network and Distributed System Security Symposium},
  year          = {2019},
  series        = {NDSS 2019},
  month         = feb,
  doi           = {10.14722/ndss.2019.23386},
}

% Accuracy evaluation of popular website top lists (IMC '22).
% Page range uses the ASCII double hyphen; the reference-manager marking
% field was dropped because it carries no bibliographic data.
@InProceedings{ruth2022_toppling,
  author        = {Ruth, Kimberly and Kumar, Deepak and Wang, Brandon and Valenta, Luke and Durumeric, Zakir},
  title         = {Toppling Top Lists: Evaluating the Accuracy of Popular Website Lists},
  booktitle     = {Proceedings of the 22nd ACM Internet Measurement Conference},
  year          = {2022},
  series        = {IMC '22},
  pages         = {374--387},
  address       = {New York, NY, USA},
  publisher     = {Association for Computing Machinery},
  doi           = {10.1145/3517745.3561444},
  isbn          = {9781450392594},
  location      = {Nice, France},
  numpages      = {14},
  url           = {https://doi.org/10.1145/3517745.3561444},
}

% Empirical evaluation of how top lists are used (PAM 2024).
% Springer is recorded as the publisher; "organization" is the sponsor
% field and is not where standard styles look for the publisher.
@inproceedings{xie2024_crawling,
  author    = {Xie, Qinge and Li, Frank},
  title     = {Crawling to the Top: An Empirical Evaluation of Top List Use},
  booktitle = {International Conference on Passive and Active Network Measurement},
  pages     = {277--306},
  year      = {2024},
  publisher = {Springer},
}

% Significance, structure, and stability of Internet top lists (IMC '18).
% Page range uses the ASCII double hyphen instead of a Unicode en dash.
@inproceedings{scheitle2018_long,
  author    = {Scheitle, Quirin and Hohlfeld, Oliver and Gamba, Julien and Jelten, Jonas and Zimmermann, Torsten and Strowes, Stephen D. and Vallina-Rodriguez, Narseo},
  title     = {A Long Way to the Top: Significance, Structure, and Stability of Internet Top Lists},
  booktitle = {Proceedings of the Internet Measurement Conference 2018},
  series    = {IMC '18},
  year      = {2018},
  pages     = {478--493},
  numpages  = {16},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Boston, MA, USA},
  isbn      = {9781450356190},
  doi       = {10.1145/3278532.3278574},
  url       = {https://doi.org/10.1145/3278532.3278574},
}

% Voting-based domain top list (USENIX Security 22).
% Publisher, address, month, and proceedings ISBN match the other entries
% in this file that cite the same proceedings volume.
@inproceedings{xie2022_building,
  author    = {Xie, Qinge and Tang, Shujun and Zheng, Xiaofeng and Lin, Qingran and Liu, Baojun and Duan, Haixin and Li, Frank},
  title     = {Building an Open, Robust, and Stable Voting-Based Domain Top List},
  booktitle = {31st USENIX Security Symposium (USENIX Security 22)},
  year      = {2022},
  month     = aug,
  pages     = {625--642},
  publisher = {USENIX Association},
  address   = {Boston, MA},
  isbn      = {978-1-939133-31-1},
}

% Analysis of domain classification services (IMC '20).
% Page range uses the ASCII double hyphen; the accented initial is written
% as a brace-wrapped BibTeX special character.
@inproceedings{vallina2020_misshapes,
    author = {Vallina, Pelayo and Le Pochat, Victor and Feal, {\'A}lvaro and Paraschiv, Marius and Gamba, Julien and Burke, Tim and Hohlfeld, Oliver and Tapiador, Juan and Vallina-Rodriguez, Narseo},
    title = {Mis-shapes, Mistakes, Misfits: An Analysis of Domain Classification Services},
    year = {2020},
    isbn = {9781450381383},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    url = {https://doi.org/10.1145/3419394.3423660},
    doi = {10.1145/3419394.3423660},
    abstract = {Domain classification services have applications in multiple areas, including cybersecurity, content blocking, and targeted advertising. Yet, these services are often a black box in terms of their methodology to classifying domains, which makes it difficult to assess their strengths, aptness for specific applications, and limitations. In this work, we perform a large-scale analysis of 13 popular domain classification services on more than 4.4M hostnames. Our study empirically explores their methodologies, scalability limitations, label constellations, and their suitability to academic research as well as other practical applications such as content filtering. We find that the coverage varies enormously across providers, ranging from over 90\% to below 1\%. All services deviate from their documented taxonomy, hampering sound usage for research. Further, labels are highly inconsistent across providers, who show little agreement over domains, making it difficult to compare or combine these services. We also show how the dynamics of crowd-sourced efforts may be obstructed by scalability and coverage aspects as well as subjective disagreements among human labelers. Finally, through case studies, we showcase that most services are not fit for detecting specialized content for research or content-blocking purposes. We conclude with actionable recommendations on their usage based on our empirical insights and experience. Particularly, we focus on how users should handle the significant disparities observed across services both in technical solutions and in research.},
    booktitle = {Proceedings of the ACM Internet Measurement Conference},
    pages = {598--618},
    numpages = {21},
    location = {Virtual Event, USA},
    series = {IMC '20}
}

% Study of crowdsourced ad-blocking filter lists (journal article, June 2020 issue).
% The journal is stored under its full name so the bibliography style,
% not the database, decides whether to abbreviate it.
@article{snyder2020_who,
  author     = {Snyder, Peter and Vastel, Antoine and Livshits, Ben},
  title      = {Who Filters the Filters: Understanding the Growth, Usefulness and Efficiency of Crowdsourced Ad Blocking},
  journal    = {Proceedings of the ACM on Measurement and Analysis of Computing Systems},
  year       = {2020},
  month      = jun,
  issue_date = {June 2020},
  volume     = {4},
  number     = {2},
  articleno  = {26},
  numpages   = {24},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  doi        = {10.1145/3392144},
  url        = {https://doi.org/10.1145/3392144},
  keywords   = {web privacy, web measurement, filter lists, easylist}
}

% AdGraph: graph-based ad and tracker blocking (IEEE S&P 2020).
% Empty volume/number fields were removed (they only produce empty-field
% warnings) and the page range uses the double hyphen.
@inproceedings{iqbal2020_adgraph,
  author    = {Iqbal, Umar and Snyder, Peter and Zhu, Shitong and Livshits, Benjamin and Qian, Zhiyun and Shafiq, Zubair},
  title     = {{AdGraph}: A Graph-Based Approach to Ad and Tracker Blocking},
  booktitle = {2020 IEEE Symposium on Security and Privacy (SP)},
  year      = {2020},
  pages     = {763--776},
  keywords  = {Advertising;Chromium;Uniform resource locators;Browsers;Privacy;Tools;Robustness},
  doi       = {10.1109/SP40000.2020.00005}
}

% WebGraph: blocking based on advertising and tracking information flows
% (USENIX Security 22).
@inproceedings{siby2022_webgraph,
  title     = {{WebGraph}: Capturing Advertising and Tracking Information Flows for Robust Blocking},
  author    = {Sandra Siby and Umar Iqbal and Steven Englehardt and Zubair Shafiq and Carmela Troncoso},
  booktitle = {31st USENIX Security Symposium (USENIX Security 22)},
  publisher = {USENIX Association},
  address   = {Boston, MA},
  month     = aug,
  year      = {2022},
  pages     = {2875--2892},
  isbn      = {978-1-939133-31-1},
  url       = {https://www.usenix.org/conference/usenixsecurity22/presentation/siby}
}

% Khaleesi: breaking advertising and tracking request chains
% (USENIX Security 22).
@inproceedings{iqbal2022_khaleesi,
  title     = {Khaleesi: Breaker of Advertising and Tracking Request Chains},
  author    = {Umar Iqbal and Charlie Wolfe and Charles Nguyen and Steven Englehardt and Zubair Shafiq},
  booktitle = {31st USENIX Security Symposium (USENIX Security 22)},
  publisher = {USENIX Association},
  address   = {Boston, MA},
  month     = aug,
  year      = {2022},
  pages     = {2911--2928},
  isbn      = {978-1-939133-31-1},
  url       = {https://www.usenix.org/conference/usenixsecurity22/presentation/iqbal}
}

% Researchers' experiences analyzing privacy policies (PoPETs 2023, issue 4).
% The issue number lives in "number", the field standard styles print;
% the apostrophe is plain ASCII so classic 8-bit BibTeX handles it.
@article{mhaidli2023researchers,
    title={Researchers' Experiences in Analyzing Privacy Policies: Challenges and Opportunities},
    author={Mhaidli, Abraham and Fidan, Selin and Doan, An and Herakovic, Gina and Srinath, Mukund and Matheson, Lee and Wilson, Shomir and Schaub, Florian},
    journal={Proceedings on Privacy Enhancing Technologies},
    volume={2023},
    number={4},
    pages={287--305},
    year={2023},
    doi={10.56553/popets-2023-0111},
    url={https://petsymposium.org/popets/2023/popets-2023-0111.php},
}

% Unified privacy policy detection (PoPETs 2021, issue 4).
% Issue number is stored in "number"; the page range uses the ASCII double
% hyphen; the title is kept in Title Case so styles can downcase if needed.
@article{hosseini2021unifying,
    title={Unifying Privacy Policy Detection},
    author={Hosseini, Henry and Degeling, Martin and Utz, Christine and Hupperich, Thomas},
    journal={Proceedings on Privacy Enhancing Technologies},
    doi={10.2478/popets-2021-0081},
    volume={2021},
    number={4},
    pages={480--499},
    year={2021}
}