Copy disabled (too large)
Download .txt
Showing preview only (18,777K chars total). Download the full file to get everything.
Repository: cliqz-oss/whotracks.me
Branch: master
Commit: 44f3b9290cd0
Files: 179
Total size: 17.9 MB
Directory structure:
gitextract_rc25zhaz/
├── .github/
│ └── workflows/
│ └── test.yml
├── .gitignore
├── .tool-versions
├── Dockerfile
├── Jenkinsfile
├── LICENSE.md
├── README.md
├── RIGHT_TO_AMEND.md
├── blog/
│ ├── adblockers_performance_study.md
│ ├── block-third-party-cookies.md
│ ├── cookie-consent.md
│ ├── cookies.md
│ ├── dexie_transaction_bug.md
│ ├── fingerprinting.md
│ ├── gdpr-what-happened.md
│ ├── generating_adblocker_filters.md
│ ├── google_domains.md
│ ├── government_websites_september.md
│ ├── how_cliqz_antitracking_protects_users.md
│ ├── how_facebook_knows_exactly_what_turns_you_on.md
│ ├── manifest_v3_privacy.md
│ ├── private_analytics.md
│ ├── static_site.md
│ ├── static_site_blog.md
│ ├── static_site_generation.md
│ ├── static_site_visualization.md
│ ├── tracker-tax.md
│ ├── tracker_categories.md
│ ├── trackers-who-steal.md
│ ├── trackers_in_your_favorite_site.md
│ ├── tracking_and_ux.md
│ ├── tracking_pixel.md
│ ├── update_apr_2018.md
│ ├── update_dec_2017.md
│ ├── update_feb_2018.md
│ ├── update_jan_2018.md
│ ├── update_jun_2018.md
│ ├── update_may_2018.md
│ ├── updating_our_tracking_prevalence_metrics.md
│ ├── what_is_a_tracker.md
│ └── where_is_the_data_from.md
├── contrib/
│ ├── generating_adblocker_filters.py
│ ├── tracker_map_notebook.ipynb
│ ├── wtm_april_update.ipynb
│ └── wtm_may_update.ipynb
├── deploy_to_s3.py
├── docs/
│ └── local-build.md
├── pyproject.toml
├── static/
│ ├── font-awesome-4.7.0/
│ │ ├── HELP-US-OUT.txt
│ │ ├── css/
│ │ │ └── font-awesome.css
│ │ ├── fonts/
│ │ │ └── FontAwesome.otf
│ │ ├── less/
│ │ │ ├── animated.less
│ │ │ ├── bordered-pulled.less
│ │ │ ├── core.less
│ │ │ ├── fixed-width.less
│ │ │ ├── font-awesome.less
│ │ │ ├── icons.less
│ │ │ ├── larger.less
│ │ │ ├── list.less
│ │ │ ├── mixins.less
│ │ │ ├── path.less
│ │ │ ├── rotated-flipped.less
│ │ │ ├── screen-reader.less
│ │ │ ├── stacked.less
│ │ │ └── variables.less
│ │ └── scss/
│ │ ├── _animated.scss
│ │ ├── _bordered-pulled.scss
│ │ ├── _core.scss
│ │ ├── _fixed-width.scss
│ │ ├── _icons.scss
│ │ ├── _larger.scss
│ │ ├── _list.scss
│ │ ├── _mixins.scss
│ │ ├── _path.scss
│ │ ├── _rotated-flipped.scss
│ │ ├── _screen-reader.scss
│ │ ├── _stacked.scss
│ │ ├── _variables.scss
│ │ └── font-awesome.scss
│ ├── fonts/
│ │ └── RationalTWSemiBold.otf
│ ├── js/
│ │ ├── bootstrap.js
│ │ ├── d3.layout.cloud.js
│ │ ├── explorer.js
│ │ ├── ghostery.js
│ │ ├── highlight.pack.js
│ │ └── search.js
│ └── scss/
│ ├── _colors.scss
│ ├── blog/
│ │ ├── card.scss
│ │ ├── github.scss
│ │ └── post.scss
│ ├── bootstrap.min.scss
│ ├── companies/
│ │ └── reach-chart.scss
│ ├── custom.scss
│ ├── datatables.colReorder.min.scss
│ ├── datatables.min.scss
│ ├── explorer/
│ │ └── table.scss
│ ├── home/
│ │ └── index.scss
│ ├── trackers/
│ │ ├── list.scss
│ │ └── profile.scss
│ └── websites/
│ ├── overview.scss
│ └── profile.scss
├── templates/
│ ├── base.html
│ ├── blog-page.html
│ ├── blog.html
│ ├── company-page.html
│ ├── components/
│ │ ├── blog-card.html
│ │ ├── breadcrumb.html
│ │ ├── category-item.html
│ │ ├── company-card.html
│ │ ├── cookies.html
│ │ ├── fingerprinting.html
│ │ ├── footer.html
│ │ ├── home/
│ │ │ └── header.html
│ │ ├── navbar.html
│ │ ├── tag_cloud.html
│ │ ├── top-5-info-box.html
│ │ ├── top-5-trackers.html
│ │ ├── tracker-list.html
│ │ ├── trackers/
│ │ │ ├── category.html
│ │ │ └── header.html
│ │ ├── tracking-methods.html
│ │ ├── unified-ui-tracker-list.html
│ │ ├── website-list.html
│ │ └── websites/
│ │ ├── header.html
│ │ └── tracker-list.html
│ ├── explorer.html
│ ├── imprint.html
│ ├── index.html
│ ├── not-found.html
│ ├── privacy-policy.html
│ ├── reach-chart-page.html
│ ├── tracker-not-found.html
│ ├── tracker-page.html
│ ├── trackers.html
│ ├── website-not-found.html
│ ├── website-page.html
│ └── websites.html
├── tests/
│ ├── __init__.py
│ ├── test_data_integrity.py
│ ├── test_db_integrity.py
│ ├── test_db_validity.py
│ ├── test_site_categories.py
│ └── test_sites_data.py
├── update_trackerdb.sh
├── update_trackers_preview.py
└── whotracksme/
├── __init__.py
├── data/
│ ├── Readme.md
│ ├── __init__.py
│ ├── assets/
│ │ ├── trackerdb.sql
│ │ └── trackers-preview.json
│ ├── db.py
│ ├── loader.py
│ └── pack.py
├── main.py
├── qa/
│ ├── __init__.py
│ ├── todo.py
│ └── utils.py
└── website/
├── __init__.py
├── api/
│ └── meta.py
├── build/
│ ├── __init__.py
│ ├── blog.py
│ ├── companies.py
│ ├── data.py
│ ├── explorer.py
│ ├── home.py
│ ├── trackers.py
│ └── websites.py
├── builder.py
├── plotting/
│ ├── .vscode/
│ │ └── settings.json
│ ├── __init__.py
│ ├── colors.py
│ ├── companies.py
│ ├── plots.py
│ ├── sankey.py
│ ├── trackers.py
│ └── utils.py
├── serve.py
├── templates.py
└── utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/test.yml
================================================
# CI workflow: runs the test suite and a site build on every push and pull request targeting master.
name: Tests
on:
push:
branches: [master]
pull_request:
branches: [master]
jobs:
test:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
# Installs sass from the distribution's ruby-sass package, together with build-essential.
- name: Install sass
run: |
sudo apt-get update
sudo apt-get install --yes ruby-sass build-essential
# The setup-uv action is pinned to a full commit SHA; the trailing comment records the corresponding tag.
# NOTE(review): Python 3.13 here differs from .tool-versions (3.11.6) and the Dockerfile base image (python:3.11-slim) - confirm this is intentional.
- name: Install uv
uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
with:
python-version: '3.13'
# `uv sync --locked` installs from the lockfile; running `--help` afterwards checks that the CLI entry point starts.
- name: Install dependencies
run: |
uv sync --locked
uv run whotracksme --help
# Copies the 2017-06 and 2021-06 data folders from the data.whotracks.me bucket using unsigned requests,
# into whotracksme/data/assets (the step's working directory).
# NOTE(review): assumes the aws CLI is already available on the runner image - confirm.
- name: Fetch test data assets
run: |
aws --no-sign-request s3 cp --recursive s3://data.whotracks.me/2017-06 2017-06
aws --no-sign-request s3 cp --recursive s3://data.whotracks.me/2021-06 2021-06
working-directory: whotracksme/data/assets
env:
AWS_DEFAULT_REGION: us-east-1
- name: Run tests
run: |
uv run pytest
# Runs the `whotracksme website` command; presumably a full static-site build used as a smoke test.
- name: Check build
run: |
uv run whotracksme website
================================================
FILE: .gitignore
================================================
*.pyc
.cache/
.sass-cache/
__pycache__/
_site/
dist/
whotracksme.egg-info/
.DS_Store
venv/
whotracksme/data/assets/**/*.csv
whotracksme.db
================================================
FILE: .tool-versions
================================================
python 3.11.6
================================================
FILE: Dockerfile
================================================
# Build environment image: Python 3.11 with ruby-sass and native build tooling,
# plus a non-root user whose UID/GID can be overridden at build time.
# Set base image to build upon
FROM python:3.11-slim
# Set arg and env
ARG VERSION
ARG UID=1000
ARG GID=1000
# NOTE(review): the Jenkinsfile passes `--build-arg user=...` (lowercase); build-arg names
# are matched case-sensitively, so that value presumably never reaches USER - confirm.
ARG USER=jenkins
ARG GROUP=jenkins
# Add jenkins user and group
RUN groupadd -g ${GID} ${GROUP} && \
useradd -u ${UID} -g ${GID} -m -s /bin/bash ${USER}
# Set labels to identify image
LABEL vendor="Ghostery GmbH" \
maintainer="chrmod@ghostery.com" \
version=${VERSION}
# Install system packages, then remove apt lists and caches to keep the layer small.
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
libffi-dev \
ruby-sass \
&& \
rm -rf /var/lib/apt/lists/* && \
rm -f /var/cache/apt/*.bin
# Copy application python requirements
# NOTE(review): requirements-dev.txt does not appear in the repository tree (a pyproject.toml
# is present instead), so this COPY presumably fails unless the file is supplied - confirm.
# NOTE(review): the destination is hard-coded to /home/jenkins/ and does not follow ${USER}.
COPY requirements-dev.txt /home/jenkins/
# Install python dependencies
RUN pip install -r /home/jenkins/requirements-dev.txt
================================================
FILE: Jenkinsfile
================================================
// Scripted pipeline: check out the repository, download datasets, build the Docker image,
// then install, test and build the site inside the image; master builds are published to S3.
def testReport = 'test-report.xml'
// NOTE(review): stagingBucket and stagingPrefix are not referenced anywhere else in this file.
def stagingBucket = 'internal.clyqz.com'
def stagingPrefix = '/docs/whotracksme'
def productionBucket = 'whotracksme'
def productionPrefix = ''
node('magrathea') {
stage ('Checkout') {
// Checks out the current branch with Git LFS pull enabled; the refspec also fetches pull-request heads and tags.
checkout([
$class: 'GitSCM',
branches: [[name: 'refs/heads/'+env.BRANCH_NAME]],
extensions: [[$class: 'GitLFSPull']],
userRemoteConfigs: [
[refspec: '+refs/heads/*:refs/remotes/origin/* +refs/pull/*/head:refs/remotes/origin/PR-* +refs/tags/*:refs/remotes/origin/*',
url: 'https://github.com/ghostery/whotracks.me.git']
]
])
}
def img
stage('Download Datasets') {
// Syncs the whole data.whotracks.me bucket (unsigned requests) into whotracksme/data/assets.
dir('whotracksme/data/assets') {
sh('aws s3 sync --no-sign-request --no-progress s3://data.whotracks.me/ .')
}
}
stage('Build Docker Image') {
// UID/GID of the invoking user are forwarded as build args.
// NOTE(review): the Dockerfile declares `ARG USER` (uppercase) while `user=` is passed here - confirm the name match.
img = docker.build('whotracksme', '. --build-arg user=`whoami` --build-arg UID=`id -u` --build-arg GID=`id -g`')
}
img.inside() {
try {
stage('Install') {
sh("python -m pip install --user -e '.[dev]'")
}
stage('Test') {
try {
sh(script: "pytest --junit-xml=${testReport}")
} catch(err) {
// The error is not rethrown: the report is published, the build is marked FAILURE,
// and the following stages (including publishing on master) still run.
junit(testReport)
currentBuild.result = "FAILURE"
}
}
stage('Build site') {
// Uses the console script from the --user install performed in the Install stage.
sh('/home/jenkins/.local/bin/whotracksme website')
}
// Publishing only happens for builds of the master branch.
if (env.BRANCH_NAME == 'master') {
withCredentials([[
$class: 'AmazonWebServicesCredentialsBinding',
accessKeyVariable: 'AWS_ACCESS_KEY_ID',
credentialsId: '04e892d6-1f78-400e-9908-1e9466e238a9',
secretKeyVariable: 'AWS_SECRET_ACCESS_KEY'
]]) {
stage('Publish Site') {
// productionPrefix is an empty string, so it adds no argument to the shell command line.
sh("python deploy_to_s3.py ${productionBucket} ${productionPrefix} --production")
}
}
}
} finally {
// cleanup
sh('rm -rf _site; rm -rf .sass-cache')
}
}
// Publishes the JUnit report after the container block (a second time when the Test stage already failed).
junit(testReport)
}
================================================
FILE: LICENSE.md
================================================
MIT License
Copyright (c) 2017 - to present Ghostery GmbH
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
<p align="center">
<img src="https://raw.githubusercontent.com/ghostery/whotracks.me/master/static/img/who-tracksme-logo.png" width="300px" alt="WhoTracks.Me" />
</p>
<h3 align="center">Bringing Transparency to Online Tracking</h3>
<p align="center">
<em>
Transparency
· Privacy
· Tracking landscape
· Built by Ghostery
</em>
<br />
<em>
<a href="https://www.ghostery.com/whotracksme/trackers" target="_blank" rel="noopener noreferrer">Trackers</a>
· <a href="https://www.ghostery.com/whotracksme/websites" target="_blank" rel="noopener noreferrer">Websites</a>
· <a href="https://www.ghostery.com/whotracksme/explorer" target="_blank" rel="noopener noreferrer">Explorer</a>
</em>
</p>
<p align="center">
<a href="https://www.ghostery.com" target="_blank" rel="noopener noreferrer">
<img alt="powered by Ghostery" src="https://img.shields.io/badge/ghostery-powered-blue?logo=ghostery&style=flat-square">
</a>
<a href="https://github.com/ghostery/whotracks.me/blob/master/LICENSE.md">
<img alt="License Badge" src="https://img.shields.io/github/license/ghostery/whotracks.me?style=flat-square"></a>
</p>
# Downloading the data
Each month, we release a new version of the web site. The data from the last month can be directly [accessed through the website](https://www.ghostery.com/whotracksme/explorer).
The raw data, from which the graphs have been computed, is also available as an open data set (updated every month). You can also
download historical data. More information on the raw data can be found [here](whotracksme/data/Readme.md).
WhoTracks.me also builds heavily on another open source project called [TrackerDB](https://github.com/ghostery/trackerdb);
all meta data (e.g. company descriptions) is maintained there.
# Using the data
You can directly use the [raw data](whotracksme/data/Readme.md), which are all text files. As an alternative, you can also
download it locally and use the Python API:
```
uv sync --frozen
. .venv/bin/activate
```
The Python API can now be accessed as follows (make sure you have already downloaded data):
```python
from whotracksme.data.loader import DataSource
data = DataSource()
# available entities
data.trackers
data.companies
data.sites
```
A whitepaper for WhoTracks.me is available at https://arxiv.org/abs/1804.08959, and here's a BibTeX entry that you can use to cite it in a publication:
```
@misc{whotracksme,
title={WhoTracks.Me: Shedding light on the opaque world of online tracking},
author={Arjaldo Karaj and Sam Macbeth and Rémi Berson and Josep M. Pujol},
year={2018},
eprint={1804.08959},
archivePrefix={arXiv},
primaryClass={cs.CY}
}
```
# Contributing
We rely on contributions from the community to keep the quality of this project high. If you want, you can support us in multiple ways:
* Do you see inconsistencies in the data? Please open a Github issue [here](https://github.com/whotracksme/whotracks.me/issues). We will have a look!
* Do you see wrong company descriptions? Did we put something in the wrong category? Please check out the [TrackerDB project](https://github.com/ghostery/trackerdb), where all the meta data is kept, and open an [issue](https://github.com/ghostery/trackerdb/issues), or send us a pull request.
* Do you have any feedback on the [WhoTracks.me homepage](https://www.ghostery.com/whotracksme) or about the documentation? Please, let us know, so we can improve.
You can also contact us via email at [info@whotracks.me](mailto:info@whotracks.me)
# Right to Amend
Please read our [Guideline for 3rd parties](https://github.com/ghostery/whotracks.me/blob/master/RIGHT_TO_AMEND.md) wanting to suggest
corrections to their data.
# Local builds
[Readme on local builds](docs/local-build.md) (this is mostly relevant for the maintainer of this project)
# License
The content of this project itself is licensed under the [Creative
Commons Attribution 4.0 license](https://creativecommons.org/licenses/by/4.0/), and the underlying source code used
to generate and display that content is licensed under the [MIT
license](https://github.com/ghostery/whotracks.me/blob/master/LICENSE.md).
================================================
FILE: RIGHT_TO_AMEND.md
================================================
# Right to Amend
A Guideline for 3rd parties wanting to suggest corrections to their data
whotracks.me has already grown to be the most comprehensive transparency
and monitoring tool on online tracking that we are aware of. It already
features longitudinal data of more than 800 million page loads.
We are constantly working on improving the quality of our database, and at this
scale, it is possible to have minor inaccuracies. Hence, we are always happy to
amend incorrect entries when pointed out.
This is a guideline for 3rd parties wanting to suggest corrections to their
data.
1. Please reach out by opening an issue in the repository.
2. Identify yourself, and your role in the organization on behalf of which you are
proposing an amendment of the data.
3. Provide urls to the whotracks.me pages that contain inaccuracies (e.g. https://whotracks.me/trackers/name-of-tracker)
4. Clearly list the inaccurac(y/ies), and for each write a sentence or two providing
evidence/justification w.r.t. why the current information is not correct.
We dislike inaccurate information just as much as you do, and
will do our best to resolve the issue as soon as we can.
================================================
FILE: blog/adblockers_performance_study.md
================================================
title: Adblockers Performance Study
subtitle:
description: A detailed comparison of popular adblockers with tips to help you block ads effectively and improve your digital privacy.
author:
type: article
publish: True
date: 2019-02-15
tags: blog, adblocker, performance
header_img: blog/adblocker-perf-study.jpg
redirect_url: https://www.ghostery.com/blog/adblockers-performance-study
+++
_In this study, we show_
* _That all popular content-blockers are very efficient, having sub-millisecond
median decision time per request_
* _That the manifest v3 performance claim is inaccurate based on our measurements_
* _That the ad blocker used by Cliqz and [Ghostery](https://www.ghostery.com/) consistently performs as well
or better than other popular content-blockers_
* _How to block ads effectively and improve your digital privacy._
---
## About the adblocker performance study
Here, we present a detailed analysis of the performance of some of the
most popular adblockers and content-blocker engines: *uBlock Origin*, *Adblock Plus*,
*Brave*, *DuckDuckGo* and *Cliqz/Ghostery's* advanced adblocker (shipped
since Ghostery 8), which we will refer to as *Ghostery* for the rest of
the article.
## Why was the study conducted?
This study was motivated by the recent [Manifest V3
controversy](https://bugs.chromium.org/p/chromium/issues/detail?id=896897).
One of the proposed changes involves crippling the WebRequest APIs to
limit their blocking abilities.
Two justifications were put forth:
one related to *performance* and another related to privacy. The
privacy argument deserves its own separate analysis and will not be
covered here.
### What were the findings?
In this study, we show that the *performance* argument
does not hold. Our comparison demonstrates that the most popular
content-blockers and adblockers are already very efficient (having a sub-millisecond
median decision time per request) and should not result in any
overhead noticeable by users.
We showed in another study [The Tracker
Tax](https://www.ghostery.com/lp/trackertax/) that blocking ads and
trackers actually reduces the loading time of websites by **up to
a factor of 2**. Besides, efficiency is continuously improved and
technologies such as WebAssembly will enable further optimizations.
### What did the study compare?
This comparison does not involve full extensions, but instead **focuses
on network request-blocking engines**. This is the most CPU-intensive
task performed by content-blockers (in particular, this does not account
for cosmetics engines or subscription management).
Here are the home pages for all content-blockers compared:
* Ghostery and Cliqz's [adblocker v0.6.9](https://github.com/cliqz-oss/adblocker/commit/58d89689af95d09e02a52e57aceb75151153d4ab).
* Brave's [ad-block 4.1.3](https://github.com/brave/ad-block/commit/cfb714387fef649bd4ec7c1242ae442d58e4d41f).
* DuckDuckGo's [abp-filter-parser 0.2.0](https://github.com/duckduckgo/abp-filter-parser/commit/01a864e84f472e31b9f5c47bbc05a7d75ee1ca62).
* uBlock Origin commit [29b10d2](https://github.com/gorhill/uBlock/commit/29b10d215184aef1a9a12b715b47de9656ecdc3c).
* AdblockPlus' [adblockpluscore 34c49bb](https://github.com/adblockplus/adblockpluscore/commit/34c49bbf029e586226220c067c50cec6e8bf8842).
We did not include native blockers from Chromium and Safari projects
as this would require some significant effort to package them in a way
that allows benchmarking against the other libraries. We leave this for
future work.
### How were the adblockers compared?
Every adblocker, except *uBlock Origin*, is available as a JavaScript library which
can be loaded in Node.js. To allow a comparison of *uBlock Origin*, we had to
extract the static network filtering engine [out of the
extension](https://github.com/cliqz-oss/adblocker/blob/master/bench/comparison/ublock.js).
The version of *uBlock Origin* running in this benchmark *does not make
use of the WebAssembly* version of domain matching.
All benchmarks were run on an X1 Carbon 2016 (i7 U6600 + 16 GB) in
Node.js 11.9.0. Memory measurements were performed in Google Chrome version
72.0.3626.96 using the memory snapshot tool.
## Results
Before presenting the detailed analysis of the results, let us highlight
our findings in a nutshell:
- All content-blockers except *DuckDuckGo* have **sub-millisecond median decision
time** per request.
- **Time to Process a Request in Ghostery** (median): **0.007 ms**
- 2.7x faster than *uBlock Origin*
- 2.9x faster than *Adblock Plus*
- 6.3x faster than *Brave*
- 1258.4x faster than *DuckDuckGo*
- **Loading Ghostery's Blocking Engine** (from cache): **0.03 ms**
- 368x faster than *Brave*
- 588x faster than *uBlock Origin*
- 3575x faster than *Adblock Plus*
- *DuckDuckGo*'s engine does not offer serialization, so the loading cost is always the one from parsing the lists.
- **Memory Consumption of Ghostery's Blocking Engine** (at startup, in Chrome): **1.8 MB**
- 1.6x less memory than *uBlock Origin*
- 8.4x less memory than *Adblock Plus*
- 8.8x less memory than *DuckDuckGo*
- The memory usage of *Brave* could not be evaluated using the devtools
and thus is not included in this section.
### 0. About the Dataset
To measure the performance of each adblocker, we replayed requests
from popular domains and tracked the time it took to decide
if they should be blocked or not.
We then analyzed the results in three
different ways: all requests, blocked only and not blocked (taken from
the same run).
### How the dataset was created
The dataset was created using a pool of Chrome
headless browsers (driven by the [`puppeteer` library](https://github.com/GoogleChrome/puppeteer))
to visit home pages of the *top 500 domains* (as reported by Cliqz
Search), as well as up to 3 pages of each domain (picked randomly from
the home page). All the network requests seen (URL, frame
URL and type) were collected.
The dataset was shuffled in such a way that the different
pages were visited in a random order, but requests seen on each page
were replayed as they were recorded initially.
In summary:
* The dataset is composed of 242944 requests.
* We released the data publicly at
this URL: [requests_top500.json.gz](https://cdn.cliqz.com/adblocking/requests_top500.json.gz).
* The script to create the dataset is also available:
[create_dataset.js](https://github.com/cliqz-oss/adblocker/blob/master/bench/comparison/create_dataset.js) and
[shuffle_dataset.js](https://github.com/cliqz-oss/adblocker/blob/master/bench/comparison/shuffle_dataset.js) was used to shuffle the
requests to produce the final data.
### 1. Composition of Requests
For the purpose of this comparison, we consider that each network
request can be either blocked or allowed by the content-blocker; we call
the process of deciding whether a request should be blocked or not:
*matching*.
We observed that from our dataset, only ~19.2% are blocked
(average across all content-blockers).
<img class="img-responsive" src="../static/img/blog/adblockers_performance/requests-composition.svg" alt="Composition of requests" />
### Key takeaway: how to make an adblocker more effective
This observation suggests that content-blockers will perform better on
average if they can efficiently decide which requests to *not block*.
The filters used to determine whether or not a request is to be blocked
are the ones from [Easylist](https://easylist-downloads.adblockplus.org/easylist.txt),
where we removed all the cosmetic rules before running the benchmarks.
The final list contains *38978 network filters* and is available here:
[easylist.txt](https://github.com/cliqz-oss/adblocker/blob/master/bench/comparison/easylist.txt).
It should be noted at this point that a larger proportion of requests
would be blocked by enabling extra filters lists such as *EasyPrivacy*.
### 2. Time To Match All Requests
We first look at all of the requests (whether they will eventually
be blocked or not).
We use a log scale for the x-axis (time in
milliseconds) to facilitate the comparison of the cumulative
distribution of the time it takes for adblockers to decide whether
or not a request should be blocked.
Here is a break-down of the 99th percentile and median times for each
content-blocker:
<table class="table table-hover">
<thead>
<tr>
<th></th>
<th>99% OF REQUESTS</th>
<th>MEDIAN</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Ghostery</strong></td>
<td><strong>0.050ms</strong></td>
<td><strong>0.007ms</strong></td>
</tr>
<tr>
<td>uBlock Origin</td>
<td>0.124ms (<strong>2.5x slower</strong>)</td>
<td>0.017ms (<strong>2.7x slower</strong>)</td>
</tr>
<tr>
<td>Adblock Plus</td>
<td>0.103ms (<strong>2.1x slower</strong>)</td>
<td>0.019ms (<strong>2.9x slower</strong>)</td>
</tr>
<tr>
<td>Brave</td>
<td>1.288ms (<strong>25.9x slower</strong>)</td>
<td>0.041ms (<strong>6.3x slower</strong>)</td>
</tr>
<tr>
<td>DuckDuckGo</td>
<td>12.085ms (<strong>242.5x slower</strong>)</td>
<td>8.270ms (<strong>1258.4x slower</strong>)</td>
</tr>
</tbody>
</table>
Below you can find the cumulative distribution plots of these timings:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/ghostery-ublock-origin-brave-duckduckgo-adblock-plus-all.svg" alt="Time To Match All Requests" />
### 3. Time To Match Requests Which Are Not Blocked
The following table details 99th percentile and median timings for requests not
blocked:
<table class="table table-hover">
<thead>
<tr>
<th></th>
<th>99% OF REQUESTS</th>
<th>MEDIAN</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Ghostery</strong></td>
<td><strong>0.049ms</strong></td>
<td><strong>0.006ms</strong></td>
</tr>
<tr>
<td>uBlock Origin</td>
<td>0.112ms (<strong>2.3x slower</strong>)</td>
<td>0.018ms (<strong>2.8x slower</strong>)</td>
</tr>
<tr>
<td>Adblock Plus</td>
<td>0.105ms (<strong>2.2x slower</strong>)</td>
<td>0.020ms (<strong>3.1x slower</strong>)</td>
</tr>
<tr>
<td>Brave</td>
<td>1.270ms (<strong>26.2x slower</strong>)</td>
<td>0.038ms (<strong>5.9x slower</strong>)</td>
</tr>
<tr>
<td>DuckDuckGo</td>
<td>11.190ms (<strong>230.5x slower</strong>)</td>
<td>6.781ms (<strong>1060.5x slower</strong>)</td>
</tr>
</tbody>
</table>
<img class="img-responsive" src="../static/img/blog/adblockers_performance/ghostery-ublock-origin-brave-duckduckgo-adblock-plus-not-blocked.svg" alt="Time to match requests which are not blocked" />
### 4. Time To Match Requests Which Are Blocked
The following table details 99th percentile and median timings for requests blocked:
<table class="table table-hover">
<thead>
<tr>
<th></th>
<th>99% OF REQUESTS</th>
<th>MEDIAN</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Ghostery</strong></td>
<td><strong>0.052ms</strong></td>
<td><strong>0.007ms</strong></td>
</tr>
<tr>
<td>uBlock Origin</td>
<td>0.165ms (<strong>3.1x slower</strong>)</td>
<td>0.016ms (<strong>2.2x slower</strong>)</td>
</tr>
<tr>
<td>Adblock Plus</td>
<td>0.099ms (<strong>1.9x slower</strong>)</td>
<td>0.014ms (<strong>1.9x slower</strong>)</td>
</tr>
<tr>
<td>Brave</td>
<td>1.468ms (<strong>28.0x slower</strong>)</td>
<td>0.062ms (<strong>8.5x slower</strong>)</td>
</tr>
<tr>
<td>DuckDuckGo</td>
<td>13.025ms (<strong>248.5x slower</strong>)</td>
<td>8.31ms (<strong>1130.6x slower</strong>)</td>
</tr>
</tbody>
</table>
<img class="img-responsive" src="../static/img/blog/adblockers_performance/ghostery-ublock-origin-brave-duckduckgo-adblock-plus-blocked.svg" alt="Time to match requests which are blocked" />
### Summary of findings: How the adblockers performed
On these graphs, we observe a plateau for *Adblock Plus*, *Brave* and
*DuckDuckGo*.
This can be explained by the fact that these engines
implement some form of caching internally, thus having a very fast
response time for some requests (redundancy in
requests comes from both common third parties seen on multiple websites
as well as the fact that we load several pages for each domain).
This caching can be implemented on top of any content-blocker and does not
tell much about the efficiency of each. We can see this as a means to
trade *memory* against *CPU usage*.
### Ghostery’s adblocker performance
From the previous measurements, we see that Ghostery outperforms other
libraries in terms of matching speed. Without going into too many
details, here are some of the optimizations which can explain these
results:
* Ghostery makes use of a reverse index associating tokens to filters. Contrary
to other libraries, we make sure that we pick *the best* token for each filter
at construction time (best being defined as the *least seen token*). This incurs
a one-time extra cost but results in maximized dispatching capabilities.
* Filters are stored in a very compact form, in typed arrays, and only loaded in
memory lazily, when there is a chance they will match a request (if we encounter
identical tokens in URLs).
* Filters loaded in memory are optimized on-the-fly and multiple filters can be
combined for increased efficiency. The optimizations were carefully crafted
based on common cases observed in Easylist.
### 5. Serialization And Deserialization
In this section, we have a look at the performance of content-blockers
when it comes to serializing their internal representation for faster
subsequent loading.
Only *DuckDuckGo*'s engine does not provide this
feature. *uBlock Origin*, *Ghostery*, *Adblock Plus* and *Brave* all allow
serializing or caching (*uBlock Origin*'s terminology is: *selfies*) the
entire blocking engine to either a string or a buffer, which can then be
used to speed up subsequent loads.
### The impact of load time on user experience
As this is a one-time operation, having a higher loading time does not
impact desktop users significantly. On the other hand, the ability to quickly
initialize the content-blocker is critical on mobile.
### Comparison of serialization and deserialization times
Another use-case allowed by such capability is to perform the parsing
of the lists on the backend and ship the serialized form of the
content-blocker to clients directly. This removes the cost of
initialization completely.
We performed 100 serializations for each content-blocker and display the
results below:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/ghostery-ublock-origin-brave-adblock-plus-serializationtimings.svg" alt="Serialization timings" />
This bar plot contains the median time taken to serialize the engine for each
content-blocker:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/serializationtimings.svg" alt="Serialization timings" />
Similarly, we measure the time it takes to restore the content-blocker from its
serialized form:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/ghostery-ublock-origin-brave-adblock-plus-deserializationtimings.svg" alt="Deserialization timings" />
And here is the median time:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/deserializationtimings.svg" alt="Deserialization timings" />
Last but not least, we measured the size of the serialized buffer for each
content-blocker:
<img class="img-responsive" src="../static/img/blog/adblockers_performance/cache-size.svg" alt="Cache size" />
From these measurements, we see that *Ghostery* offers both significantly
faster serialization and deserialization times as well as a smaller
cache size.
The reason is the following:
* The internal representation is already
mostly stored in a compact form (using typed arrays).
* This means that serialization only adds a small amount of metadata
alongside the already available arrays. Deserialization is
*essentially instantaneous* since it's enough to create some typed array
views on top of the serialized buffer (think of `mmap` but using typed
arrays).
* This also explains the very low memory consumption: after
initialization, the memory footprint is only slightly higher than the size
of the serialized form.
### 6. Memory Consumption at Start-up
Here, we consider the memory usage of each content-blocker, initialized
from lists (not from cache) after one full garbage collection.
### How memory consumption was measured
The measurements were performed using Chrome's devtools memory snapshot. We
did not measure *Brave* here since the memory used from C++ side does not
seem to be taken into account in the snapshot. Keep in mind that
this memory usage can vary at run-time as content-blockers might cache
frequently used resources, etc.
<img class="img-responsive" src="../static/img/blog/adblockers_performance/memory-usage-at-startup.svg" alt="Memory usage at start-up" />
As mentioned in the previous section on serialization, the very low
memory usage of *Ghostery* can be explained by the fact that the
internal representation mostly consists of very compact typed arrays
with some small overhead for extra meta-data.
Again, we need to stress
here that this measures the network filtering engine of Ghostery only,
not the full extension, as described in the introduction.
### 7. Parsing Lists
In this graph, we present the time it takes for each content-blocker to
be initialized from the lists (without any prior caching, which means
initializing all internal resources by parsing the raw list).
We see that only *Brave* seems to be significantly slower and that *uBlock Origin*,
*Ghostery*, *Adblock Plus* and *DuckDuckGo* all perform well.
<img class="img-responsive" src="../static/img/blog/adblockers_performance/time-to-parse-easylist-all.svg" alt="Time to parse Easylist" />
It seems that the long parsing time for *Brave* is a [known
issue](https://github.com/brave/ad-block/issues/158) tracked on their
GitHub repository.
### Key findings on parsing lists
If we remove *Brave*, we see that there are still differences between
*uBlock Origin*, *Ghostery*, *Adblock Plus* and *DuckDuckGo*. One reason
*Ghostery* is slower than *uBlock Origin* and *Adblock Plus* here is that to
achieve maximum performance while matching as well as minimize memory
usage, there is a bit more work to do up-front.
In practice, this does
not matter so much since it is a one-time operation and subsequent
loads are performed from cache. This is really fast. In fact, we
can even perform the parsing backend-side and just ship the serialized
version of the blocker, which removes this step completely.
<img class="img-responsive" src="../static/img/blog/adblockers_performance/time-to-parse-easylist-without-brave.svg" alt="Time to parse Easylist without Brave" />
### 8. Conclusion
In this study, we looked closely at the performance of some of the most
popular content-blockers in use today. In particular, we focused on the
efficiency of their network filtering engines, which is the most CPU
intensive task they perform.
This work was motivated by one of the claims formulated in the [Manifest V3
proposal](https://bugs.chromium.org/p/chromium/issues/detail?id=896897)
of the Chromium project: *"the extension then performs arbitrary (and
potentially very slow) JavaScript"*, talking about content-blockers'
ability to process all network requests.
### Key findings:
* We do not think the Manifest V3 claim is accurate as all popular content-blockers are already
very efficient and should not incur any noticeable slow-down for users.
* Moreover, the efficiency of content-blockers is *continuously improving*,
either thanks to more innovative approaches or using technologies like
WebAssembly to reach native performance.
* While most content-blockers are indeed efficient, they are not
equivalent. We observed that *Ghostery* performs consistently as well
or better across all dimensions, often surpassing other libraries.
We hope these benchmarks will give content-blocker developers the opportunity
to measure their own progress against other popular libraries.
This will benefit all users, no matter which extension they use, as the efficiency of
content-blockers improves.
[*Edit of 20-02-2019*](https://github.com/ghostery/whotracks.me/pull/154): The study has been updated with the specific version of each content-blocker measured.
[*Edit of 15-03-2019*](https://github.com/ghostery/whotracks.me/pull/161): DuckDuckGo's description has been amended to more accurately describe the way their content-blocker is used in practice: focusing on blocking third-party trackers, but not ads.
================================================
FILE: blog/block-third-party-cookies.md
================================================
title: Third-party cookies - the guests who won't leave
subtitle: How the web ecosystem is preventing us from reverting the third-party cookie mistake.
author: privacy team
type: primer
publish: True
date: 2018-08-27
tags: blog, cookies
header_img: blog/blog-third-party-cookies.jpg
+++
Summary: _In this post we describe_
* _How third-party cookies are the cause of multiple privacy and security issues on the web._
* _That, despite the cookie spec writers recommending against it, all major browsers ended up allowing third-party cookies by default._
* _How several major sites and services, including those from Microsoft and Google, fail badly when third-party cookies are blocked._
* _How Cliqz and Ghostery are defusing the privacy issues of third-party cookies by blocking them, while preventing the breakage issues caused by developers assuming a cookie free-for-all._
---
Cookies are a fundamental browser technology which enables state to be kept between browser and servers over the normally stateless HTTP protocol. Cookies enable sites to remember your login, what you've put in your shopping cart, and allow a site to remember language or currency preferences.
These features enable the sites you're visiting, 'first parties', to improve your experience and provide some content behind authentication, only accessible to you. However, browsers also, by default, send cookies to any third-parties embedded by the site operators. In some cases these can be used to allow third-party widgets, such as [Disqus](../trackers/disqus.html) comments, to automatically log you in to embedded content in the page. However, it also enables these third-parties to track your browsing across the web.
Allowing cookies to third-parties opens up a privacy hole in your browser. On [many sites](../websites.html), just visiting a page will set cookies for over 50 different third-party domains. Each of these is setting cookies so they can [correlate requests](./how_facebook_knows_exactly_what_turns_you_on.html) coming from your browser over days, months, or even years. For example, when you visit any page with a Facebook widget (or visit Facebook itself), they will set a cookie which will only expire in 2 years' time. Some google.com cookies expire in 20 years. The `facebook.com` and `google.com` domains are present as a third-party to [24%](../trackers/facebook.html) and [30%](../google.html) of page loads
on the web respectively, allowing these services to track this proportion of the average user's web browsing history.
<div class="row">
<div class="col-md-6 col-xs-12">
<img class="img-responsive" src="../static/img/blog/cookie_block/google_cookie.png" alt="Google's consent cookie" />
</div>
<div class="col-md-6 col-xs-12">
<img class="img-responsive" src="../static/img/blog/cookie_block/economist_cookie.png" alt="A user identifier cookie on the Economist which expires in 2086"/>
</div>
</div>
<p class="img-caption">Google's consent cookie lasts for 20 years; A tracking cookie on the Economist which lasts for 68 years.</p>
Third-party cookies also represent a security risk to you. [Cross-site request forgery](https://en.wikipedia.org/wiki/Cross-site_request_forgery) (CSRF) attacks are based on the idea that I can make a third-party request to a site that the browser has previously authenticated with, and the browser will send the credentials with the request. If browsers did not allow third-party cookies these attacks would be much harder to exploit than they currently are. These kinds of attacks have been around for over 15 years, and methods to mitigate them are [still being proposed](https://blog.mozilla.org/security/2018/04/24/same-site-cookies-in-firefox-60/), while
browser-side protections, such as [first-party isolation](https://wiki.mozilla.org/Security/FirstPartyIsolation), have very limited distribution.
Furthermore, the use-cases which legitimately use third-party cookies, like Single-Sign-On portals, or third-party authentication mechanisms, have alternatives which do not require cookies. Sites using a centralised authentication domain can obtain authentication tokens via first-party redirects, and OAuth[^2] can be used to log in to sites using third-party credentials. These mechanisms have the added bonus of transparency and implied consent: When a user logs in with Facebook on a site, the user is actively allowing this connection between the site and Facebook to proceed.
So why do we have third-party cookies? Actually, the original 1997 [RFC Specification](https://tools.ietf.org/html/rfc2109) of the cookie standard proposed that third-party cookies should not be allowed on privacy grounds:
> This restriction prevents a malicious service author from using unverifiable transactions to induce a user agent to start or continue a session with a server in a different domain. The starting or continuation of such sessions could be contrary to the privacy expectations of the user, and could also be a security problem.
and browsers should have this setting by default:
> User agents may offer configurable options that allow the user agent, or any autonomous programs that the user agent executes, to ignore the above rule, so long as these override options default to "off".
However, these recommendations were not implemented by browser developers at that time, and the default of _'allow all cookies'_ has remained since then.
Currently, almost all major browsers have a default to allow all cookies. The one exception is Safari, which only allows third-party cookies for domains which have been visited as a first party. This setting mitigates tracking from unknown domains, but still allows others to track, and does not prevent CSRF attacks. Mozilla also previously [attempted](https://blog.mozilla.org/netpolicy/2013/02/25/firefox-getting-smarter-about-third-party-cookies/) to change Firefox's default handling of third-party cookies in 2013, but pressure from the Ad industry led to a [U-turn](https://blog.mozilla.org/blog/2013/05/10/personalization-with-respect/) before these changes went live. The failure of browsers to handle third-party cookie tracking [has arguably led](https://medium.com/the-graph/how-to-reverse-publisher-revenue-drain-c33e41bf0665) to the increase in adblocker usage since then.
The effect that this default has had over the last 20 years, is that developers now assume that cookies are allowed in all contexts. This causes many workflows to break once this assumption is broken. This leads to a vicious cycle, where attempts to limit third-party cookies are foiled because they break too many sites. Apple's push to reduce third-party cookie tracking with their [Intelligent Tracking Prevention](https://webkit.org/blog/7675/intelligent-tracking-prevention/) technology had to include a section to explain to developers how to solve several use-cases when their cookies are limited. This technology still allows third-party cookies from visited sites however, and this method is also recommended for implementing single sign-ons.
## Moving away from third-party cookies
In 2015 Cliqz[^1] released an anti-tracking technology which [aggressively blocks third-party cookies](./how_cliqz_antitracking_protects_users.html). Third-party cookies are blocked unless certain heuristics are triggered. These heuristics aim to mitigate common cases where cookie blocking breaks workflows, but also require user action to trigger. A Facebook button can be loaded without cookies, but if the user then clicks on it, there is an implied consent to allow the cookies in this case. This method blocks 97% of third-party cookies, with minimal breakage of pages.
<table class="table table-hover">
<thead>
<tr>
<th>Browser</th>
<th>Default Cookie setting</th>
</tr>
</thead>
<tbody>
<tr>
<td>Google Chrome</td>
<td>Allow all.</td>
</tr>
<tr>
<td>Mozilla Firefox</td>
<td>Allow all.</td>
</tr>
<tr>
<td>Apple Safari</td>
<td>Allow from visited; tracking cookies limited.</td>
</tr>
<tr>
<td>Cliqz Browser / Ghostery extension</td>
<td>Block all third-party, unless user interaction or compatibility exception.</td>
</tr>
</tbody>
</table>
In December 2017, this technology was included in the [Ghostery 8 release](https://www.ghostery.com/blog/product-releases/browse-smarter-with-ghostery-8/). This increased the number of users with this aggressive cookie blocking behaviour and this increased exposure also highlighted more cases where cookie blocking causes problems for websites. In many cases it may not be surprising that developers have not considered or tested the possibility of third-party cookies not being allowed. What surprised us though, is that this is so pervasive that the biggest players fail to handle cookies properly, in some cases causing critical bugs.
At present, if you browse the web with third-party cookies disabled, you may come across issues logging in and making payments. Here we outline several cases we have found involving major tech companies (which should be able to solve these issues), which have consequences ranging from just preventing login or completing payment, to potentially leaking private company data from a Microsoft Office account.
### Microsoft Logout Issues
We found that, when all third-party cookies are disabled, logging out on office.com seems to succeed, but actually fails. Additionally, this authentication remains available when subsequently navigating to office.com, which can be used to extract data from the SharePoint API. This issue was submitted to Microsoft but was rejected on the grounds that it is not remotely exploitable. It is however a risk on any shared computer that a subsequent user would be able to see document metadata for the organisation, and perhaps modify data via the SharePoint API.
This can be simply reproduced by disabling third-party cookies in any browser, then logging in and then out again on office.com.
<img class="img-responsive" src="../static/img/blog/cookie_block/office_signed_out.png" alt="Office logged out message" />
<p class="img-caption">Looks like I'm logged out...</p>
After the confirmation of a successful logout, simply navigate back to www.office.com, and one is returned to the view after login, including an up-to-date feed of recently changed documents.
<img class="img-responsive" src="../static/img/blog/cookie_block/office_documents.png" alt="Office documents shown after logout" />
<p class="img-caption">Document change feed still shown after logout.</p>
The API that makes this information available after logout is fetched via the [SharePoint REST API](https://docs.microsoft.com/en-us/sharepoint/dev/sp-add-ins/sharepoint-net-server-csom-jsom-and-rest-api-index), and the authentication token for this is not deleted nor expired after the failed logout - hence the page can continue to access this information. The token can be collected from the developer tools and then reused for API calls, for example to list folders in this organisation's SharePoint:
```javascript
var accessToken = "eyJ0...";
var baseUrl = 'https://org-my.sharepoint.com/_api/';
var headers = new Headers();
headers.append('Authorization', `Bearer ${accessToken}`);
headers.append('Accept', 'application/json;odata=verbose');
fetch(`${baseUrl}web/lists`, { headers })
.then(resp => resp.json())
.then(res => console.log(res))
```
The broken logout state can only be resolved by manually deleting office.com cookies. We also found the session may eventually be expired, but this only happened after multiple hours. Hence, users affected by this will 1) likely not be aware that they're not logged out properly, as the logout appears to be successful, and 2) would not be able to logout anyway if they noticed the issue.
The issues with Office continue when trying to purchase an Office365 trial from `https://products.office.com/try`. This time, the source of the problem is detected, but the user is given no choice to continue unless they compromise their security and privacy by enabling cookies. Ironically, they also imply that allowing third-party cookies is somehow safer.
<img class="img-responsive" src="../static/img/blog/cookie_block/ms_payment_fail.png" alt="Cookie error in Microsoft office checkout" />
<p class="img-caption">It is not possible to buy Office without allowing third-party cookies.</p>
### Pay with your Cookies
It is common practice for E-Commerce sites to embed payment systems from third-party vendors, such as Paypal, on their checkout pages. Such widgets should not require third-party cookies - usually the user can be redirected to pay at the payment provider's site. This method is preferable, as it reduces the chances of phishing: loading the payment page as a first party will make the url and certificate status visible, and only prompting users to enter payment information on the first party site is also good practice.
Despite this, we see examples of payment being blocked when third-party cookies are disabled. One such example is on the German E-Commerce site [Thomann.de](../websites/thomann.de.html). When attempting to checkout with Amazon pay, we get an error mentioning that third-party cookies are being blocked:
<img class="img-responsive" src="../static/img/blog/cookie_block/amazon_pay_thomann.png" alt="Amazon Pay error on Thomann.de" />
<p class="img-caption">"There was an error processing the Amazon payment. A possible cause is third-party cookie blocking."</p>
### Connect with Google? Third-party cookies required
Many sites use Google's connect SDK, to allow users to login to sites with their Google account. When testing cases on [www.tripadvisor.com](https://www.tripadvisor.com) and [www.stumbleupon.com](https://www.stumbleupon.com) with third-party cookies disabled, the 'Connect with Google' button fails to do anything when clicked. Both these sites also offer Facebook login too which works with cookies disabled. It is not clear why the Google implementation requires third-party cookies to be allowed.
<img class="img-responsive" src="../static/img/blog/cookie_block/tripadvisor_connect.png" alt="Tripadvisor connect social" />
<p class="img-caption">Tripadvisor signup buttons. </p>
### Please let me track your tracking opt-out
Following GDPR, websites using third-party services which collect data about users must [acquire consent](./update_jun_2018.html) for this, as well as provide a reasonable way of opting-out of data collection and processing. While many publishers have converged on a solution which [gathers consent as a first-party cookie](https://iabtechlab.com/standards/gdpr-transparency-and-consent-framework/) which can then be passed to third-parties, others still rely on an older system of setting opt-out cookies for each vendor. Obviously, if third-party cookies are blocked, this mechanism will not work, as can be seen on the [Telegraph](../websites/telegraph.co.uk.html):
<img class="img-responsive" src="../static/img/blog/cookie_block/telegraph_optout.png" alt="Cookie opt out on telegraph.co.uk" />
<p class="img-caption">"You browser is currently blocking 3rd party cookies ... you will need to enabled 3rd party cookies if you want all of the opt-outs on this page to work."</p>
In this case, users with third-party cookies disabled will be denied their right to opt-out (though blocking these cookies will effectively prevent a large proportion of tracking).
Third-party vendors may say that this mechanism is required in order to remember a user's consent settings. However, previous attempts to allow browsers to convey tracking consent explicitly to servers, via the ['Do Not Track'](https://www.w3.org/TR/tracking-dnt/) standard were killed by the same vendors collectively saying they would [ignore this signal](https://blogs.harvard.edu/doc/2015/09/23/how-adtech-not-ad-blocking-breaks-the-social-contract/).
## Conclusion
When the idea of cookies was first proposed, the standard writers were concerned about the privacy implications of allowing third-party cookies, and specified that browser vendors should disable them by default. Fast-forward 20 years and the majority of browsers on the web will allow all third-party cookies. The results of this are significant challenges to protect against Cross-site request forgery, with countless sites and accounts compromised along the way, and pervasive privacy invasion in the form of cross-site tracking of users.
We argue that we should aim to return to a web where third-party cookies are blocked by default, and are making that possible for users of our anti-tracking technology in [Cliqz](https://cliqz.com/) and [Ghostery](https://www.ghostery.com/), however this is made difficult by the prevailing assumption that cookies are a free-for-all, making many sites fail to function properly in this environment. In this regard we are constantly improving heuristics to mitigate the breakage issues we do find.
We showed multiple cases where the assumption that third-party cookies will be allowed leads to both benign and potentially dangerous issues for users who block cookies. Some of these cases affect payments, so perhaps if cookie-blocking becomes more common and companies' bottom lines are affected these issues will be fixed. This is a chicken and egg problem though: if the web is broken for users blocking cookies, then we may never achieve the critical mass required to get it fixed.
**For users**, getting control over which cookies your browser sends out, and to whom, is a key part of protecting privacy online, but also something that is not universally recognised by
browser privacy tools. Most adblockers, for example, do nothing to the cookies of third-party requests which are not on their blocklists. More adoption of the kind of cookie blocking that Cliqz and Ghostery do would help us to achieve this critical mass, and push more websites to ensure that their services still work correctly for users who choose more private browser configurations.
**Developers** have a part to play here too. By building services which do not require third-party cookies, or at least continue to function without them, it becomes easier for users to turn off third-party cookies, and the web becomes more privacy-friendly. As we have seen in this article, even the biggest tech companies are currently failing at this, but this seems to be more due to a lack of awareness, than any difficulty in implementation.
[^1]: Disclosure: WhoTracks.Me is operated by Cliqz.
[^2]: Note that both of these methods also have some privacy issues. First-party redirection has been exploited for [user tracking](https://brave.com/redirection-based-tracking/), and OAuth dialogs can trick users into granting [many more permissions](https://lifehacker.com/how-to-revoke-pokemon-go-s-extensive-permissions-to-you-1783466118) than they actually need.
================================================
FILE: blog/cookie-consent.md
================================================
title: Improving Cookie Consent
subtitle: Cliqz' new feature to make consent fairer
author: privacy team
type: article
publish: True
date: 2019-11-28
tags: blog, gdpr, consent
header_img: blog/autoconsent/cookie-blocker-prompt.png
+++
Since the GDPR came into force in May last year, the Cookie-Consent Popup has become a fixture of browsing the web. These popups are ostensibly there to allow you to choose whether you agree or disagree to your data being used for certain purposes on the site, but confusing UI design and tricks mean that many users are not able to select their desired consent settings. A recent [study](https://arxiv.org/pdf/1909.02638.pdf) showed that user fatigue with consent popups, and simple UI tricks are able to artificially inflate the opt-in rate. The study also showed that, when opt-out is the default, only 0.1% of users would consent to all data processing. This is in stark contrast to the over 90% opt-in rate that the [industry claims](https://www.thedrum.com/news/2018/07/31/over-90-users-consent-gdpr-requests-says-quantcast-after-enabling-1bn-them), and uses to justify that users are OK with tracking.
How can we restore balance to this situation, and allow users a fair choice about how their data is used? At Cliqz we have been developing a new feature to aim to address the difficulty of denying consent based around 3 core principles:
1. Opt-out and opt-in should both require a maximum of one click, i.e. the time-cost should be the same, no matter which choice is made.
2. The user should not have to decide individually for every site. Their default choice can be used to give consent after their initial decision.
3. Consent banners only offering an 'OK' or 'Allow' option do not allow user choice. They are at best a distraction for the user, and at worst drive consent fatigue and encourage the bad practice of automatically clicking away message prompts. These should be hidden.
Unfortunately, implementing an automated consent choice in the browser is made challenging by the lack of adoption or adherence to browser standards. The [Do Not Track](https://www.w3.org/blog/2018/06/do-not-track-and-the-gdpr/) standard enables users to broadcast preferences around tracking, and for sites to communicate tracking status to the browser. Before that, the [P3P Project](https://www.w3.org/P3P/) attempted to standardise privacy practices and allow automated decision making around them. Both of these standards have been rejected by the tracking industry, who prefer to present consent on their terms. The industry have instead proposed and implemented the [Transparency and Consent Framework](https://iabeurope.eu/transparency-consent-framework/), which primarily focuses on communicating consent between vendors. It is a read-only API, so the browser can only read the consent status as set by the site, and not modify it. This means that consent can currently only be expressed by clicking through HTML forms.
<img class="img-responsive" src="../static/img/blog/autoconsent/cookie-blocker-before.gif" alt="Navigating a Cookie-Consent Popup manually" />
<p class="img-caption">Navigating a Cookie-Consent Popup manually.</p>
Luckily, the number of vendors offering consent solutions is limited, and browser extensions can simulate clicking through forms. Thus, [autoconsent](https://github.com/cliqz-oss/autoconsent) was born - a library of rules standardising the navigation of consent forms for the most popular sites and vendors. This library is able to:
* Detect the presence of supported Consent Management Providers on a page.
* Determine whether a popup or overlay is being shown on the page.
* Execute an opt-in (allow all purposes) or opt-out (reject all purposes).
* Where available, re-open the popup to allow modification of the settings.
In practice, this allows consent popups to be rapidly dismissed when loading a new site. The speed depends on the provider and how quickly their UI can be manipulated. In all cases, however, this is faster than a user could navigate the interface.
<img class="img-responsive" src="../static/img/blog/autoconsent/cookie-blocker-after.gif" alt="Automatic navigation of the Cookie-Consent Popup" />
<p class="img-caption">Automatic navigation of the Cookie-Consent Popup.</p>
For popups that are informational only, or force affirmative consent, we apply simple cosmetic rules. These are CSS rules that define elements in the page that should be hidden. As with the consent rules, we benefit from the de facto standardisation of tools for displaying popups, such that a small number of rules can support the majority of popups shown by websites.
These elements combined mean that we now just have to ask the user once whether they want to opt-in or opt-out, then they will not be bothered by consent popups on the majority of sites they visit. At the same time, they will signal to these sites their approval or disapproval of their data collection practices.
This signal of non-consent is important to encourage and incentivise a shift in data usage practices on the web. When sites realise they cannot just trick users into allowing invasive data collection, they will have a strong incentive to change the way they operate and respect users more.
The new Cliqz Cookie-Popup blocker is available in the latest version of the Cliqz browser. Get it at [cliqz.com](https://cliqz.com/download).
================================================
FILE: blog/cookies.md
================================================
title: Cookies
subtitle: A small piece of data sent from a website, meant to 'help', used to track.
author: privacy team
type: primer
publish: True
date: 2017-07-22
tags: primer, tracking
header_img: blog/blog-cookies.jpg
+++
An HTTP cookie (also called web cookie, Internet cookie, browser cookie, or simply cookie) is a small piece of data sent from a website and stored on the user's computer by the user's web browser while the user is browsing. Cookies were designed to be a reliable mechanism for websites to remember stateful information (such as items added in the shopping cart in an online store) or to record the user's browsing activity (including clicking particular buttons, logging in, or recording which pages were visited in the past). They can also be used to remember arbitrary pieces of information that the user previously entered into form fields such as names, addresses, passwords, and credit card numbers.
Other kinds of cookies perform essential functions in the modern web. Perhaps most importantly, authentication cookies are the most common method used by web servers to know whether the user is logged in or not, and which account they are logged in with. Without such a mechanism, the site would not know whether to send a page containing sensitive information, or require the user to authenticate themselves by logging in. The security of an authentication cookie generally depends on the security of the issuing website and the user's web browser, and on whether the cookie data is encrypted. Security vulnerabilities may allow a cookie's data to be read by a hacker, used to gain access to user data, or used to gain access (with the user's credentials) to the website to which the cookie belongs (see cross-site scripting and cross-site request forgery for examples).[[1](http://news.cnet.com/8301-10789_3-9918582-57.html)]
## Tracking Cookies
The tracking cookies, and especially third-party tracking cookies, are commonly used as ways to compile long-term records of individuals' browsing histories – a potential privacy concern that prompted European [[2](http://webcookies.org/faq/#Directive)] and U.S. lawmakers to take action in 2011. European law [[3](http://www.bbc.co.uk/news/technology-12668552)] requires that all websites targeting European Union member states gain "informed consent" from users before storing non-essential cookies on their device.
The excerpt above has been retrieved from [wikipedia](https://en.wikipedia.org/wiki/HTTP_cookie).
#### References:
[1] [Gmail cookie stolen via Google Spreadsheets](http://news.cnet.com/8301-10789_3-9918582-57.html) <br>
[2] [What about the "EU Cookie Directive"?](http://webcookies.org/faq/#Directive) <br>
[3] [New net rules set to make cookies crumble](http://www.bbc.co.uk/news/technology-12668552) <br>
[4] Source: [Wikipedia](https://en.wikipedia.org/wiki/HTTP_cookie)
================================================
FILE: blog/dexie_transaction_bug.md
================================================
title: A quantum bug in Firefox Quantum
subtitle: DevTools - how we tracked down an observer-dependent bug.
author: privacy team
type: article
publish: True
date: 2019-04-30
tags: blog
header_img: blog/dexie_transaction_bug/release.png
+++
Summary: _When observing your own program can change its behavior, and an unexpected but real use-case to detect if DevTools are open._
---
Occasionally one comes across weird bugs; some might even call them mystical, or heisenbugs. We recently stumbled upon one such bug while working on the [Cliqz Extension](https://cliqz.com/en/).
It started with a rather innocent warning in our browser console. While testing an upcoming release, we noticed some warnings emitted from the [Dexie.js](https://dexie.org/) library, which is a popular wrapper for IndexedDB, a browser database API:
Unhandled rejection: r@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:25586
_promise/i<@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:2:1746
U@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:6115
q@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:5934
_promise@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:2:1720
_trans@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:25312
_idbstore@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:25632
get@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/vendor/dexie.min.js:1:25747
getTag/<@moz-extension://a4671cfd-e7e2-4264-9f96-b21f9b289cd9/modules/webextension-specific/app.bundle.js:15679:35
…
At first, it did not look too serious, but after digging in a bit more, it turned out that core functionalities of both the Cliqz and [Ghostery extensions](https://www.ghostery.com/) were negatively impacted. Each time the extension attempted to aggregate some statistics for display on our FreshTab page, the operation would mysteriously fail. And this is how the investigation started...
We quickly noticed that the bug could neither be reproduced on the [Cliqz Browser](https://cliqz.com/en/) nor on Firefox 66 (which is at this time the stable release). Only Firefox 67 (developer edition) and Firefox 68 (Nightly) were affected. Further testing confirmed that the issue could be reproduced with previous versions of the Cliqz and Ghostery extensions (which share part of the code base). At this point we suspected a bug in Dexie or IndexedDB and started looking deeper.
We realized that whenever a Dexie operation was attempted as part of a transaction, the operation would fail, a warning would be displayed, and the transaction would abort. With that insight, we were able to create a first workaround by avoiding the use of transactions. Still, it was not clear why the code would fail on Firefox 67 and simply avoiding transactions without understanding the root cause was not satisfactory. For this reason, we tried to narrow it down further. A first attempt to reproduce the problem with a minimal extension and the same type of transactions was not successful. All database operations succeeded normally; the issue seemed to happen only as part of the Cliqz extension.
Then we started trimming down our bundle by removing code from the Cliqz extension, step by step. We had the feeling that it had to be a kind of timing issue, but it was not clear which conditions were necessary or sufficient to trigger the bug. It took us time to realize, but finally we had the Eureka moment! It seemed that the transactions would fail only when DevTools were open! This was literally a heisenbug, only triggering when observed in the console. Not opening the console would be enough to make the code work as expected. In other words, if the code ran fast enough that we did not have time to open the debugger, everything would work perfectly.
It was a race condition indeed, but not in the traditional sense of the term, rather it was a race between the developer and the code...
With that understanding, we were able to create a minimal example and could file proper bug reports for both Firefox and Dexie.js:
* [Bug 1545400 - Webextension console dev-tools opened makes IndexedDB/Dexie transaction fail](https://bugzilla.mozilla.org/show_bug.cgi?id=1545400)
* [Dexie #831 - Transaction fails if console dev-tools opened (Firefox 67)](https://github.com/dfahlander/Dexie.js/issues/831)
Detecting whether DevTools are open
===
Let us take a closer look at how this bug can be used to run code only when DevTools are opened. Ideally, a browser should not allow that, although it is difficult to prevent it in practice. Some discussion on that topic can be found in [this Chromium issue](https://bugs.chromium.org/p/chromium/issues/detail?id=672625). This has been exploited in the past by some websites to hide malicious behavior from the eyes of developers or users; whenever DevTools would be opened, the site would conceal some of the logic immediately, preventing inspection.
Most existing techniques to detect DevTools are leveraging browser bugs, which eventually got patched (examples can be found [here](https://stackoverflow.com/q/7798748/783510) and [here](https://github.com/sindresorhus/devtools-detect/issues/15)). Typically, they use quirks in various Browser APIs, but there are also solutions based on timing attacks.
Another approach is to look at the window size, which is currently used in the [devtools-detect library](https://github.com/sindresorhus/devtools-detect). The drawback is that it can be easily bypassed if the DevTools are opened in a separate window.
Using the transaction bug, we can build a DevTools detector for Firefox 67 and above that will also work for undocked windows. Here is a sketch of the idea:
function onDevToolsOpen() {…}
function onDevToolsClosed() {…}
const db = new Dexie('test_db');
db.version(1).stores({ test_table: 'test_key' });
setInterval(() => {
// uses the bug that the Dexie transaction will fail
// with open DevTools in Firefox 67 and above
db.transaction('rw', db.test_table, async () => {
await db.test_table.toArray()
}).then(onDevToolsOpen, onDevToolsClosed);
}, 1000);
If you want to try yourself, a working example is available [here](https://cdn.cliqz.com/browser-f/fun-demo/firefox_devtools.html).
Until a patch for the bug gets released, the technique will work in pages as well as in web extensions. In the [bug ticket](https://bugzilla.mozilla.org/show_bug.cgi?id=1545400), we also included a working example that detects DevTools from within a WebExtension.
This issue joins an array of techniques which enable malicious actors to detect special states of the browser, such as private browsing mode, and when users are [auditing the activities of sites with DevTools](https://github.com/gorhill/uBO-Extra#purpose). The former has been used to [deny access to sites in private browsing mode](https://arstechnica.com/information-technology/2017/05/boston-globe-website-no-longer-lets-you-read-articles-in-private-mode/), attempting to force users to reduce their protection against tracking, while the latter is used by [systems which circumvent adblocker to cover their tracks](https://www.theregister.co.uk/2017/08/11/ad_blocker_bypass_code/).
================================================
FILE: blog/fingerprinting.md
================================================
title: Fingerprinting
subtitle: Let me tell you what's unique about your device.
author: privacy team
type: primer
publish: True
date: 2017-07-22
tags: primer, tracking
header_img: blog/blog-fingerprinting.jpg
+++
A device fingerprint or machine fingerprint or browser fingerprint is information collected about a remote computing device for the purpose of identification. Fingerprints can be used to fully or partially identify individual users or devices even when [cookies](/blog/cookies.html) are turned off.
Basic web browser configuration information has long been collected by web analytics services in an effort to accurately measure real human web traffic and discount various forms of click fraud. With the assistance of client-side scripting languages, collection of much more esoteric parameters is possible [[1](http://browserspy.dk/)]. Assimilation of such information into a single string comprises a device fingerprint. In 2010, EFF measured at least 18.1 bits of entropy possible from browser fingerprinting, [[2](https://panopticlick.eff.org/static/browser-uniqueness.pdf)] but that was before the advancements of canvas fingerprinting, which claims to add another 5.7 bits.
Prior to early 2017, device fingerprinting was limited to single browsers. If a user switched browsers regularly, fingerprinting could not be used to link the user to these browsers [_citation needed_]. A cross browser fingerprinting method has been published [[3](http://yinzhicao.org/TrackingFree/crossbrowsertracking_NDSS17.pdf)] which allows tracking of a user across multiple browsers on the same device.
The excerpt above has been retrieved from [wikipedia](https://en.wikipedia.org/wiki/Device_fingerprint).
## References
[1] [Browser Spy](http://browserspy.dk/) <br>
[2] EFF: [How Unique Is Your Web Browser?](https://panopticlick.eff.org/static/browser-uniqueness.pdf) <br>
[3] [(Cross-)Browser Fingerprinting via OS and Hardware Level Features](http://yinzhicao.org/TrackingFree/crossbrowsertracking_NDSS17.pdf) <br>
[4] Source: [Wikipedia](https://en.wikipedia.org/wiki/Device_fingerprint)
================================================
FILE: blog/gdpr-what-happened.md
================================================
title: GDPR - What happened?
subtitle: The tracking landscape post GDPR, adverse effects on competition and a market for compliance technologies
author: privacy team
type: article
publish: True
date: 2018-09-03
tags: blog, gdpr
header_img: blog/gdpr/gdpr-header.png
+++
_In this article we look at the effect GDPR has had on the tracking landscape,
online advertising in Europe, and provide a set of recommendations for machine
readable legislation._
## GDPR: A primer
Having been a hot topic of discussion for at least the last 2 months,
it is unlikely that GDPR needs an introduction. So in brief:
the regulation applies to the processing of personal data of European
citizens. Companies engaged in such processing activities
are subject to compliance, regardless of whether or not they operate in the
EU[^1]. An important aspect of the regulation is the pressure
put on companies to obtain consent from their European users for the
processing of their personal data. This is why prior to May 25th,
2018 there was a surge in emails in your inbox, and it's also the
reason for the numerous popups you see when visiting websites.
GDPR was announced two years before it came into force, and
it was not the overhaul of privacy legislation one could believe it was from
reading the press. In fact, GDPR came as a major update of the previous
EU Data Protection Directive[^2] which has been around since 1995. Designed
to harmonize the data protection legislation in the EU,
and catch up with technological progress in the last 23 years, GDPR
comes heavily loaded with legal language that many have found hard
to navigate. Daphne Keller of the Stanford Center for Internet and Society said
"*The final GDPR text is riddled with ambiguous passages*", suggesting
that the ones who will benefit the most from GDPR are data protection
lawyers[^3].
**So, what happened?** This is the question we are all left with.
We will be using whotracks.me data to make sense of the effect of
GDPR on the tracking landscape in the web and on online advertising,
the behemoth of third party services on the web.
## Tracking Landscape on Websites
We take 2000 [websites](../websites.html) profiled on **whotracks.me**
and compare the tracking landscape in these sites as a function of the
origin of the users visiting. We want to compare the EU, subject to GDPR,
with the US.

Merely looking at the average number of trackers per page for each
category of site being visited reveals a general downward trend
in Europe. The opposite is true in the US. The blue area indicates
the average number of trackers across categories.
In fact, if we look closely, since April 2018 the average number of
trackers per page in the EU has dropped by almost 4% while in
the US it has increased by 8%.

If we take the top 2000 domains visited by European residents, and
check how the average number of trackers per page varies by the category
of the website, we notice that there has been a reduction in the
number of third parties almost across the board.

The reduction seems more prevalent among categories of sites with a
lot of trackers. We see a 7.5% reduction in the average number of
third parties per page from April to July in News websites. This is in
the same direction as what was identified by a study published last
month by the University of Oxford. They looked at news websites,
and found that the number of cookies set on page decreased
on average by 22%[^4].
Some websites, like [The Los Angeles Times](../websites/latimes.com.html), interrupted
their operations in Europe, others decided to offer text only versions
of their websites if the user does not consent to sharing data with third
parties, like [npr.org](../websites/npr.org.html). What we are certainly
observing is a rise in the usage of consent management tools, which
we [wrote about in more detail](./update_jun_2018.html) back in June.
## Third party services: the winner takes it all
Both the most lucrative, and the most pervasive of all services performed
by third parties is online advertising - the 'fuel' that keeps a large part
of the web running.
Online advertising in 2018 is estimated to be a $270 billion[^5]
market, and expected to grow by more than 20% in the next two years.
This is the market that third parties are competing for, and when
the prize is so high, worries over GDPR having regressive effects
on competition are understandable. As in most markets, the presence
of monopolies is something regulators try hard to avoid[^6]. The question
is then: ***Has GDPR, designed to enhance user privacy in the web,
had any adverse effects on competition?***
At present, whotracks.me has profiles for more than 1000 trackers, out of which
about **200** are classified as advertising services. For each of these
trackers we have data on what percentage of the measured web traffic we
have observed the tracker to be loaded - `reach`[^12], as well as what
percentage of websites the tracker is present on - `site_reach`[^13].
Monitoring `reach` and `site_reach` gives us interesting insights into the
structure of the market these trackers operate in, as well as their relative market share.
Using whotracks.me data, we can do that at scale. Each month we have on average
about 300 million page loads and more than half a million websites.
If we rank each tracker by its reach, and measure changes in reach since April,
we notice that in Europe, most advertisers appear less.

The same trend persists when we look at `site_reach`.

Google's advertising services have maintained their market share, while
other advertisers across the board have lost reach. There could be several
reasons to explain Google's favorable state post GDPR:
1. **Resources thrown at compliance**: Google and other big companies have had
significant resources dedicated to compliance[^7].
2. Google acts in the capacity of a gatekeeper, hence it is
conceivable to assume it may have used that position in punitive ways. Reports
indicate that Google could have encouraged publishers to reduce the number of AdTech
vendors[^8].
3. Website owners trying to minimize their exposure opt for 'safer choices',
dropping smaller advertisers that may have a harder time proving compliance.
Using a tracker's reach as a proxy for market share, we measure that GDPR
may have had regressive effects on competition in the online advertising
space in Europe.
## Recommendations for GDPR 2.0
GDPR has had a measurable impact on advancing the rights of European
citizens on the web. We believe one of the most important contributions
of GDPR is the increased transparency on how personal data is moved around,
as well as the management of consent from the user, and think GDPR 2.0
should strive to be **machine-readable** as opposed to **human-readable**.
GDPR came as an update to the EU Data Protection Directive, primarily because the
evolution of the web in the last 20 years rendered the old regulation obsolete.
**Human Readable** is not the way to design a law that aims in large part to regulate
interaction in the web. The design of the tools meant to empower
users is left to the service providers, whose incentives don't exactly align
with those of the users. This can give rise to deceptive interfaces and UX patterns,
designed to exploit human cognitive biases.
What we need, is a GDPR 2.0 that pushes for *machine readable* standards,
giving rise to user-focused solutions, simple, non-deceiving interfaces, thus
creating an industry of privacy and compliance, where technologists
keep other technologists in check.
Here are our recommendations:
1. **`/privacy-policy.txt`** - require websites to host the privacy policy
in a standard location of the sitemap. At present, identifying the location
of the privacy policy of a website is not as straightforward as one
may hope. Last year as part of the Mozilla Global Sprint we built
[Privacy Bot](https://github.com/cliqz-oss/privacy-bot), which aimed to
gather, persist and analyze privacy policies. One of the challenging
problems we had to solve, was identifying where the privacy policy was hosted.
2. **`/third-parties.json`** - provide a structured list of third parties
present on the site, the service being performed by them, list of data points
they have access to (e.g. IP, user agent, pages visited on site ... ), and
default state of consent. This would enable browsers to assume the role they should
have had baked in: a unified control center for the user to
manage consent. This is especially important given the rise of deceiving UX patterns
we are increasingly used to seeing in websites these days[^9]. There are
standards that can be built upon, like the Content Security Policy, and the
Do Not Track Standards, which are widely adopted by browser vendors.
A similar effort is `/ads.txt`, initiated by the iab techlab, aiming towards
a mechanism to define authorized sellers for web content from the perspective
of the domain owner[^10].
3. **`/dpo.json`** - increase oversight of the Data Protection Officer,
detaching the role further away from the organization. Provide machine
parsable details of the DPO, for users to be able to reach out more easily,
as well as providing incentives for the establishment of a new market
around privacy management.
4. **`/incidents-and-cases.json`** - Data Incidents reported have increased
as a consequence of GDPR[^11]. This information should also be made available
to the public. The web is currently built and operated largely on trust. As such,
transparency over the number of incidents a given website or service has had
to report is very important. Furthermore, provide a list of the open
court cases involving the mismanagement of personal data the company
is involved in.
## Conclusions
In Europe, GDPR has thus far had a measurable impact in reducing the average
number of trackers websites put in their pages, while in the US the opposite is
true. The increase in transparency benefits users as they enjoy an increased
control over their data, but the UX of the services managing that consent
does not always have users' best interest in mind.
On the other hand, in Europe GDPR has led the online advertising market to become more
concentrated, as the majority of advertisers lose market share. If this trend persists,
it is possible that GDPR is having adverse effects on competition. For users this
means that while the number of third parties asking for access to their
data is decreasing, a tiny few are getting more of their data.
To Regulators, and especially the supervising authorities responsible
for the enactment of GDPR, we think you should strive for creating incentives
for industry players to keep each other in check, thereby creating a market for
privacy. The only way this can be achieved is by pushing for a machine readable
legislation that enforces standards.
#### Footnotes
[^1]: GDPR Article 3 on [Territorial Scope](https://gdpr-info.eu/art-3-gdpr/)
[^2]: EU Data Protection Directive [[source]](https://en.wikipedia.org/wiki/Data_Protection_Directive)
[^3]: GDPR is vague [Stanford Center for Internet and Society](http://cyberlaw.stanford.edu/blog/2015/12/final-draft-europes-right-be-forgotten-law)
[^4]: News Websites post GDPR [[Factsheet]](https://reutersinstitute.politics.ox.ac.uk/our-research/changes-third-party-content-european-news-websites-after-gdpr)
[^5]: Digital Advertising Market [[Statista]](https://www.statista.com/statistics/237974/online-advertising-spending-worldwide/)
[^6]: Google's Anti-trust cases in Europe [[source]](https://www.reuters.com/article/us-eu-google-antitrust-timeline/googles-antitrust-cases-in-europe-idUSKBN1K81CC)
[^7]: Preparation for GDPR [[source]](https://www.theguardian.com/technology/2018/may/25/facebook-google-gdpr-complaints-eu-consumer-rights)
[^8]: Google's Funding Choices [[source]](https://adexchanger.com/online-advertising/googles-gdpr-consent-tool-will-limit-publishers-to-12-ad-tech-vendors/)
[^9]: Dark Patterns: How UX design tricks you into giving away your privacy [[source]](https://cliqz.com/en/magazine/dark-patterns-how-ux-design-tricks-you-into-giving-away-your-privacy)
[^10]: IAB Techlab `ads.txt` [[source]](https://iabtechlab.com/wp-content/uploads/2017/09/IABOpenRTB_Ads.txt_Public_Spec_V1-0-1.pdf)
[^11]: Reported Data Incidents increase with GDPR [[source]](https://www.itgovernance.co.uk/blog/ico-statistics-show-increase-in-reported-incidents-ahead-of-gdpr/)
[^12]: `reach`: Proportional presence across all page loads (i.e. if a tracker is present on 50 out of 1000 page loads, the reach would be 0.05). Value is a float between 0 and 1.
[^13]: `site_reach`: Presence across unique first party sites. e.g. if a tracker is present on 10 sites, and we have 100 different sites in the database, the site reach is 0.1. Value is a float between 0 and 1.
================================================
FILE: blog/generating_adblocker_filters.md
================================================
title: Generating Ad-Blocker filters from whotracks.me data
subtitle: Let's never miss a new tracker again.
author: remusao
type: article
publish: True
date: 2017-11-20
tags: privacy, tracking, adblocking
header_img: blog/blog-generate-adb-filters.jpg
redirect_url: https://www.ghostery.com/blog/generating-adblocker-filters
+++
*TL;DR* In this post we see how to:
1. Load the data from [whotracks.me](https://github.com/ghostery/whotracks.me) to get access to trackers' information
2. Create a mapping from tracking categories to list of domains
3. Filter each domain based on the amount of tracking of each *app*
4. Generate a filter list for each category
The full source code used in this article can be found on the [Github repository](https://github.com/ghostery/whotracks.me/blob/master/contrib/generating_adblocker_filters.py).
Most popular content blockers are using filter lists to decide what requests
leaving the browser should be blocked. In this regard, filter lists act as a
privacy *ground truth*: deciding what is safe, and what is not safe for users.
It means that your privacy protection is only as good as the filters you are
using. The community is doing an amazing job, but still there can be gaps in
your protection; one such situation is when a new tracker appears.
With the right data, updated regularly, we believe it is possible to
build powerful tools to help increase users' privacy. Knowing more about
trackers, in real time, allows us to provide better anti-tracking but can also
*help* the tedious process of curating the filter lists.
In this post we'd like to demonstrate how we can make use of the
open-sourced [whotracks.me](https://whotracks.me) data to automatically
generate *up-to-date*, *per-category*, filter lists supported by the
most popular ad-blockers out there. Leveraging this data can improve
user experience and make maintaining the lists easier.
In the future, we can imagine generating per-country lists as well,
in the spirit of the different [easylists](https://easylist.to/) already
in existence:
* `DEU: Pornvertising blocking Germany`
* `DEU: Site_Analytics blocking Germany`
* ...
* `FR: Site_Analytics blocking France`
They could also be dispatched in the already existing lists such as
`advertising`, `privacy`, etc. Another option could be to use this as a
tool to help maintainers keep an eye on the ecosystem, allowing them to
learn about new trackers in real time.
Let's get started!
## Loading the data
The first step is to install the `whotracksme` package, available on [PyPI](https://pypi.python.org/pypi/whotracksme)
and [Github](https://github.com/ghostery/whotracks.me). You can get started by
installing `whotracksme` with `pip`:
```sh
$ pip install whotracksme
```
We start by loading the tracker-related data from [trackerdb.sql](https://github.com/ghostery/whotracks.me/blob/master/whotracksme/data/assets/trackerdb.sql), using the
helper function found in the `whotracksme.data` module:
```python
from collections import defaultdict
from whotracksme.data import load_tracker_db
# Categories to tracker domains
tracker_domains_per_category = defaultdict(list)
# Keep track of normalized "app" name for each tracker domain. A given "app"
# such as "doubleclick" can use several domains: 2mdn.net, doubleclick.net, etc.
tracker_domains_to_app = {}
# Load trackers and group them by category
sql_query = """
SELECT categories.name, tracker, domain FROM tracker_domains
INNER JOIN trackers ON trackers.id = tracker_domains.tracker
INNER JOIN categories ON categories.id = trackers.category_id;
"""
# Each row is (category name, tracker id, domain); fill both lookups in one pass.
with load_tracker_db() as connection:
    for (category, tracker, domain) in connection.execute(sql_query):
        tracker_domains_per_category[category].append(domain)
        tracker_domains_to_app[domain] = tracker
```
Here is a sample of what we get in `tracker_domains_per_category`. Note that if
you run the same script, you might get slightly different results as the
data is being constantly updated:
```python
# Sample content of `tracker_domains_per_category` (`...` marks elided domains)
defaultdict(list, {
    'advertising': [
        'doubleclick.net',
        ...
    ],
    'audio_video_player': [
        'soundcloud.com',
        ...
    ],
    'cdn': [
        'googleapis.com',
        ...
    ],
    'comments': [
        'disqus.com',
        ...
    ],
    'customer_interaction': [
        'zendesk.com',
        ...
    ],
    'essential': [
        'googletagmanager.com',
        ...
    ],
    'extensions': [
        'kaspersky-labs.com',
        ...
    ],
    'hosting': [
        'amazonaws.com',
        ...
    ],
    'misc': [
        'linkedin.com',
        ...
    ],
    'pornvertising': [
        'pornhub.com',
        ...
    ],
    'site_analytics': [
        'google-analytics.com',
        ...
    ],
    'social_media': [
        'twitter.com',
        ...
    ]
})
```
## Filtering based on tracking behavior
It is tempting to generate filters for each domain loaded so far, but it
would be very aggressive. Indeed, some domains identified as potential
trackers might in fact not send [unsafe identifiers](https://whotracks.me/blog/what_is_a_tracker.html) (or not a lot). For example
[createjs](https://whotracks.me/trackers/createjs.html) is not using any
*fingerprinting* and does not seem to be doing tracking via *cookies*,
hence, it should not be blocked systematically.
Fortunately, we can make use of the data from [apps.json](https://github.com/ghostery/whotracks.me/blob/master/whotracksme/data/assets/apps.json) to learn
more about each tracker. An *app* is an entity which can contain several
domains (e.g.: *doubleclick* is an *app* for which we identified three
domains: `2mdn.net`, `invitemedia.com` and `doubleclick.net`). We also
provide information about companies to which each app belongs, but we
will leave the exploration of this data for another article.
```python
import json
from whotracksme.data import load_apps
# Load the per-app data; the article describes the result as a dictionary
# keyed by app id (e.g. `google_analytics`).
apps = load_apps()
```
`apps` is a dictionary with keys being *app ids* (e.g.: `google_analytics`)
and values containing all we know about each *app*. Let's take an example:
```json
apps["google_analytics"]
{
"overview": {
"bad_qs": 0.4377430033329568,
"content_length": 14771.492718357234,
"cookies": 0.0015869678941083753,
"https": 0.7507222054912428,
"id": "google_analytics",
"reach": 0.44292899275150094,
"requests": 3.834100333790446,
"requests_tracking": 1.202157901660253,
"site_reach": 0.616005569531587,
"tracked": 0.4383474801843971
},
"history": ...,
"rank": ...,
"sites": ...
}
```
That's a lot of data, and we plan to release a more complete
documentation about what all this is about soon. For now let's just say
that everything is already made accessible on the website, in form of
nice graphs and aggregations!
For our use-case, we will only consider the field: `tracked`. It
represents the proportion of page loads including *app*, identified as
performing some form of tracking (using either identifying *cookies*
or *fingerprinting*). In the case of `google_analytics`, it means that
out of 100 page loads where `google_analytics` was present, tracking
occurred 44 times.
Before generating the filter list, let's keep only *apps* tracking
users more than `10%` of the time. Please note that finding the right
threshold would require some finer analysis, and could depend on the
application.
```python
def filter_domains(domains):
    """Yield the domains whose app tracks users on at least 10% of page loads.

    Relies on the module-level `tracker_domains_to_app` mapping (domain -> app
    id) and the `apps` dictionary (app id -> app data) built earlier.

    Args:
        domains: iterable of tracker domains; each must be a key of
            `tracker_domains_to_app`.

    Yields:
        Each domain, in input order, whose app has an entry in `apps` with
        `overview.tracked` >= 0.1. Domains whose app is not in `apps` are
        skipped.
    """
    for domain in domains:
        app_name = tracker_domains_to_app[domain]
        # Not every app has published data, so unknown apps are skipped.
        if app_name in apps:
            tracked = apps[app_name]['overview']['tracked']
            if tracked >= 0.1:
                yield domain
```
We need to check if the *app* exists first because we currently only have the
top 500 hosted on Github. We will host more in the future.
## Generating the lists
We now proceed to generate the filter lists from these domains. They can take
two forms:
* ADB compatible syntax: `||{domain}$third-party`
* Hostname syntax: `127.0.0.1 {domain}`
Note that the second option will probably be too aggressive in a lot of
cases, as it will also block the domain even if they are first-party (e.g.,
`google.com` might get blocked by these rules).
```python
def generate_adb_filters(domains):
    """Yield one ADB-syntax filter per domain, for use in an adblocker.

    Each domain becomes `||{domain}$third-party`, in input order.
    """
    for domain in domains:
        yield f"||{domain}$third-party"
def generate_hostname_filters(domains):
    """Turn each domain into a hostname-syntax rule.

    Lazily yields one `127.0.0.1 {domain}` entry per input domain.
    """
    yield from (f"127.0.0.1 {name}" for name in domains)
# One newline-separated *ADB* filter list per category
adb_filters = dict(
    (category, '\n'.join(generate_adb_filters(filter_domains(domains))))
    for category, domains in tracker_domains_per_category.items()
)
# One newline-separated *hostname* filter list per category
hostname_filters = dict(
    (category, '\n'.join(generate_hostname_filters(filter_domains(domains))))
    for category, domains in tracker_domains_per_category.items()
)
```
Each dictionary now contains a valid adblocking list for each category:
```python
hostname_filters.keys()
```
```python
dict_keys([
'advertising',
'audio_video_player',
'cdn',
'comments',
'customer_interaction',
'essential',
'extensions',
'hosting',
'misc',
'pornvertising',
'site_analytics',
'social_media',
'unknown'
])
```
And here is what we get for example in the `advertising` category:
```python
print(adb_filters['advertising'])
```
```
||doubleclick.com$third-party
||criteo.com$third-party
...
```
And the same domains but as `hostname` filters:
```python
print(hostname_filters['advertising'])
```
```
127.0.0.1 doubleclick.com
127.0.0.1 criteo.com
...
```
To put it in a nutshell, here is what we just did:
1. Load the data from [whotracks.me](https://github.com/ghostery/whotracks.me) to get access to trackers' information
2. Create a mapping from tracking categories to lists of domains
3. Filter each domain based on the amount of tracking of each *app*
4. Generate a filter list for each category
There is so much more we can do with this database. At the moment the
API to load the data is pretty low-level, but it will be improved over
time.
================================================
FILE: blog/google_domains.md
================================================
title: The end of google.{your country}?
subtitle: Google's move to keep their cookies.
author: privacy team
type: article
publish: True
date: 2018-04-23
tags: google, cookies, tracking protection
header_img: blog/google_domains/dataflow.png
+++
There have been [recent reports](https://twitter.com/vtoubiana/status/987365270187634688) that
Google has started redirecting users from regional variants of Google search (served on
google.{de, fr, co.uk, etc}) to google.com for search results. This has implications for the
[4th most prevalent](../trackers/google.html) tracker on the web, so we decided to check
the data to see what is going on.
By looking at WhoTracks.Me data from April, we see that around April 16th there is a shift in
traffic from Google's European search results pages (`www.google.{de,fr,at,co.uk,etc}`) towards
`www.google.com`. The figure below shows that the former domains all saw a 50% drop in number
of page loads over the last week, while `www.google.com` is up 100-150%, suggesting Google
are doing a gradual rollout of this change. These changes lie well outside the bounds of
the weekly traffic fluctuations we usually see.

<p class="img-caption">Percentage change in traffic to google search result pages, April 2018</p>
We can further see the magnitude of this change by focusing on data for Germany. If we look at the
relative proportion of pages loaded on `www.google.de` and `www.google.com` in Germany over the last
month, we see a marked increase, with the share of traffic to `www.google.com` going up from around
5% to over 40%.

<p class="img-caption">Search results pages used in Germany, April 2018</p>
Why is Google doing this? We don't know - we're not aware of any official announcement. However,
one reason for this could be a reaction to increased usage of restrictive cookie settings, such
as allowing cookies only from visited sites, or Apple's [Intelligent Tracking Prevention](https://webkit.org/blog/7675/intelligent-tracking-prevention/).
If a user is rarely visiting the google.com domain, these technologies can expire this cookie earlier,
or prevent its use in third-party contexts.
As `google.com` is the domain used to authenticate with Google services, if the browser sends
`google.com` cookies in third-party context, these visits can be directly attributed to one's
Google profile. Therefore, this change increases the likelihood that the user will have recently
visited `www.google.com`, and therefore Google's tracking can continue uninterrupted.
Tracking from `google.*` domains [reaches 30% of web traffic](https://whotracks.me/trackers/google.html),
and the majority of this reach is contributed by the [`google.com` domain](https://github.com/ghostery/whotracks.me/blob/master/whotracksme/data/assets/2018-03/global/domains.csv#L5).
As, with this change, it is very difficult to avoid visiting the `google.com` domain as a first party,
preventing this tracking in a vanilla browser would require disabling all third-party cookies.
Alternatively, [Cliqz](https://cliqz.com/) and [Ghostery's](https://www.ghostery.com/) AI anti-tracking
technologies block all third-party tracking cookies (Disclosure: the author works on this product).
[Privacy Badger](https://www.eff.org/privacybadger) is also able to block third-party tracking cookies.
================================================
FILE: blog/government_websites_september.md
================================================
title: Government websites
subtitle: If you are not the product, you're the taxpayer
author: privacy team
type: article
publish: True
date: 2018-10-10
tags: trackers, government
header_img: blog/gov_trackers/gov.png
redirect_url: https://www.ghostery.com/blog/government-websites-trackers
+++
_This post is one of our regular monthly blogs accompanying an update to the data
displayed on WhoTracks.Me. In these posts we introduce what data has been added as
well as point out interesting trends and case-studies we found in the last month._
<br />
On WhoTracks.me we typically profile websites where we see presence of tracking.
One new category of site we observed loading trackers this month was government websites.
Government websites act as information portals, allowing citizens to access information
or services from their government. In some cases the use of government sites will be
mandatory, for example services set up for submitting tax or visa information.
Thus, it is concerning that we see third-party tracking appearing on these sites, where
users do not have a choice whether or not they access the service, and are then forced
to hand over data to third-party companies by their governments.

<p class="img-caption">Average number of trackers seen on selected government websites from
the WhoTracks.Me September dataset.</p>
Here's a list of the government websites ending up in this month's release:
<table class="table table-hover">
<thead>
<tr>
<th>Country</th>
<th>Site</th>
<th>Notable trackers</th>
</tr>
</thead>
<tbody>
<tr>
<td>Australia</td>
<td><a href="../websites/bom.gov.au.html">bom.gov.au</a> </td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/doubleclick.html">Doubleclick</a>
</td>
</tr>
<tr>
<td>Europe</td>
<td><a href="../websites/europa.eu.html">europa.eu</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/google.html">Google</a>,
<a href="../trackers/twitter.html">Twitter</a>
</td>
</tr>
<tr>
<td>France</td>
<td><a href="../websites/ants.gouv.fr.html">ants.gouv.fr</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/doubleclick.html">Doubleclick</a>
</td>
</tr>
<tr>
<td>France</td>
<td><a href="../websites/legifrance.gouv.fr.html">legifrance.gouv.fr</a></td>
<td>
<a href="../trackers/at_internet.html">AT Internet</a>
</td>
</tr>
<tr>
<td>France</td>
<td><a href="../websites/impots.gouv.fr.html">impots.gouv.fr</a></td>
<td>
<a href="../trackers/at_internet.html">AT Internet</a>
</td>
</tr>
<tr>
<td>Russia</td>
<td><a href="../websites/zakupki.gov.ru.html">zakupki.gov.ru</a></td>
<td>
<a href="../trackers/yandex.html">Yandex</a>
</td>
</tr>
<tr>
<td>UK</td>
<td><a href="../websites/tax.service.gov.uk.html">tax.service.gov.uk</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/optimizely.html">Optimizely</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/ca.gov.html">ca.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/google.html">Google</a>,
<a href="../trackers/addthis.html">AddThis</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/dhs.gov.html">dhs.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/doubleclick.html">Doubleclick</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/irs.gov.html">irs.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/new_relic.html">New Relic</a>,
<a href="../trackers/lockerz_share.html">AddToAny</a>,
<a href="../trackers/youtube.html">Youtube</a>,
<a href="../trackers/foresee.html">Foresee</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/nih.gov.html">nih.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/doubleclick.html">Doubleclick</a>,
<a href="../trackers/google.html">Google</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/noaa.gov.html">noaa.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/state.gov.html">state.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/google.html">Google</a>,
<a href="../trackers/youtube.html">Youtube</a>,
<a href="../trackers/qualtrics.html">Qualtrics</a>
</td>
</tr>
<tr>
<td>US</td>
<td><a href="../websites/weather.gov.html">weather.gov</a></td>
<td>
<a href="../trackers/google_analytics.html">Google Analytics</a>,
<a href="../trackers/addthis.html">AddThis</a>
</td>
</tr>
</tbody>
</table>
It also surprised us that Germany (where the majority of our contributors reside)
does not appear. A brief check of a few sites like
<a target="_blank" rel="noreferrer" href="https://bundestag.de">bundestag.de</a> and the
<a target="_blank" rel="noreferrer" href="https://bzst.de">Federal Tax Office</a> shows a preference for self-hosted analytics,
such as Matomo, rather than third-party solutions.
Note that, as we do not collect data about pages with no third-party trackers, the data we show here may be biased for sites where sensitive areas do not have tracking. Further study would be required to assess whether the tracking reported here leaks sensitive information when accessing public services. However, the presence of tracking on these pages is enough to leak valuable metadata about citizens to third-party companies. We should be asking if it is acceptable for our governments to expose us to this risk...
================================================
FILE: blog/how_cliqz_antitracking_protects_users.md
================================================
title: How Cliqz anti-tracking protects users
subtitle: Using an algorithmic, data-driven approach to remove unique identifiers that track users.
author: privacy team
type: article
publish: True
date: 2017-07-22
tags: privacy, circle
header_img: blog/blog-anti-tracking.jpg
+++
There are already many tools available which aim to prevent the kinds of tracking we have described. In general these act as browser extensions which monitor network traffic and intervene when tracking is detected, i.e. blocking UIDs in transmission. These can be categorised into two groups:
1. Blocklist-based: These tools use curated blocklists in order to block third parties seen to be tracking. This predominately targets advertising trackers as an extension or side-effect to Adblocking features. Examples include Ghostery, Disconnect, Firefox tracking protection, and uBlock Origin.
2. Heuristic/algorithm-based: These tools use heuristic and/or algorithmic analysis to determine when to block or modify requests. Examples include Privacy Badger and Cliqz’s own anti-tracking system.
Blocklist-based methods have several shortcomings. Firstly, blocking requests is very coarse grained and can easily break site functionality. Overly broad blocking rules may block many requests which are of no privacy risk. On the other side, exceptions made to prevent site breakage may then allow some privacy leaks, for example the Facebook like button is still allowed when using the [EasyPrivacy](https://easylist-downloads.adblockplus.org/easyprivacy.txt) blocking list, and there are [many other](https://easylist-downloads.adblockplus.org/exceptionrules.txt) such exceptions.
<img class="img-responsive" src="../static/img/blog/antitracking_intro/antitracking-comparison-v2.png"/>
Secondly, as these lists are manually maintained, they will always be playing catchup against trackers. Tracking companies can constantly change their URLs, domains and methods to prevent blocking, and those generating the lists must respond in each case.
Finally, blocking lists bestow significant power to their curators. With blocking browser extensions being used ever more ([over 40% of users](https://downloads.pagefair.com/wp-content/uploads/2016/05/Adblocking-Goes-Mainstream.pdf) for some market segments), those who write the block list would have the power to cut off a significant proportion of a company’s traffic — deservedly or not.
Heuristic approaches like Privacy Badger are limited by just having local knowledge. In many cases we will not know if data sent is unique to us until we have tested it in another browser and seen a different value, like in the fingerprinting example in the previous post. Thus some kind of collaboration is required between users to determine what data is safe, and what is not – and this is the method Cliqz’s anti-tracking uses.
# Cliqz Anti-tracking
As we outlined in the previous section, blocklists have several drawbacks, and we did not want such an aggressive system. Likewise, relying purely on local evaluation of whether data is a UID or not has significant limitations. Therefore, we designed a system which combines local with global evaluation of tracking data. It is also designed to be conservative — we only remove data which we determine to be UIDs, and leave the rest alone. Our system modifies request URLs instead of blocking. This aims to reduce site breakage, and enable services to collect data, provided it does not compromise the user’s privacy.
Like existing tools we focus on removing UIDs in transmission, rather than trying to prevent UID generation. Therefore we have three transmission vectors: HTTP Headers, URL Path and Post data. The latter we currently do not handle as our data shows that the reach of this method is very low, however our system allows us to continually monitor the situation, should this change.
The Cliqz anti-tracking system is split into two subsystems. One that handles only Cookies, and the other which handles all other data sent in headers and the URL path.
## Cookie Protection
Protecting from tracking cookies is relatively simple. This is because the vast majority of third-party cookies have no function beyond tracking. Therefore we can very simply strip these from the request without breaking the page.
However, blanket third-party cookie blocking is not an ideal solution, because some third-party widgets do require cookies, to, for instance, authenticate with their service. In order to enable this use case, our system allows cookies in cases when user interaction with the widget is detected. When this happens, the third party is temporarily whitelisted to allow cookies.
This implementation effectively prevents all cookie tracking, and rarely breaks the user-experience on web pages. This is despite the fact that over 96% of third-party cookies are removed.
## Unsafe Data Removal
The second anti-tracking subsystem deals with the non-trivial problem of identifying whether the data sent in a request is ‘safe’ or not. By ‘unsafe’ we mean that the data point could be used as a UID, i.e. it is unique to the user. Once we identify what data is unsafe we remove it from the request before it is sent by the browser. This means that if a tracker tries to aggregate their data using this UID, then all Cliqz users will appear as one, and thus will gain crowd anonymity. The algorithm therefore runs as follows:
When a page is loaded, for each third-party request:
1. Analyse the URL, headers and postdata of the request.
2. Tokenise this data into key-value pairs.
3. Evaluate the safeness of each key-value pair.
4. If there are unsafe values, remove the data from the request.
How can we determine what data is safe or not? A UID is characterised as a value which is unique to a single user, and which is repeatedly seen by this user. Such values can be detected by aggregating the data seen by multiple users over a period of browsing time. However, by the time this aggregation would tell us what the UIDs were, it would be too late — the trackers would already have the data. Therefore, our algorithm does the inverse: Detecting the values which cannot be UIDs, and removing all other data.
The advantage of this method is that the protection for new users is available straight away. New UIDs will not be known by the system, and will therefore be removed by default. Furthermore, the set of safe values is significantly smaller than the set of unsafe values. Safe values will be categorical in nature, and therefore only ever be `O(1)` in size, while the set of UIDs will be `O(n)`, where n is the number of users.
We can also classify many values as safe locally, without having to consult the global safe value set. Our system uses the following rules for local classification:
- If a value has not been seen previously for the `(third party, key)` pair, it is safe.
- If a value is too short, i.e. has too little entropy to be a UID, it is _safe_.
- If more than 3 different values have been seen for a `(third party, key)` pair over a two day period, then the value is not persistent, and therefore safe.
If none of these rules are able to classify the value as safe, we use the global safe set, which tells us which values have achieved a quorum of users who all saw the same value.
### Example
Consider a hypothetical visit to the site `example.com`, which has tracker.de as a third party. After processing a request we generate a set of `T = [(s, d, k, v)]` tuples as follows:
```javascript
T = [
(s= example.com, d= tracker.de, k= z, v= 1459866821),
(s= example.com, d= tracker.de, k= fl, v= 21.0),
(s= example.com, d= tracker.de, k= u, v= CCAAAABI),
(s= example.com, d= tracker.de, k= vr, v= 1440x1024),
(s= example.com, d= tracker.de, k= c7, v= e9d4a7e4d2185cec),
]
```
We can then evaluate these values:
- `(s= example.com, d= tracker.de, k= z, v= 1459866821)`: This was the first time we saw the value `1459866821` for the given `(d, k)`, so this data is _safe_.
- `(s= example.com, d= tracker.de, k= fl, v= 21.0)`: The value `21.0` is too short to be a UID, so it is _safe_.
- `(s= example.com, d= tracker.de, k= u, v= CCAAAABI)`: More than 3 different values seen in the last two days for the same `(d, k)`, so _safe_.
- `(s= example.com, d= tracker.de, k= vr, v= 1440x1024)`: Always the same value seen for this `(d, k)` pair, however the value is in the global safe set (as it represents a common screen resolution), therefore _safe_.
- `(s= example.com, d= tracker.de, k= c7, v= e9d4a7e4d2185cec)`: Always the same value seen, and not in the globally safe set, so this value is _unsafe_ and will be removed.
### Building the Global Safe Set
We build the global safe set daily using the data sent from users’ clients. Clients collect the tuples of data from each request while browsing and send this back to us every hour, adding a timestamp parameter. The values in the tuple are hashed to prevent user-identifiable information being sent to us. The client guarantees that a maximum of one message is sent per user per hour, so this can be used instead of a user id to count the number of users for each (third party, value) tuple. If the number of users exceeds the quorum threshold for a given hour, it is added to the safe set.
This model allows users to create this collaborative safe value set without compromising their privacy. Any privacy sensitive information in the data is obfuscated by the hash function, and, as no UID is required for our aggregation, we cannot derive any browsing history beyond the single hour granularity (there is no way to link messages from different hours), and the most information that could be gained is the first-party domain names visited. This protection is further strengthened by the use of our [Human Web](https://events.linuxfoundation.org/sites/events/files/slides/collecting-user-data-socially-responsibly.pdf) technology which further obfuscates the source of each message we receive.
# Cliqz vs Other Anti-tracking Systems
The algorithmic, data-driven system for removing UIDs from third-party requests which we have described has several advantages over other anti-tracking solutions. As an online system we can respond much quicker, and without human oversight, to changes in trackers and their techniques. If a tracking company switched domains to try and avoid blocklists, we would have the data to block this tracking within a day. Human-curated blocklists would take comparatively longer to update.
In our [paper](https://static.cliqz.com/wp-content/uploads/2016/07/Cliqz-Studie-Tracking-the-Trackers.pdf) we did several tests to measure the difference between our system and other blocklist systems. Our tests indicated a reduction in breakage on web pages caused by our system, compared to adblockers. We also saw that blocklist-based systems blocked more often, but a large proportion of these blocks were false positives: requests which did not contain any UIDs.
The downside of our method is that, unlike other blockers, we see a net performance loss when loading complex web sites. This is because other blocking systems simply block the JavaScript which would otherwise calculate and send a fingerprint, while we block just the outgoing request containing the UID. Thus blockers get the performance benefit of not running this resource-heavy tracking JavaScript code.
However, we believe that in the long term, this property is a net benefit. Unlike other anti-tracking systems, Cliqz can forgive. If a tracker updates its code, and switches to a method which no [longer sends UIDs](http://www.slideshare.net/jmpujol/data-collection-without-privacy-sideeffects-at-big2016-www-2016), then our system will immediately stop blocking their data. With blocklists, there is no such mechanism — trackers are then incentivised to circumvent the block (for example using a new domain name), rather than improving their data collection methods.
# Summary
Our anti-tracking system has now been running successfully for over a year, blocking around 300 million cookies and removing around 10 million UIDs per day to keep our users free from tracking. The same system, as described here, is now also available on [our mobile browsers](https://cliqz.com/en/mobile) to provide the same protection across even more devices, and give users control over what data third-parties can collect about them.
================================================
FILE: blog/how_facebook_knows_exactly_what_turns_you_on.md
================================================
title: How facebook knows exactly what turns you on
subtitle: A technical analysis of the methods used to track users as a third party. Deep dive into a couple of case studies.
author: privacy team
type: article
publish: True
date: 2017-07-22
tags: privacy, circle
header_img: blog/blog-facebook.jpg
+++
The modern web is built around advertising. A multi-billion dollar industry ([$42bn in 2013 in the US](http://www.iab.net/about_the_iab/recent_press_releases/press_release_archive/press_release/pr-041014) and about [6bn in 2015](https://de.statista.com/statistik/daten/studie/456157/umfrage/umsaetze-im-markt-fuer-digitale-werbung-in-deutschland/) in Germany) primarily concerned with one question: How to make the most money by serving exactly the right ad, at the right time, to the right user.
Web pages are extremely complicated constructions, often meshing together multiple software tools and services from different providers, from analytics and social sharing widgets, to dynamic advertising and content recommendation engines. Consider an average news site with social media sharing buttons. More often than not, these are created by linking to scripts from Facebook, Google, Twitter, etc., which then inject the required content into the page. These third parties may then in turn load other required services into the page.
In isolation, this seems mainly harmless. Services are being provided to the website owners to better integrate third-party services such as social networks, add extra widgets such as comment sections and related content, and improve the website’s monetisation through targeted advertising.
However, the implementation of these services often causes a privacy side-effect: they allow third-parties to track your web-browsing across the web, and in some cases even link this history to you personally. When a user visits a new site the third parties included in the page can then look up the browsing history they have collected for this user, and then generate a personalised response based on this information. This is akin to being given a personalised newspaper where the adverts have been selected based on which articles you have read previously, in both this and other newspapers, any magazine articles you might have read, where you shop and what items you were looking for, [where you bank](http://cliqz.com/magazine/pressemitteilung-cliqz-tracking-beim-online-banking) and more. Our data shows that the largest of these tracking third party services can be seen on [almost half of all pages you might visit](http://josepmpujol.net/public/papers/pujolTrackingTheTrackers.pdf), and many others share and [trade user data](https://big.exchange/) amongst each other in order to build a comprehensive user browsing history.
Luckily, as the web is an open system, we can see what these companies are up to, and equip the browser with the capability of foiling their attempts to send tracking data.
This post is the first part of a two-part series. In the second part we will describe how our Anti-tracking system works. This part acts as a background to that, describing the how and why behind online tracking. The methodology and data we present here is based on our [published work](http://josepmpujol.net/public/papers/pujolTrackingTheTrackers.pdf) on Anti-tracking, which we [presented](http://www.slideshare.net/jmpujol/tracking-the-trackers-www-2016) at WWW2016.
## How online tracking works
The mechanism behind online user tracking is simple enough. First, one must be a third party to many page loads across the web. Our data shows that almost 30% of web sites require 10 or more different service providers to fully load their content, and dynamic advertising alone can bring this many different companies into the page. Secondly, the request a third party receives when loaded into a page should contain some kind of user identifier (uid) for the client visiting the page, as well as the address of the first party page visited (usually provided by the Referer [sic] header). Collecting together the first-party pages seen for each uid will then yield the browsing histories of all the users seen.
A simple list of visited web pages may not seem like a significant privacy violation to some, however further analysis can yield much more information than one might expect. Trackers can collect users’ browser and operating system, which can be used for [price discrimination](http://news.northeastern.edu/2014/10/ecommerce-study/), and rough geographical location can be checked using [IP geolocation](https://en.wikipedia.org/wiki/Geolocation). One can also find private urls in the history to determine membership of certain services, such as some [online banking portals](https://static.cliqz.com/wp-content/uploads/2016/07/Cliqz-study-tracking-in-online-banking.pdf) which contain trackers. Another example is the twitter analytics dashboard (e.g. analytics.twitter.com/user/sammacbeth/home). This url is only accessible when logged in as a specific user, and when accessed the browser will transmit this user name in the url to the trackers in this page (in testing, these included Google, Microsoft and tellapart.com), thus enabling these services to add a user’s twitter handle to the previously collected browsing history. Private urls, such as this, are particularly dangerous, because they often contain Personally Identifiable Information (PII) which puts a real identity on the other urls that are being collected in that session (See [http://www.slideshare.net/jmpujol/data-collection-without-privacy-sideeffects-at-big2016-www-2016](http://www.slideshare.net/jmpujol/data-collection-without-privacy-sideeffects-at-big2016-www-2016) for an example.).
### UID Generation Techniques
The uid that trackers need in order to attribute page loads to specific users can be generated in several different ways:
1. Cookies – This is the simplest and most common method for generating uids. Cookies are a web standard for sharing state between a client and server over the stateless HTTP protocol. It is an important part of the web, which enables sites to keep track of your login and/or preferences between visits. Cookies work as follows: When a client makes a request to a server, in the response the server can set a header Set-Cookie with a value of its choosing. The client will, from then on, send this value in headers for any subsequent requests for this domain, and thus the server will know which user it was who sent the request.
2. Network fingerprint – This method uses the properties of the network from which the request comes as an identifier, usually the IP address. This varies in effectiveness based on whether users have unique IPs or not.
3. Client fingerprint – Here, code is run in the client browser to try and build a unique identifier from data accessed in Javascript, Flash and other APIs, for example installed fonts, browser plugin versions, screen resolution, browser version and more. Techniques such as canvas fingerprinting are further able to fingerprint the specific hardware configuration of the user’s computer. Together this can generate a unique fingerprint which is stateless, and endures even when private data is cleared, and private tabs are used.
Once generated, these uids must be transmitted to the tracker with information about the page the user is visiting. Again there are three primary methods:
1. HTTP Headers – This is metadata sent along with a request with information for the server. This is where Cookies are transmitted, but also other data can be sent here. Our data shows that 45% of requests to third parties on web pages seen by our users contain a cookie header.
2. URL Path – Arbitrary data can be sent in the URL path requested from the server. This is commonly in the form of a query or parameter string — key/value pairs separated by & or ; characters at the end of the query. 52% of third party requests have some kind of query string, and 1.5% a parameter string.
3. Post data – This is data sent from the client as part of the main body of the request. We see this kind of request in 0.05% of cases.
### Case Study 1: Facebook cookie tracking
Facebook use cookies to link your web-browsing behaviour to your Facebook account. Facebook widgets are embedded in various sites around the web, and will send the address of the page you are viewing along with your Facebook cookie, enabling Facebook to build a list of sites you have visited. Our data shows that Facebook’s widget reaches 25% of pages loaded by our users – this means that Facebook could collect 25% of an average user’s browsing history.
We can see this tracking in action by inspecting requests in the web browser. First, if we visit the Facebook home page, we can see a cookie called datr being set:
<img class="img-responsive" src="../static/img/blog/how_facebook_post/facebook-casestudy.png"/>
Now, upon visiting a site which has a Facebook widget, in this case bild.de, we can see a request to facebook.com. As third-party cookies are enabled in the browser (the default setting in all major browsers), we will send the cookie we got on the previous page along with the request. The Referer header of this request will also contain the site I am visiting: www.bild.de.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/bild-tracker.png"/>
As I continue to browse the web, this process will repeat, and Facebook will collect a series of requests with this datr cookie and the pages I was viewing.
Finally, if I now log into my Facebook account, we see that the datr cookie remains, and now alongside a cookie with my Facebook user ID. This means that Facebook can now attribute all the pages I have viewed with my personal Facebook account.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/facebook-tracker.png"/>
This mechanism allows Facebook to collect your browsing habits across the web, in order to tailor adverts and recommendations within their site. [Our measurements](http://josepmpujol.net/public/papers/pujolTrackingTheTrackers.pdf) show that this tracking covers around 25% of pages visited by our users.
Facebook were banned last year from using this tracking on European users who had not logged into their site, however this was recently [overturned](http://www.theverge.com/2016/6/30/12069626/facebook-belgian-privacy-commission-cookie-user-tracking-case-overturned), so this practice continues.
### Case Study 2: Moatads fingerprinting
[Moat](https://moat.com/) is an analytics and advertising provider. They are present on many popular news sites, where their JavaScript is loaded into the page, and then a tracking pixel is sent back to their servers. We can observe this behaviour by opening two different sites in our web browser and inspecting the requests to moatads.com:
<img class="img-responsive" src="../static/img/blog/how_facebook_post/moat-mac-uai-1440x774.png"/>
Here we can see many parameters are sent in the request, and many values match across both requests. However, we cannot know for sure if these represent uids, or just other values used legitimately for the service. However, the qn value is suspicious, as a long cryptic value which remains the same when visiting different sites.
We now try opening the same sites in a different browser:
<img class="img-responsive" src="../static/img/blog/how_facebook_post/moat_linux.png"/>
Again, pixels are generated with various parameters set in the request URL. Some are the same as we saw in the first test, for example the qq parameter. However, looking at the qn value we see that it is again the same on both web pages, but different to the value we saw on Mac. We can hypothesise that this is a fingerprint of this browser which functions as a uid, however we would need more examples from more unique browsers to properly test this.
Finally, we test the qn in a private tab in the first browser. As shown below, we see that the same fingerprint is generated. Therefore, Moat are able to also tag page views in private tabs with the same uid as in a normal window, suggesting that they can bypass this protection for their tracking purposes.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/bild-tracker2.png"/>
# Where are the trackers
These two case studies have shown the technical means with which companies can collect the pages you visit, and group them by a particular user, be that against a specific facebook profile or just a hash value which uniquely identifies one’s computer. Having established that third parties may snoop on some pages you visit on the web (with the first party’s permission), the question is how far does this tracking reach, and how much of our browsing habits can these third parties collect?
We presented our data on the [online tracking](http://josepmpujol.net/public/papers/pujolTrackingTheTrackers.pdf) seen by 200,000 users over a two week period at the [WWW2016](http://www2016.ca/) conference in April, which analysed over 13 million page loads by our users. A large study of 1 million sites has also been done by researchers at Princeton with similar findings to ours, although the study is not based on real user traffic but rather on data collected by instrumented browsers that download and scan for trackers top sites on the Web. We present some updated results from our on-going browser telemetry, during August 2016, and containing over 140 million page loads over 1.8 million unique domains. Multiple visits to the same site and/or page are counted multiple times, thus the data set weighs more popular pages more strongly, and represents the tracking observed by an average user of our browser.
The first result we observe from our data, is that a small collection of third parties are installed in a huge number of visited pages. From a list of 2000 domains, representing the top tracking domains, we see that 96% of page loads include a request to a third party in this list. Over 80% of these page loads (and 78% of the total) contain some kind of tracking attempt. Thus, a user browsing the web with no tracking protection could be tracked on 78% of the pages they visit.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/tracking-per-page-load.png"/>
We can further look at how much of an average user’s browsing history each third-party company might be able to track. We analyse the ‘reach’ – the proportion of total page loads in the data set seen – by domains associated with particular companies or products. The figure below shows the top companies in terms of total reach, and for each we indicate the types of behaviour seen on each page. ‘Safe’ means that no tracking behaviour was seen, just that a request was made to the domain; ‘cookie’ and ‘qs’ mean that there was an attempt to transmit a uid with one of these methods, and ‘both’ means that both methods were used.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/tracking-reach-uai-1032x678.png"/>
The figures show that the big players – and particularly Google, with their products taking the top three places – have significant reach across the web. Some specific company behaviours can also be observed, for example Google Analytics does not use cookies, using a weaker kind of fingerprint. Also, Amazon, offer CDN services on their cloudfront.net and amazonaws.com domains, thus a high proportion of their reach is safe.
The other feature of the tracking landscape is the long tail of tracking companies. There are 27 companies/services with over 5% reach, 110 with over 1%, then 450 over 0.1%. This 0.1% still corresponds to over 140,000 pages seen on this data set.
Finally, we can look at how many trackers are seen on each page load. The figure below shows how many distinct tracking domains were contacted for page loads in the dataset. We see that over 10% of pages have over 20 different trackers in them, and the vast majority of page loads have multiple trackers. Therefore, not only are users tracked across most of the web, but there are many companies who are able to generate comprehensive user profiles.
<img class="img-responsive" src="../static/img/blog/how_facebook_post/domains.png"/>
# Conclusion
In this post we’ve given a general description of how online tracking works, and looked at the extent of tracker companies’ reach across the web. In the next post we will look at how we can stop this tracking, and give an in depth description of how our Cliqz Anti-tracking technology works to prevent tracking without an adverse effect on user experience.
================================================
FILE: blog/manifest_v3_privacy.md
================================================
title: Chrome's Manifest V3 - Improving Privacy?
subtitle: How Chrome's changes will reduce user privacy
author: privacy team
type: article
publish: True
date: 2019-06-18
tags: blog, extensions, privacy, chrome
header_img: blog/adblocker-perf-study.jpg
redirect_url: https://www.ghostery.com/blog/manifest-v3-privacy
+++
The Chrome team's proposed changes to browser extension APIs, known as Manifest v3, have proven controversial due to their expected impact on adblockers and privacy extensions. Of particular concern are the changes to the `webRequest` API, whose blocking capabilities are being replaced by the `declarativeNetRequest` API. In repeated posts the Chrome team claim that these changes are required to improve the *performance*, *security* and *privacy* of extensions. In a [previous post](./adblockers_performance_study.html) we showed that, for the most popular adblocker engines, performance is already very good, and these changes are unlikely to improve much. In this post we assess the privacy argument for the changes to request handling, if the proposed changes do improve privacy, and how Ghostery specifically will be affected. We find that:
* The Chrome team have only belatedly stated specific privacy concerns with the `webRequest` API, and these are still not included in the design document.
* The proposed changes do not provide any protections against the stated privacy issues.
* Privacy extensions like Ghostery will be negatively impacted by the changes, reducing their ability to keep users safe online.
## Extension privacy
Browser extensions have the potential to cause many privacy problems - when granted permissions, they can see every page you visit in the browser, view their contents, read and write form data, and send requests to any server on the internet. These powers are required for some of the valuable features extensions provide. Therefore, as the Chrome team rightly [point out](https://blog.chromium.org/2019/05/taking-action-on-deceptive-installation.html), ensuring extensions are consentfully installed is the first step to address privacy.
The Manifest v3 changes, however, primarily address extensions' capabilities post install. As privacy at this point is also a stated goal, what are the privacy concerns and attacks that the changes seek to address? In the Manifest V3 [design document](https://docs.google.com/document/d/1nPu6Wy4LWR66EFLeYInl3NzzhHzc-qnk4w4PX-0XMw8/edit#heading=h.9lwe237fxtp2) this goal is stated as follows:
> Users should have increased control over their extensions. A user should be able to determine what information is available to an extension, and be able to control that privilege.
Later in the document the changes to the `webRequest` API are described, but only using a performance-based reasoning:
> … the extension then performs arbitrary (and potentially very slow) JavaScript, and returns the result back to the browser process. This can have a significant effect on every single network request, ...
They also acknowledge that the `webRequest` API should remain in place for observation.
> The non-blocking implementation of the webRequest API, which allows extensions to observe network requests, but not modify, redirect, or block them (and thus doesn't prevent Chrome from continuing to process the request) will not be discouraged.
This implies that the potential privacy impact of extensions being able to observe all requests going out of the browser is not a concern for these API changes. While the `webRequest` API remains, the switch to allow blocking only via the `declarativeNetRequest` API does nothing for the stated privacy goal of increasing user control over the information extensions can access.
Despite this, since [our study](./adblockers_performance_study.html) showed that the performance cost of `webRequest` blocking for leading adblockers was not an issue, the Chrome team have focused on privacy reasons for the changes. In their [recent blog](https://blog.chromium.org/2019/06/web-request-and-declarative-net-request.html) about web request and declarative net request changes, they state:
> In order to improve the security and privacy guarantees of the extensions platform, we are rethinking some of the extension platform's core APIs. That's why we're planning to replace the blocking Web Request API with the Declarative Net Request API.
This shift in angle has also come up in public statements by Chrome devs:
> "… The big problem with webRequest is unfixable privacy and security holes. …" @justinschuh ([Source](https://twitter.com/justinschuh/status/1134060703231254528))
In the blog post they also mention one potential malicious use of webRequest:
> Because all of the request data is exposed to the extension, it makes it very easy for a malicious developer to abuse that access to a user’s credentials, accounts, or personal information.
If this is the single privacy loophole the `webRequest` changes are targeting, then it seems strange that the solution is to remove the blocking capabilities of `webRequest` and leave the observational ones. Post Manifest V3, the exact same malicious extension will be possible. We can imagine that the Chrome team's strategy may be, that by providing a simple alternative API for blocking use-cases, the extension review process can be tougher for extensions asking for `webRequest` permissions. This, however, would also be possible by just introducing the new API, leaving `webRequest` as it is, and providing developer incentives to switch unless they really need `webRequest` for their use-case.
It is strange that this privacy issue was not stated in the original design document, and the proposed change to `webRequest` is seemingly just collateral damage that does not address the stated goals. More transparency is needed on what the strategy is here, and why keeping `webRequest` observation with blocking removed should be the solution.
To summarise:
- The stated privacy improvements of Manifest V3 are addressed elsewhere in the proposals.
- The privacy and security issues with `webRequest` blocking have not been fully articulated by the Chrome team, with only a brief mention of malicious behaviour in a blog post last month.
- The removal of `webRequest` blocking does not improve the privacy of extensions.
Therefore at this point, the primary impact on privacy from the proposed changes will be the neutering of the capabilities of several privacy extensions. Privacy Badger devs [expect their core functionality to be broken](https://github.com/EFForg/privacybadger/issues/2273) by the changes. Similarly, we expect it to be difficult to provide the same level of protection in Ghostery should these changes come into effect, and we will describe why in the rest of this post.
It is ironic that a change ostensibly aimed at improving user privacy will actually reduce it for many users who rely on privacy extensions to protect them online. Some have suggested that the changes simply align Chrome with Apple's Safari, which provides a similar declarative blocking API for extensions. This overlooks the fact that Safari comes with significant privacy protections by default, having been blocking most third-party cookies by default for years, and recently bringing in advanced anti-tracking measures in the form of [ITP](https://webkit.org/blog/8613/intelligent-tracking-prevention-2-1/). Chrome on the other hand, ships with zero tracking protection by default, and is now hindering extensions which try to provide comparable protections to other browsers.
## How removing webRequest blocking affects Ghostery
This analysis is based on the `declarativeNetRequest` [API documentation](https://developer.chrome.com/extensions/declarativeNetRequest) as of 17th June 2019. The primary features of the API are:
1. A matching grammar for specifying rules that will trigger blocking, header modification or redirects.
2. Up to 30,000 static rules per extension
3. The ability to add _dynamic_ rules at runtime, up to a maximum of 5,000 rules.
4. Rules can have a white- or black-list of first-party sites, to control triggering.
5. Individual sites can be dynamically whitelisted, up to a maximum of 100 per extension.
Ghostery contains the following components which will be affected by the webRequest API changes:
### 1. Tracker matching and blocking
Ghostery contains a blocklist of over 4,000 filters which are used to detect and block trackers. The extension allows users fine-grained control over these, allowing or blocking specific trackers on specific sites or globally. The list of detected trackers is shown in the Ghostery UI for each page visited.
To support the `declarativeNetRequest`, these 4,000 filters would have to be re-written to the new filter grammar that Chrome offers. We are likely to lose some filters in the process, as certain types of matching rule, for example regexes, likely cannot be implemented in the more restrictive grammar.
The more challenging issue, however, is maintaining Ghostery's rich configurability with the low threshold of dynamic rules allowed. As every rule should be toggleable, all 4,000 filters would have to be _dynamic_ rules. This means that we are already using 80% of our allowance from the start, before we have even started adding supplementary rules for adblocking and cookie blocking.
Likewise, the limit of 100 whitelisted sites is prohibitively low, as many users may use the Ghostery 'Trust Site' feature for more sites than this. It is unclear how to handle hitting this limit, as to the user it will seem like the feature is broken if they trust a site, but it does not get saved.
Furthermore, the new API, in its current form, does not report the results of blocking back to the extension. This means that we will still have to run our filters on all urls via the `webRequest` API anyway, in order to display the list of trackers seen and blocked. This means that the user pays the cost of keeping the block list loaded in memory and matching against each url twice.
### 2. Cookie blocking
The Ghostery extension uses a heuristic third-party cookie blocker as part of the 'Enhanced Anti-Tracking' feature. This feature blocks third-party cookies in most cases, using a set of heuristics to decide when cookies should be allowed. It is currently not clear if these heuristics will be able to work correctly without the webRequest API, nor if the dynamic filter cap is sufficient to even hold the basic cookie blocklist.
Our cookie heuristics respond to user input, for example clicking on a Facebook like button or Google login form, in order to trigger a temporary cookie whitelist for a specific domain. To implement this with `declarativeNetRequest`, we would have to add or modify our cookie blocking rule temporarily. As the API for this is asynchronous, we introduce a race condition that we did not have before. If the rule is not added before the request we want to whitelist, the mechanism will fail. This can, for example, break Google logins on third-party sites.
The cookie blocking is done based on a dynamically generated list of tracker domains of between 2,000 and 3,000 entries. For these domains, third-party cookies should be blocked, unless a heuristic allows it. Again, the limited rule threshold of the `declarativeNetRequest` API means that this list would have to be reduced.
Another concern is that the [Rule condition specification](https://developer.chrome.com/extensions/declarativeNetRequest#type-Rule) can distinguish between `firstParty` and `thirdParty` contexts for a request, but this is done on a frame level, rather than relative to the page document. This means that we would not, for example, be able to block Google cookies inside a Google Ads iFrame, as in this context the API would consider requests from the frame as first party.
### 3. Removing private data points
The other component of Ghostery's 'Enhanced Anti-Tracking' feature is the dynamic removal of url parameters seen to be used for cross-site tracking. This uses a [k-anonymity](./how_cliqz_antitracking_protects_users.html) based algorithm, using anonymously contributed data from our users.
As the `declarativeNetRequest` API does not support dynamic redirects, this component cannot be implemented with it.
### 4. Adblocker
Ghostery includes an additional adblocker component which is able to further block ads based on standard blocklists. As this feature should also be toggleable on-and-off at runtime, we would need to use _dynamic_ rules for these filters. With only 1,000 rules available after adding the Ghostery tracker matching, the coverage of this feature would be drastically reduced.
### 5. WhoTracks.Me Data
Ghostery is the primary source of data for this website, using our [anonymised telemetry system](https://arxiv.org/abs/1804.08959) to report on global tracker trends. This largely relies on the webRequest API in order to observe which trackers are on which page. Changes caused by the introduction of `declarativeNetRequest` will reduce the quality of this data. Namely, cookies blocked by the declarative API will not be visible to webRequest listeners. This means that we will not be able to distinguish between trackers setting cookies, which are then blocked, and those who do not set cookies.
### Summary
To summarise, the Manifest V3 changes to the webRequest API will require a significant re-write of the Ghostery extension to be able to fit the existing features into the constraints of the `declarativeNetRequest` API. The result will be:
- Slower: URL matching will have to be done twice in order to show tracker counts in the UI.
- Less configurable: Configuration may have to be limited to fit within the very low dynamic rule limit.
- Break sites more often: We will have to evaluate the trade-offs of relaxing the third-party cookie blocking vs. breaking sites.
- Less private: As the private data removal feature will have to be removed.
## Conclusion
In this post we have shown that the current proposed changes to the webRequest API by Chrome do not improve privacy, and in fact reduce it, by severely hindering the operation of privacy extensions like Ghostery. The limitations on dynamic rules in the new `declarativeNetRequest` API are particularly taxing for extensions which aim to give the user control over what is blocked and what is not.
This forces extensions into a 'dumb blocker' model, where block lists are fixed, and the only controls are an on/off toggle. At the same time,
the changes increase the difficulty and reduce the practicality of implementing dynamic heuristic mechanisms for detecting and blocking tracking.
The webRequest API powers much innovation in browser extensions, however it does implicitly provide access to private user data. While the Chrome team state that privacy is a reason for the proposed changes to this API they have not stated which specific concerns they aim to address. The Manifest V3 changes do not prevent extensions accessing private user data via webRequest, nor have other potentially dangerous APIs like content scripts been limited. Therefore the claims that this change improves extension privacy are misleading and disingenuous.
The fact that very few of the initial concerns regarding Manifest V3 have been addressed in the months since the original announcement, means that it currently looks like the changes will be forced through, despite community objections. This means that Chrome users will become second class web citizens with regards to their access to tracking protection. This is however just a continuation of a trend where Chrome stands still or actively reduces privacy while the rest of the competition have been pushing forward. At this point we recommend considering switching away from Chrome, if you haven't done so already, to browsers with privacy built-in by default. For example, the [Cliqz Browser](https://cliqz.com/en/download) has Anti-tracking built in and enabled by default, and Firefox now ships with [tracking protection on by default](https://blog.mozilla.org/blog/2019/06/04/firefox-now-available-with-enhanced-tracking-protection-by-default/).
_Disclosure: WhoTracks.Me is a joint effort by Cliqz and Ghostery._
================================================
FILE: blog/private_analytics.md
================================================
title: Tracking visits without tracking people
subtitle: A privacy-by-design approach.
author: privacy team
type: article
publish: True
date: 2018-05-03
tags: analytics, privacy-by-design
header_img: blog/analytics/analytics.png
redirect_url: https://www.ghostery.com/blog/private-analytics
+++
Analytics are one of the most common use-cases on the web. You want to know how many people are
visiting your website, whether anyone actually clicked the link you posted on social media, or who
is sending traffic to your website. For most sites, the solution is to just drop a
[Google Analytics](../trackers/google_analytics.html) script into the page - it's free, after all...
This has led us to the current situation, where we see Google Analytics having presence across 87%
of the top half a million websites, and, despite using reasonably short-lived identifiers, the way the data is collected can be used to
[track users across these sites](https://www.slideshare.net/jmpujol/data-collection-without-privacy-sideeffects-at-big2016-www-2016#13).
Is counting page visits such a difficult problem that only Google has solved it? No, there are
[paid](https://get.gaug.es/) and [open source](https://matomo.org/) alternatives available, but
why pay when you can use a free version which does more, and why host a server with the extra
costs that entails, when you don't have to?
But is Google Analytics actually better than the competition? We would argue that, at least among
privacy conscious users (i.e. those
[who contribute to the WhoTracks.Me dataset](../blog/where_is_the_data_from.html)), Google
Analytics will report vastly incorrect figures, for two main reasons:
1. Our data shows that on 29% of pages with Google Analytics some of the requests will be blocked
due to Ghostery blocking settings.
2. On 19% of pages with Google Analytics, Cliqz and Ghostery's AI anti-tracking will remove
potential identifiers from the request, often causing unique visitors and conversions to be
incorrectly measured.
## Analytics without tracking
So how can we _accurately_ measure the traffic coming to our site without exposing the user to
tracking and privacy side-effects? This was a problem we faced when we created the WhoTracks.Me
website. We wanted to have _some_ analytics so that we can measure if we are being successful in
engaging people with the information we are providing on the site. However, we had a few
constraints:
1. No tracking. We [define tracking](../blog/what_is_a_tracker.html) as when a service is able to
collect and correlate data across multiple sites. Unfortunately, as server-side aggregation is the
norm amongst third-party analytics providers, privacy cannot be guaranteed.
[Client side alternatives](http://josepmpujol.net/public/papers/big_green_tracker.pdf) have been
proposed, but unfortunately [the implementation](https://github.com/cliqz-oss/green-analytics) only
reached a proof-of-concept state. This means we have to roll our own service.
2. Minimal Ops. WhoTracks.Me is a statically generated site, which is simply hosted on a CDN. This
decision was made to minimise costs, make it fast, and eliminate the need to deploy and monitor
hosting infrastructure. Having done this, it does not make sense to have to deploy infrastructure
in order to host a [Matomo](https://matomo.org/) or similar service.
3. Respect Privacy. The system should not store any personal information from users (i.e. IP
address), nor be able to correlate visits for an individual user over a long time frame. Apart from
the obvious reasons for this, it makes regulatory compliance easy: If we do not hold IP addresses,
it is not possible for us to extract data on an individual user for data access or deletion
requests (as per GDPR).
Our analytics implementation satisfies these three constraints, using probably the oldest technique
on the Internet: server log parsing. Daily analytics for the WhoTracks.Me site are generated as
follows:
1. Visits to the site are logged via [CloudFront's logging mechanism](https://docs.aws.amazon.com/AmazonCloudFront/latest/DeveloperGuide/AccessLogs.html).
2. Each day, a script processes these logs, to obfuscate personal data such as IP addresses. This
script generates a random key for the day, and encrypts all IP addresses with this key. The
anonymised logs are copied to a new bucket, and the key is destroyed once the job completes. This
method allows us to count unique visits from an IP address during a single day, but no day-to-day
correlations can be made, nor can the IP address ever be recovered from the anonymised value.
3. The original CloudFront logs (with IP addresses) are removed.
4. We can then parse the clean logs and filter out requests to static resources and those by bots
in order to see requests to actual pages. We can count unique visitors within single days, using a
combination of user-agent and anonymised IP; we can see where incoming traffic is coming from via
HTTP referrers (which we also strip of potentially revealing parameters) and so on.

<p class="img-caption">Processing of raw CloudFront logs to remove potential personal data.</p>
This workflow allows us to keep track of how much traffic we are getting to the WhoTracks.Me
website. There is also no reason that this method could not be scaled up to more complex use-cases
which services like Google Analytics provide, like conversion counting - provided the time frame
that these conversions can occur in is shorter than the time the IP encryption key is used for.
The method is also safe with respect to privacy regulations and user preferences. As IPs are stored
for a maximum of 1 day (and this is only because CloudFront's logging does not obfuscate IPs for us), no
other personal information is collected, and message linkage is limited to 1 day, there are no
additional obligations regarding the usage of this data under GDPR. Furthermore, as tracking is time
limited and context limited (this data can only be used for usage on whotracks.me), it respects
[Do Not Track](https://en.wikipedia.org/wiki/Do_Not_Track) automatically (using the standard's own
[tracking definition](https://www.w3.org/TR/tracking-dnt/#terminology.activity)).
## Conclusion
We rolled our own analytics for this site because there was no off-the-shelf solution providing the
(very basic) analytics we wanted without significant extra overhead, or potential privacy
implications for users of the site. Our system leverages CloudFront logging with a data obfuscation
step in order to collect privacy-safe server logs which can then be analysed for basic insights.
This technique could be extended to provide most of the richer features of existing web analytics
tools.
The lack of privacy-preserving tools in the web analytics ecosystem is a worrying trend. Google
Analytics dominates as they provide an extremely feature-rich product at zero cost to the webmaster.
It is difficult to see how a service can compete with free without selling analytics data. Existing
competitors mostly aim for businesses who will pay for a premium product, and leave bloggers and
smaller sites to Google.
While increasing use of adblockers is a more fundamental threat to Google's Ad business, a side
effect may be a loss of trust in Google Analytics, as we measure
[29%](https://github.com/ghostery/whotracks.me/blob/master/whotracksme/data/assets/2018-03/global/trackers.csv#L2)
of pages with Google Analytics being affected by blocking. We already see companies which rely on
analytics for core business activities (for example advertisers using affiliate schemes) deploying
multiple analytics scripts and averaging the results. If the trust in analytics breaks down, then
this whole ecosystem may unravel.
================================================
FILE: blog/static_site.md
================================================
title: Building whotracks.me
subtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top domains.
author: privacy team
type: article
publish: False
date: 2017-11-03
tags: tracker-free, lightweight
header_img: blog/blog-site.jpg
+++
At Cliqz and Ghostery, we [collect anonymous data about trackers](/blog/where_is_the_data_from.html)
to power our [anti-tracking](/blog/how_cliqz_antitracking_protects_users.html) technology.
We see our anti-tracking as a community effort and as such we want to share a structured
representation of this data to cast some light on the tracker landscape.
Out of the three main entities involved in a page load: **users**, **websites** and **trackers**,
we have data only on the last two. We'll start with:
* Profiles of the [top 500 trackers](/trackers.html)
* Tracker data on the [top 500 domains](/websites.html).
With these out of the way, a blog space would be needed. This is for two
reasons. First, we realised there was a need for a learning space where we
explain concepts referred to in the site. We call these **primers**.
These define what we call a [tracker](/blog/what_is_a_tracker.html),
what [cookies](/blog/cookies.html) and [fingerprinting](/blog/fingerprinting.html)
are or [where this data comes from](/blog/where_is_the_data_from.html).
Hopefully over time it will become a space for curious readers
to be introduced to tracking technologies. The second reason is to have
a space where we'll be writing about particular trackers, technologies, papers,
engineering, and other interesting topics.
## Going static
Through whotracks.me, we want to cast some light on the tracking
landscape, but also make a point about trackers and **privacy by design**,
hence the choice of this being a static site was pretty obvious. This
meant that we could build the whole site offline, put it in a folder
and serve it through CDN.
Given this will be updated a few times a month, build performance was not really
a big issue for us. But stumbling upon [a discussion](https://news.ycombinator.com/item?id=15507538)
about site generators' performance, some comments read:
- *"with Hugo + Pygments was taking ~20s for ~20 pages at the time"*
- *"92 pages in 1s (full rebuild, No CSS magic tooling though)"*
- *"Rust: ~10k pages in ~60s"*
The assumption would be that most of this time is spent parsing the
markdown files. To build this site however, with the exception of the
blog, the rest of the pages are mainly about instantiating a template,
plugging some content, and writing to disk. So most likely a comparison between
site generators and this would be unfair. At the time of writing,
whotracks.me has roughly 1020 pages. On 1000 of these pages there are
offline generated plots, quite some data and a fair amount of tooling
with respect to styling. On a `Thinkpad x230` with an Intel `i3 processor`:
```bash
(venv) ➜ whotracks.me git:(master) ✗ time python build.py site
Home page ............................... done
Tracker list ............................ done
Website list ............................ done
Blog List ............................... done
Blog Posts .............................. done
Website pages ........................... done
Tracker Pages ........................... done
python build.py site 13.86s user 1.08s system 158% cpu 9.400 total
```
This will be a 5 part series dedicated to:
1. [Generating a static site (part 1)](/blog/static_site_generation.html)
2. [Visualization (part 2)](/blog/static_site_visualization.html)
3. [Building a blog (part 3)](/blog/static_site_blog.html)
4. Search, for some definition of search.
5. No third party trackers and Fast
The code and data to generate this site are open-sourced at
[`https://github.com/ghostery/whotracks.me`](https://github.com/ghostery/whotracks.me).
<br><br>
So let's start with [Generating a Static Site (part 1) ... ](/blog/static_site_generation.html)
================================================
FILE: blog/static_site_blog.md
================================================
title: Building whotracks.me - Blog (part 3)
subtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top domains.
author: privacy team
type: article
publish: False
date: 2017-11-01
tags: blog, markdown
header_img: blog/blog-site-p3.png
+++
In most static site generators, one writes markdown, which is parsed and rendered
into a nice blog post. So how do we do that here? As it turns out, it's pretty
straightforward.
We have two cases here, first the main blog page, where a list of the posts
is presented, and then the post page. For both of these we have
templates, which can be found at
[`templates/blog.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/blog.html) and
[`templates/blog-page.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/blog-page.html)
respectively.
For this, let's write a super simple function to parse the markdown file. To make
its life easy, we specify a given format in the post's markdown that looks like this:
```md
title: Building whotracks.me - Blog (part 3)
subtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top domains.
author: privacy team
type: article
publish: True
date: 2017-11-01
tags: tracker-free, lightweight
header_img: blog/blog-site-p3.png
➕➕➕
<MARKDOWN BODY>
```
These are all the components we need to render the snippet, and
the actual blog post. As promised the parsing function is quite simple:
```python
def parse(fp):
    '''Parse a blog post file into its metadata fields and markdown body.

    fp: filepath to the markdown file

    The file must start with eight `key: value` lines (title, subtitle,
    author, type, publish, date, tags, header_img), each terminated by a
    newline, followed by the separator and then the markdown body.

    Returns a dict with the keys filename, title, subtitle, author, type,
    publish (bool), date, tags (list of str), header_img and body.

    Raises ValueError if the separator is missing or the header does not
    consist of exactly eight lines, and IndexError if a header line has no
    colon.
    '''
    with open(fp, encoding="utf-8") as r:
        text = r.read()

    # Split on the first separator only, so the body may contain it as well.
    meta, body = text.split('➕➕➕', 1)
    title, subtitle, author, post_type, publish, date, tags, header, _ = meta.split("\n")

    def value(line):
        # Split on the first colon only: values (e.g. a title) may contain ":".
        return line.split(":", 1)[1].strip()

    return {
        "filename": fp.split("/")[-1].replace(".md", ""),
        "title": value(title),
        "subtitle": value(subtitle),
        "author": value(author),
        "type": value(post_type),
        # Compare against the literal instead of eval()-ing file content.
        "publish": value(publish).lower() == "true",
        "date": value(date),
        "tags": [tag.strip() for tag in value(tags).split(",")],
        "header_img": value(header),
        "body": body
    }
```
Alright, so now we have a way to parse the markdown to generate all parts that we
need and the templates to render them, so we are left with styling. There are three
elements we need to style:
- the blog post card (snippet)
- the actual post page.
- the code snippets style
Their styles are respectively defined in:
- [`static/scss/blog/card.scss`](https://github.com/ghostery/whotracks.me/blob/master/static/scss/blog/card.scss)
- [`static/scss/post/post.scss`](https://github.com/ghostery/whotracks.me/blob/master/static/scss/blog/post.scss)
- [`static/scss/post/github.scss`](https://github.com/ghostery/whotracks.me/blob/master/static/scss/blog/github.scss)
================================================
FILE: blog/static_site_generation.md
================================================
title: Building whotracks.me - Generating a static site (part 1)
subtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top domains.
author: no one
type: article
publish: False
date: 2017-11-02
tags: tracker-free, lightweight
header_img: blog/blog-site-p1.png
+++
## 1. Generating a static site
We figured that, for this problem, speed of development and richness of the
ecosystem are very important, so we decided to go with Python.
Since it's a new project, and we had no dependencies,
we decided for [Python 3.6](https://docs.python.org/3.6/).
Generating a static site boils down to two important components:
* Path management
* Templates
This is in no way implying that all there is to a static site generator
are these two, but for our needs, that was the case.
Both of these are defined in [`templating.py`](https://github.com/ghostery/whotracks.me/blob/master/templating.py).
### 1.1 Path management
There are two parts here. First, generating urls for shared
resources (i.e. `static`, `shared data` etc),
and second generating urls dynamically in the templates
for all different entities (e.g.: `trackers`, `websites`,
`primers` for learning etc).
The first part is easy, we just store the paths of the
shared resources in a dictionary, which we will later pass
to the template rendering function as persistent context:
```python
import os
PATHS = {
"_site": os.path.abspath('_site'),
"static": '/static',
...
}
```
For the second part, we need the `entity` we are generating
a url for, and the `id` of that entity.
```python
from urllib.parse import quote_plus
class DataSource:
    # ... complete class definition on github repo where
    # details on the structure of the data is also present

    @staticmethod
    def normalize_url(url_substring):
        """Return `url_substring` as a lowercase, quoted url segment.

        Slashes are first replaced by spaces, which `quote_plus` then
        encodes as "+".
        """
        without_slashes = url_substring.replace("/", " ")
        return quote_plus(without_slashes).lower()

    def url_for(self, entity, id):
        """Return the site-relative url of the page for `id`.

        entity: one of "tracker", "website" or "report"; any other value
            yields None.
        id: identifier of the entity the url is generated for.
        """
        if entity == "tracker":
            page = self.normalize_url(id)
            return "/trackers/{}.html".format(page)
        if entity == "website":
            page = self.get_site_name(id)
            return "/websites/{}.html".format(page).lower()
        if entity == "report":
            return "/reports/{}.html".format(id)
        return None
```
We use [quote_plus](https://docs.python.org/3.6/library/urllib.parse.html#url-quoting)
here to make sure we get properly formatted urls. This
is very useful to avoid errors in generating urls for entities whose id
would cause issues such as `[24]7`, which is a
[tracker](https://whotracks.me/trackers/24_7.html).
### 1.2. Templating
The templating engine we chose was [Jinja2](http://jinja.pocoo.org/docs/2.9/).
Jinja features template inheritance which is very
useful for having reusable components. Although in
the docs they say `python >= 3.3` support is
experimental, we found it very stable.
We'll be using the `Environment` object from jinja
to load templates and `FileSystemLoader` as the
loader of choice ([docs](http://jinja.pocoo.org/docs/2.9/api/))
to load templates from the file system.
```python
from jinja2 import Environment, FileSystemLoader
```
Now we need a function to render the templates, and
pass default jinja variables that will be shared
amongst all templates:
```python
import os
def get_template(data, template_name):
    """Load a template from ./templates with the `url_for` filter registered.

    data: an instance of DataSource; its `url_for` method backs the filter
    template_name: name of the template file inside ./templates

    Returns the template object produced by `Environment.get_template`.
    """
    # Imported locally: the module-level jinja2 import only brings in
    # Environment and FileSystemLoader.
    from jinja2 import select_autoescape

    env = Environment(
        loader=FileSystemLoader('./templates'),
        autoescape=select_autoescape(['html', 'xml'])
    )
    # adding url_for as a custom filter to the environment object
    env.filters["url_for"] = data.url_for
    return env.get_template(template_name)
def render_template(template, **context):
    """Render `template` with the shared PATHS plus the given context.

    template: object exposing `render(**variables)`
    context: template specific object(s), passed through as keyword arguments
    """
    rendered = template.render(PATHS=PATHS, **context)
    return rendered
```
The documentation on [filters in jinja](http://jinja.pocoo.org/docs/2.9/api/#writing-filters)
can be found here. Note how we registered a filter
in the environment. This is very useful and we use
this extensively in [`templating.py#L137`](https://github.com/ghostery/whotracks.me/blob/master/templating.py#L137).
### 1.3. Building a static page
In this section, we'll use the functions defined
above to build a simple tracker page.
```python
import os

from templating import PATHS, DataSource, get_template, render_template


def build_tracker_page(data, tracker):
    """Render the page of a single tracker and write it into the site folder.

    data: an instance of DataSource
    tracker: a dictionary that holds information on a given tracker;
        tracker["id"] determines the output path, the whole dictionary
        is handed to the template
    """
    template = get_template(data, "tracker-page.html")
    content = render_template(
        template=template,
        tracker=tracker
    )
    # url_for returns a site-relative url ("/trackers/<id>.html"). Strip the
    # leading slash and anchor it in the build folder, otherwise the page
    # would be written relative to the filesystem root.
    relative_path = data.url_for('tracker', tracker["id"]).lstrip("/")
    with open(os.path.join(PATHS["_site"], relative_path), 'w') as fp:
        fp.write(content)
```
All the templates are defined in [`./templates`](https://github.com/ghostery/whotracks.me/tree/master/templates).
So in that folder, we must have a `tracker-page.html`
template that could look like this:
```html
{% extends "base.html" %}
{% block content %}
{# Tracker name, linking to the tracker's profile page via the url_for filter #}
<div>
    <a href="{{ 'tracker'|url_for(tracker.id) }}">{{ tracker.name }}</a>
</div>
{% endblock %}
```
We have a [`base.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/base.html)
where we define the shared html structure with all stylesheets
and scripts. Inside the `body` in this base we declare a content block,
which we will be populating in other templates
(see [`base.html#L51`](https://github.com/ghostery/whotracks.me/blob/master/templates/base.html#L51)).
So now in our `tracker-page.html` template we simply extend
base and start populating the content block, which in our
case has the tracker name with a link to its profile.
Note that this is to simply show how we can access the
`tracker` dictionary passed to `render_template()` and the
custom filter `url_for` registered earlier in `get_template()`.
The actual function for building tracker-pages is defined in
[`buildsite.py#L120`](https://github.com/ghostery/whotracks.me/blob/master/buildsite.py#L120)
while the template for tracker pages like [this one](/trackers/criteo.html)
is defined here [`tracker-page.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/tracker-page.html).
## Relevant files
Do not forget to check out our [repository on github](https://github.com/ghostery/whotracks.me)
for the actual implementation and more details.
Relevant files are:
- [`buildsite.py`](https://github.com/ghostery/whotracks.me/tree/master/buildsite.py): entry point for building pages
- [`templating.py`](https://github.com/ghostery/whotracks.me/tree/master/templating.py): handles the templating and path management (discussed here)
- [`templates/`](https://github.com/ghostery/whotracks.me/tree/master/templates) : Where all html templates and components are defined
================================================
FILE: blog/static_site_visualization.md
================================================
title: Building whotracks.me - Visualization (part 2)
subtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top domains.
author: privacy team
type: article
publish: False
date: 2017-10-30
tags: tracker-free, lightweight
header_img: blog/blog-site-p2.png
+++
A picture says a thousand words - or so they say. Interestingly,
some recent research suggests that even when we read, our brain
actually recognizes words as pictures [1]. That said (as if this needed
justifying), having plots accompany text and numbers is typically a good idea,
so we added some plots.
# Offline plots with Plotly
Choosing [Plotly](https://plot.ly/python), allowed us to keep as much
of the codebase as possible in python, and have interactive plots as opposed to
images.
## Plot Components
The main components needed to plot something in plotly are five: `traces`,
`data`, `layout`, `figure` and the `plot` object, where they're put together.
- **Traces** are [`graph objects`](https://plot.ly/python/reference/)
populated with the input data needed.
- **Data** is a list of all traces
- **Layout** is a dictionary of configuration options that determines the
layout of the plot.
- **Figure** is a dictionary with only two keys: `data` and `layout`, and the
respective values defined earlier.
- **Plot Object**: this is the plot method used (plotting online and offline)
A typical function that plots something, has this rough structure:
```python
def some_plot(param_0, param_1 ..., param_n):
# list of traces
data = [trace_0, trace_1, ..., trace_n]
# Dictionary to configure the layout of the plot
layout = {
config_option_0: value (type:: str | int | dict)
config_option_1: ____
config_option_n: ____
}
# creating the fig object
fig = {
data = data,
layout = layout
}
# creating the plot object (see next section for details)
return plotly.offline.plot(fig, other_configurable_params)
```
We'll discuss details and provide examples on each using real examples of
plots we have on this site.
## Plotting Offline
This is where the plot object (referred to earlier) gets created. There are
a few options.
```python
from plotly.plotly import plot, iplot   # create a url to be viewed on plotly's
                                         # website (api key needed). With iplot
                                         # you can also open the plot within
                                         # jupyter notebooks
from plotly.offline import plot, iplot  # the first one creates a file in one
                                         # of several file formats, the second
                                         # creates an interactive plot without
                                         # connecting to the plotly server, but
                                         # viewable in a notebook.
```
We will be using [`plotly.offline.plot`](https://plot.ly/python/offline/) and
choose `div` as the output type, which is very handy given it is html
that will go into the template where it will be rendered. This enables us to
generate the plots completely offline and just link the minified
[`plotly.js`](https://github.com/ghostery/whotracks.me/blob/master/static/js/plotly-v1.29.3.min.js)
in the head of [`base.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/base.html).
One downside to consider, is the 2.8MB size of plotly.js though. For us however,
given the site will be served via CDN, this should be cached after the
first time it loads.
Let's write a function with all options we need, that will be used for all
types of plots shown later in this post. This function is defined in
[`plotting/utils.py`](https://github.com/ghostery/whotracks.me/blob/master/plotting/utils.py):
```python
def div_output(fig, display_mode_bar=False):
    """Plot `fig` offline and return plotly's `div` output.

    fig: figure (or data) handed to plotly as `figure_or_data`
    display_mode_bar: forwarded as the `displayModeBar` config option
    """
    plot_options = {
        "output_type": 'div',
        "show_link": False,
        "include_plotlyjs": False,
        "config": {"displayModeBar": display_mode_bar},
    }
    return plotly.offline.plot(figure_or_data=fig, **plot_options)
```
Note that `display_mode_bar` is the set of options that shows up on the top
right corner of the plot when rendered by `plotly.js`, and it looks like this:
<img class="img-responsive img-with-padding" src="../static/img/blog/plotting/display_mode_bar.png">
<p class="img-caption">Figure 2: Mode bar on top right corner of plotly plots.</p>
`include_plotlyjs` is set to `False` to avoid `plotly.js` being loaded inline
with the `div` output for every plot. This is not necessary as it is already
linked in [`base.html`](https://github.com/ghostery/whotracks.me/blob/master/templates/base.html).
## Bar Chart
On the main page of this site, you will see this:
<img class="img-responsive img-with-padding" src="../static/img/blog/plotting/bar-chart.png">
<p class="img-caption">Figure 3: Horizontal bar chart on tracking reach of top 10 companies</p>
The code to generate this can be found in [`plotting/companies.py`](https://github.com/ghostery/whotracks.me/blob/master/plotting/companies.py).
Let's write a simpler function for a horizontal bar plot to get the idea:
```python
def horizontal_bar_plot(x, y):
    '''
    Horizontal bar plot with the first two entries highlighted.

    x: values
    y: names

    Returns whatever div_output returns for the assembled figure.
    '''
    c_purple = "#A069AB"
    c_gray = "#BCC4CE"

    # Purple for the first two entries, gray for all remaining ones,
    # sized to the input rather than assuming exactly 10 bars.
    n_bars = len(x)
    n_highlighted = min(2, n_bars)
    colors = [c_purple] * n_highlighted + [c_gray] * (n_bars - n_highlighted)

    trace = go.Bar(
        x=x,
        y=y,
        orientation='h',
        marker=dict(
            color=colors
        ),
    )
    data = [trace]
    layout = go.Layout(
        dict(
            showlegend=False,
            xaxis=dict(
                color=CliqzColors["gray_blue"]
            )
        )
    )
    fig = dict(data=data, layout=layout)
    return div_output(fig)
```
## Tracker Reach - trend Line
This chart, as many others, was inspired by Edward Tufte's sparkline [2],
drawn without axes or coordinates.
<img class="img-responsive img-with-padding" src="../static/img/blog/plotting/sparkline.png">
<p class="img-caption">Figure 4: Trend line of tracker reach.</p>
```python
def sparkline(ts, t):
    """
    Sparkline: a bare trend line plus a marker on its last point.

    Args:
        ts: timeseries data (every value is multiplied by 100 before plotting)
        t: x-axis (time)

    Returns: html output of an interactive timeseries plot
    """
    purple = "#A069AB"
    y = [value * 100 for value in ts]  # scaling percentages

    # the whole series, and a single marker on its final point
    whole_series = line(x=t, y=y, color=purple)
    last_point = line(x=[t[-1]], y=[y[-1]], color=purple, mode='markers')

    hover_label = dict(
        bgcolor="#1A1A25",
        bordercolor="#00000000",  # transparent
        font=dict(
            family=WTMFonts.mono,
            size=13,
            color="#BFCBD6"
        )
    )

    x_axis = dict(
        autorange=True,
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        hoverformat="%b %y",
        ticks='',
        showticklabels=False
    )

    # providing some padding for the sparkline: more headroom is left
    # above the line when the series ends on its maximum
    lowest, highest = min(y), max(y)
    headroom = 1.15 if highest == y[-1] else 1.05
    y_axis = dict(
        range=[lowest * 0.90, highest * headroom],
        showgrid=False,
        zeroline=False,
        showline=False,
        autotick=True,
        ticks='',
        showticklabels=False
    )

    layout = go.Layout(
        dict(
            showlegend=False,
            height=100,
            width=153,
            hoverlabel=hover_label,
            xaxis=x_axis,
            yaxis=y_axis
        )
    )
    fig = dict(data=[whole_series, last_point], layout=layout)
    return div_output(fig)
```
The code used to plot the sparkline seen in tracker profiles is defined
in [`plotting/trackers.py`](https://github.com/ghostery/whotracks.me/blob/master/plotting/trackers.py).
## Sankey Diagrams
Sankey diagrams are good at visualizing flow volume metrics. Sometimes
they are found under the name alluvial diagrams, although they originally are
different types of flow diagrams.
<img class="img-responsive img-with-padding" src="../static/img/blog/plotting/tracker-map.png"/>
<p class="img-caption">Figure 1: Sankey diagram used to represent a <a href="../websites/upornia.com.html">tracker map</a></p>
In this site we use sankey diagrams in website profile
pages like [bahn.de](/websites/www.bahn.de.html) to map companies
and the trackers they operate to the category of the tracker. The thickness
of the link is a function of the frequency of appearance of the tracker
per page load in the given domain. So looking at the diagram above,
we know that the dominant tracker category is advertising and Google operates
the most trackers and has the highest frequency of appearance.
Our Sankey Diagram function in Python looks like this:
```python
from plotting.utils import div_output
def sankey_plot(input_data):
    """Assemble a horizontal sankey figure from `input_data` and return its div output.

    input_data: mapping with a "node" entry (providing "label" and "color")
        and a "link" entry (providing "source", "target", "value" and
        "label"). Node labels are shown with underscores replaced by spaces
        and capitalized. Every link is drawn in "#dedede"; a "color" entry
        under "link" is not read.
    """
    nodes = input_data['node']
    node_labels = [name.replace("_", " ").capitalize() for name in nodes['label']]
    node_spec = dict(
        pad=10,
        thickness=30,
        label=node_labels,
        color=nodes['color']
    )

    links = input_data['link']
    link_spec = dict(
        source=links['source'],
        target=links['target'],
        value=links['value'],
        label=links['label'],
        color=["#dedede"] * len(links['source'])
    )

    data_trace = dict(
        type='sankey',
        domain=dict(
            x=[0, 1],
            y=[0, 1]
        ),
        hoverinfo="none",
        orientation="h",
        node=node_spec,
        link=link_spec
    )
    layout = dict(
        autosize=True,
        font=dict(
            size=12
        )
    )
    return div_output(dict(data=[data_trace], layout=layout))
```
Having looked at a lot of examples of sankey plots, we noticed a recurrent
pattern: they do a great job at explaining the plot aesthetics, but
take the structure of input data as given. This is a bit of a problem, because
in most examples the input data is a huge json file, and figuring out the
structure of such json file can become tedious.
Here is how `input_data` is structured:
```python
input_data = {
"node":{
"label": [],
"color": []
},
"link": {
"source": [],
"target": [],
"value": [],
"label": [],
"color": []
}
}
```
As you notice, input_data has two main parts: node and link:
**NODE**: `input_data["node"]` is responsible for building nodes. In our example these nodes are either
categories of trackers or companies that operate them. The attributes of each node are two:
`label` and `color`. These are both lists of strings. These lists have to have equal length because
the mapping of each label to a color is done based on the item's index in the list.
**LINK**: `input_data["link"]` is responsible for linking two nodes together. Each link has
the following attributes: `source`, `target`, `value`, `label` and `color`. So here is where the index of
`input_data["node"]["label"]` becomes very important given the way sankey plots have been implemented in
plotly. The `source` and `target` are lists of equal length, where the index is used to link.
<img class="img-responsive img-with-padding" src="../static/img/blog/plotting/node_label.png"/>
<p class="img-caption">Figure 5: Node label illustration</p>
The elements in `source` and `target` are in fact the indexes of the source node and target
nodes in the `input_data["node"]["label"]`. So if we were to refer to the illustration in the
figure above, to render our sankey diagram we would have:
```python
source = [1, 1, 1, ... ]
target = [0, 2, len-2, ... ]
```
With that out of the way, the remaining are intuitive: `value` represents how thick the link should be,
`label` what name it has and `color` its color. All the `link` attributes are lists of equal length, and
the matching is done based on index.
For details, have a look at the actual implementation of the `input_data` generation
in [`utils/companies.py`](https://github.com/ghostery/whotracks.me/blob/master/utils/companies.py).
## References
[1] [Adding Words to the Brain's Visual Dictionary](http://www.jneurosci.org/content/35/12/4965.short) <br>
[2] [Sparkline - Wikipedia](https://en.wikipedia.org/wiki/Sparkline) <br>
================================================
FILE: blog/tracker-tax.md
================================================
title: Tracker Tax
subtitle: The impact of third-party trackers on website speed in the United States.
author: privacy team
type: article
publish: True
date: 2018-05-29
tags: blog, update
header_img: blog/tracker_tax/tracker-tax.png
+++
_This post is a summary of the paper **"The Tracker Tax: the impact of third-party
trackers on website speed in the United States"**. Full paper can be found [here](https://www.ghostery.com/wp-content/themes/ghostery/images/campaigns/tracker-tax/Ghostery_Study_-_The_Tracker_Tax.pdf)_.
<br /><br />
Earlier this month, we published a study titled, _“The Tracker Tax: the impact
of third-party trackers on website speed in the United States”_. The goal of
this study was to shed light on the impact of trackers from a performance
perspective, rather than the more frequently studied privacy standpoint.
Previous research on the topic has looked at the ubiquitous nature of online tracking
and their various business models[^1], pervasiveness of tracking, especially among
news websites[^2] and the privacy implication of tracking in the wild where a few companies
have extensive reach on web traffic [^3].
Beyond privacy concerns, we are left with one question: Do trackers cost us time?
More specifically, what is the relationship between the number of trackers and the time
a page takes to load? We call this tracker impact on the website page load times, also
referred to as page latency, the _Tracker Tax_.
## Data Collection and Cleaning
Intuition tells us that the more trackers present on a page, the longer it would take
that page to load; however, this hypothesis had not been tested on a large scale.
Web privacy measurement framework, [OpenWPM](https://github.com/citp/OpenWPM) has been
used by numerous researchers to collect data for privacy studies on a mass, automated
scale, but this tool was not built to measure website performance metrics like page load
time. So we built a custom crawler to collect the number of third-party trackers on a
website and the time it took that page to load. The crawler was built with Selenium
running Chrome, making GET requests from a server based in New York City, and used
[Ghostery](https://ghostery.com) to collect two metrics per page load: the count of
third-party trackers and number of seconds to load the page.
Ghostery detects third-party trackers by matching URLs from HTTP requests against
its tracker database, which currently contains over 3,000 tracker companies and 4,700 tracker
patterns. To measure the time it takes a page to load, Ghostery uses
Mozilla’s `Window.performance` API [^4] by taking the delta between
`domContentLoadedEventStart` and `requestStart`. We chose to use Ghostery to measure
tracker count and page load time so the public could replicate similar, independent
analyses by simply installing the browser extension on their own and loading a
website to easily see these two metrics.
We ran the custom web crawler five times on each of the top 500 websites in the
United States, as determined by Alexa [^5]. Data cleaning included removing domains
with fewer than five successful measurements, and excluding four Chinese websites
from the sample. We excluded these websites because we suspect there are China-based
trackers that are not yet accounted for in the Ghostery database. Further, to account
for variation in the data, we filtered out the fastest and slowest page loads per
site (and their associated tracker counts), so that the data would be less sensitive
to outliers and data collection errors. The crawler was run under two configurations:
1) with no trackers blocked, and 2) with the Ghostery browser extension
blocking all trackers. Both configurations underwent the same data cleaning process.
## Results
### Tracker Ecosystem
Using the data collected under the configuration without tracker blocking enabled,
we saw that nearly 90% of page loads contained at least one tracker, 65% had at
least 10 trackers, and 20% had 50 or more trackers. Only 10% of page loads were
tracker free. These metrics once again confirm the prevalence of online tracking,
and broadly align with our
[previous study](https://www.ghostery.com/wp-content/themes/ghostery/images/campaigns/tracker-study/Ghostery_Study_-_Tracking_the_Trackers.pdf)[^6]
which observed that 77.4% of page loads contain trackers.

<p class="img-caption">Figure 1: Distribution of the number of trackers</p>
There are several differences between our two studies which may explain the increase
in tracker dominance seen in this study. Firstly, this study’s sample contains the
500 most popular websites in the US, while our previous study analyzed 144 million
page loads across more than 12 countries. By only considering the most popular websites
and neglecting the long tail of more obscure ones, it is not surprising that this study
saw a larger proportion of sites with a tracker. Additionally, the data for this study
was synthetically generated using a custom crawler, whereas our previous study used data
gathered from users of the Ghostery browser extension who had opted-in to the collection
of information about trackers on pages they visit. While the methodologies differ,
both studies verify tracker pervasiveness throughout the web.
### Trackers and Page Latency
Without blocking trackers, only 17% of all the pages in the study loaded within
5 seconds. All other pages loaded much more slowly: it took more than 10 seconds
to load nearly 60% of the pages, more than 30 seconds for 18% of the pages, and
nearly 5% of the pages took over a minute to load. This long tail cannot be ignored
and suggests Internet users waste a lot of time every day simply waiting
for websites to load.

<p class="img-caption">Figure 2: Average time to load trackers</p>
While we found that websites are generally slow to load, can any of this page
latency be explained by the number of third-party trackers on that site?
To answer this question, we calculated the average page load time for
each tracker count. We excluded both tracker volumes with fewer than
five observations and page latency outliers within each tracker
count (identified using the interquartile range rule).
To quantify the relationship between the number of trackers on a website and
the average time it took that page to load, we ran a simple linear regression
(`adj-R2 0.802`) which suggested that each additional tracker adds, on average,
0.5 seconds to the overall page load. The next model we fitted, which included
a quadratic term (`adj-R2 0.836`), suggests that trackers have an increasing
impact on page load times. However, these linear models both exhibit
heteroscedasticity – uneven variance of the error terms – and thus violate
linear regression assumptions.

<p class="img-caption">Figure 3: Log Latency as a function of the number of trackers </p>
A Box-Cox test showed that log-transforming the response variable would realize
the best fitting model, and also act as a variance-stabilizing transformation.
The log-linear model (`adj-R2 0.885`) on the transformed data indicates a
compounding effect: if the tracker count increases by 1, we expect the
page load time to increase by 2.5%.
### Protection from Trackers
We also assessed the difference in page latency when trackers are blocked rather
than allowed. The data showed that the average page load time was twice as long
when trackers are not blocked: the mean page latency with no trackers blocked and
with all trackers blocked was 19.3 seconds and 8.6 seconds, respectively. These
time savings from blocking trackers are even more drastic when only considering the
10 slowest domains in the sample. We saw that average load times were 10x faster,
and blocking trackers saved an average of 84 seconds per page load.

<p class="img-caption">Figure 4: Latencies for certain domains </p>
The term “piggybacking” describes the practice of one tracker that is placed
directly on a website giving access to other “piggybacking” trackers that are
not originally on the site. We observed this phenomenon in our data: page load
times were not the only metric significantly reduced when trackers were blocked; there
were also fewer trackers detected on the page. We saw significantly more
trackers per page when trackers were unblocked compared to blocked; in fact,
among the domains with the highest average volume of trackers, there were on
average 93 fewer trackers present per page load when tracker blocking was enabled.
Piggybacking can create a snowball effect, where trackers bring in more trackers
that can then bring in even more trackers; and as suggested above, each additional
tracker slows down a website more than previous ones. This not only has notable
performance implications, but also profound privacy concerns since these trackers
are not directly on the site, so site owners may not be aware such intrusion
is occurring.
## Future Implications
The data in our study clearly showed the pervasiveness of online tracking,
as nearly 90% of the most popular sites in the US had at least one third-party
tracker present. Our study also confirmed the strong, positive link between the
number of trackers on a page and the time it takes that page to load. Generally,
the more trackers on a site, the longer the user will have to wait for that site to load.
Quantifying this relationship depends on the model used; however, the optimal model
we found shows a compounding effect: for every extra tracker on the page,
the time it takes for the page to load increases by 2.5%.
While our current study focuses on only the most popular domains in the
United States, it would be valuable to apply this framework to other regions
to see if similar trends persist elsewhere. Additionally, future work may
include measuring additional performance implications of trackers including data
transferred. This data transferred, which occurs when trackers make requests to
other servers, bears real monetary costs to the user, particularly on a mobile
device where data plans are typically based on data used. Expanding this study
to assess data transfer on mobile could be translated to the out of pocket
expense suffered by the user, in addition to the more subjective dollar value
of the user’s wasted time waiting for pages to load.
Other future work may also include looking at the relationship between
bounce rates and page load speeds, to calculate a hypothetical tracker
value measure. Given the additional time trackers add to page loads, and
research suggesting that slower pages lead to a loss in site traffic, one tracker
should provide the same value as this lost site traffic. As bounce rates
are likely influenced by other factors besides page load speed, like
funnel page and domain category, this potential future research involves
several additional considerations.
Moreover, the tracker tax may even have more pronounced implications
in the United States following the recent repeal of net neutrality.
In a time without such net neutrality regulations, users and their browsing
speeds may be squeezed from both sides – by the ISP and the online tracking
ecosystem. We may then start to see more of a two-pronged tracker tax: the
direct monetary impact imposed by the ISP and the more subjective dollar
value to the user for longer load times, and therefore more unproductive
time imposed by trackers.
In the wake of the net neutrality repeal, now more than ever users must
consider the performance implications of browsing online without
protection from trackers. The added waiting times incurred by not
blocking trackers are not trivial, especially as the population is
spending increasingly more time online. Luckily, various tracker blocking
tools are available so users can not only protect their privacy, but also
speed up their browsing experience by avoiding the tracker tax.
## References
[^1]: [Using Passive Measurements to Demystify Online Trackers](https://www.telematica.polito.it/users/mellia/papers/metwalleyComsi.pdf)
[^2]: [WhoTracks.Me: Monitoring the online tracking landscape at scale](https://arxiv.org/abs/1804.08959)
[^3]: [Tracking The Trackers](https://pdfs.semanticscholar.org/2bfb/b6b8da453f91f5860ea936588fddef6c80e0.pdf)
[^4]: [Window.performance](https://developer.mozilla.org/en-US/docs/Web/API/Window/performance) API
[^5]: [alexa.com](https://alexa.com)
[^6]: Ghostery Study: [Tracking the Trackers](https://www.ghostery.com/wp-content/themes/ghostery/images/campaigns/tracker-study/Ghostery_Study_-_Tracking_the_Trackers.pdf)
================================================
FILE: blog/tracker_categories.md
================================================
title: Tracker Categories
subtitle: Definitions for different types of trackers
author: privacy team
type: primer
publish: True
date: 2017-07-22
tags: primer, categories
header_img: blog/blog-tracker-categories.jpg
+++
Trackers differ both in the technologies they use, and the
purpose they serve. Based on the service they provide
to the site owner, we have categorized the trackers in the following:
Advertising
: Provides advertising or advertising-related services such
as data collection, behavioral analysis or re-targeting.
Comments
: Enables comments sections for articles and product reviews
Customer Interaction
: Includes chat, email messaging, customer support, and other
interaction tools
Essential
: Includes tag managers, privacy notices, and technologies
that are critical to the functionality of a website
Pornvertising
: Delivers advertisements that generally appear on sites with adult content
Site Analytics
: Collects and analyzes data related to site usage and
performance.
Social Media
: Integrates features related to social media sites
Audio Video Player
: Enables websites to publish, distribute, and optimize
video and audio content
CDN (Content Delivery Network)
: Content delivery network that delivers resources for
different site utilities and usually for many different customers.
Misc (Miscellaneous)
: This tracker does not fit in other categories.
Hosting
: This is a service used by the content provider or site owner
Unknown
: This tracker has either not been labelled yet, or we do not have
enough information to label it.
================================================
FILE: blog/trackers-who-steal.md
================================================
title: The Trackers Who Steal
subtitle: How WhoTracks.Me caught the trail of the MageCart hackers
author: privacy team
type: article
publish: True
date: 2018-11-23
tags: tracking, hacking
header_img: blog/blog-cc-stealing.png
+++
We're all aware of the trackers siphoning off information about you as you browse the web. These trackers are mostly doing this for some business intelligence related reason - websites use these services to try to 'better understand' their customers, or to target them in order to attract their attention in a way which will benefit that website owner - be it increasing the value of products customers put into their shopping cart, or increasing the likelihood that they click an ad.
However, there is another kind of tracker which is more nefarious than these. These are hidden scripts placed by hackers on E-commerce sites which try to steal your credit-card details as you enter them. In the last year a string of attacks — dubbed 'Magecart' — have affected major sites, including [British Airways](https://www.riskiq.com/blog/labs/magecart-british-airways-breach/), [Ticketmaster](https://www.riskiq.com/blog/labs/magecart-ticketmaster-breach/), [NewEgg](https://www.riskiq.com/blog/labs/magecart-newegg/) and [VisionDirect](https://twitter.com/troyhunt/status/1064069833967337472);
gitextract_rc25zhaz/
├── .github/
│ └── workflows/
│ └── test.yml
├── .gitignore
├── .tool-versions
├── Dockerfile
├── Jenkinsfile
├── LICENSE.md
├── README.md
├── RIGHT_TO_AMEND.md
├── blog/
│ ├── adblockers_performance_study.md
│ ├── block-third-party-cookies.md
│ ├── cookie-consent.md
│ ├── cookies.md
│ ├── dexie_transaction_bug.md
│ ├── fingerprinting.md
│ ├── gdpr-what-happened.md
│ ├── generating_adblocker_filters.md
│ ├── google_domains.md
│ ├── government_websites_september.md
│ ├── how_cliqz_antitracking_protects_users.md
│ ├── how_facebook_knows_exactly_what_turns_you_on.md
│ ├── manifest_v3_privacy.md
│ ├── private_analytics.md
│ ├── static_site.md
│ ├── static_site_blog.md
│ ├── static_site_generation.md
│ ├── static_site_visualization.md
│ ├── tracker-tax.md
│ ├── tracker_categories.md
│ ├── trackers-who-steal.md
│ ├── trackers_in_your_favorite_site.md
│ ├── tracking_and_ux.md
│ ├── tracking_pixel.md
│ ├── update_apr_2018.md
│ ├── update_dec_2017.md
│ ├── update_feb_2018.md
│ ├── update_jan_2018.md
│ ├── update_jun_2018.md
│ ├── update_may_2018.md
│ ├── updating_our_tracking_prevalence_metrics.md
│ ├── what_is_a_tracker.md
│ └── where_is_the_data_from.md
├── contrib/
│ ├── generating_adblocker_filters.py
│ ├── tracker_map_notebook.ipynb
│ ├── wtm_april_update.ipynb
│ └── wtm_may_update.ipynb
├── deploy_to_s3.py
├── docs/
│ └── local-build.md
├── pyproject.toml
├── static/
│ ├── font-awesome-4.7.0/
│ │ ├── HELP-US-OUT.txt
│ │ ├── css/
│ │ │ └── font-awesome.css
│ │ ├── fonts/
│ │ │ └── FontAwesome.otf
│ │ ├── less/
│ │ │ ├── animated.less
│ │ │ ├── bordered-pulled.less
│ │ │ ├── core.less
│ │ │ ├── fixed-width.less
│ │ │ ├── font-awesome.less
│ │ │ ├── icons.less
│ │ │ ├── larger.less
│ │ │ ├── list.less
│ │ │ ├── mixins.less
│ │ │ ├── path.less
│ │ │ ├── rotated-flipped.less
│ │ │ ├── screen-reader.less
│ │ │ ├── stacked.less
│ │ │ └── variables.less
│ │ └── scss/
│ │ ├── _animated.scss
│ │ ├── _bordered-pulled.scss
│ │ ├── _core.scss
│ │ ├── _fixed-width.scss
│ │ ├── _icons.scss
│ │ ├── _larger.scss
│ │ ├── _list.scss
│ │ ├── _mixins.scss
│ │ ├── _path.scss
│ │ ├── _rotated-flipped.scss
│ │ ├── _screen-reader.scss
│ │ ├── _stacked.scss
│ │ ├── _variables.scss
│ │ └── font-awesome.scss
│ ├── fonts/
│ │ └── RationalTWSemiBold.otf
│ ├── js/
│ │ ├── bootstrap.js
│ │ ├── d3.layout.cloud.js
│ │ ├── explorer.js
│ │ ├── ghostery.js
│ │ ├── highlight.pack.js
│ │ └── search.js
│ └── scss/
│ ├── _colors.scss
│ ├── blog/
│ │ ├── card.scss
│ │ ├── github.scss
│ │ └── post.scss
│ ├── bootstrap.min.scss
│ ├── companies/
│ │ └── reach-chart.scss
│ ├── custom.scss
│ ├── datatables.colReorder.min.scss
│ ├── datatables.min.scss
│ ├── explorer/
│ │ └── table.scss
│ ├── home/
│ │ └── index.scss
│ ├── trackers/
│ │ ├── list.scss
│ │ └── profile.scss
│ └── websites/
│ ├── overview.scss
│ └── profile.scss
├── templates/
│ ├── base.html
│ ├── blog-page.html
│ ├── blog.html
│ ├── company-page.html
│ ├── components/
│ │ ├── blog-card.html
│ │ ├── breadcrumb.html
│ │ ├── category-item.html
│ │ ├── company-card.html
│ │ ├── cookies.html
│ │ ├── fingerprinting.html
│ │ ├── footer.html
│ │ ├── home/
│ │ │ └── header.html
│ │ ├── navbar.html
│ │ ├── tag_cloud.html
│ │ ├── top-5-info-box.html
│ │ ├── top-5-trackers.html
│ │ ├── tracker-list.html
│ │ ├── trackers/
│ │ │ ├── category.html
│ │ │ └── header.html
│ │ ├── tracking-methods.html
│ │ ├── unified-ui-tracker-list.html
│ │ ├── website-list.html
│ │ └── websites/
│ │ ├── header.html
│ │ └── tracker-list.html
│ ├── explorer.html
│ ├── imprint.html
│ ├── index.html
│ ├── not-found.html
│ ├── privacy-policy.html
│ ├── reach-chart-page.html
│ ├── tracker-not-found.html
│ ├── tracker-page.html
│ ├── trackers.html
│ ├── website-not-found.html
│ ├── website-page.html
│ └── websites.html
├── tests/
│ ├── __init__.py
│ ├── test_data_integrity.py
│ ├── test_db_integrity.py
│ ├── test_db_validity.py
│ ├── test_site_categories.py
│ └── test_sites_data.py
├── update_trackerdb.sh
├── update_trackers_preview.py
└── whotracksme/
├── __init__.py
├── data/
│ ├── Readme.md
│ ├── __init__.py
│ ├── assets/
│ │ ├── trackerdb.sql
│ │ └── trackers-preview.json
│ ├── db.py
│ ├── loader.py
│ └── pack.py
├── main.py
├── qa/
│ ├── __init__.py
│ ├── todo.py
│ └── utils.py
└── website/
├── __init__.py
├── api/
│ └── meta.py
├── build/
│ ├── __init__.py
│ ├── blog.py
│ ├── companies.py
│ ├── data.py
│ ├── explorer.py
│ ├── home.py
│ ├── trackers.py
│ └── websites.py
├── builder.py
├── plotting/
│ ├── .vscode/
│ │ └── settings.json
│ ├── __init__.py
│ ├── colors.py
│ ├── companies.py
│ ├── plots.py
│ ├── sankey.py
│ ├── trackers.py
│ └── utils.py
├── serve.py
├── templates.py
└── utils.py
SYMBOL INDEX (302 symbols across 38 files)
FILE: contrib/generating_adblocker_filters.py
function filter_domains (line 25) | def filter_domains(domains):
function generate_adb_filters (line 35) | def generate_adb_filters(domains):
function generate_hostname_filters (line 41) | def generate_hostname_filters(domains):
FILE: deploy_to_s3.py
function iterate_bucket (line 28) | def iterate_bucket(s3_client, bucket_name, bucket_prefix):
function get_max_age (line 38) | def get_max_age(path, filename):
function get_cache_control (line 57) | def get_cache_control(path, filename, production=False):
function get_content_type (line 64) | def get_content_type(local_path):
function upload_file_to_s3 (line 126) | def upload_file_to_s3(path, filename):
FILE: static/js/bootstrap.js
function transitionEnd (line 34) | function transitionEnd() {
function removeElement (line 126) | function removeElement() {
function Plugin (line 142) | function Plugin(option) {
function Plugin (line 251) | function Plugin(option) {
function Plugin (line 475) | function Plugin(option) {
function getTargetFromTrigger (line 695) | function getTargetFromTrigger($trigger) {
function Plugin (line 707) | function Plugin(option) {
function getParent (line 774) | function getParent($this) {
function clearMenus (line 787) | function clearMenus(e) {
function Plugin (line 880) | function Plugin(option) {
function Plugin (line 1208) | function Plugin(option, _relatedTarget) {
function complete (line 1574) | function complete() {
function Plugin (line 1750) | function Plugin(option) {
function Plugin (line 1859) | function Plugin(option) {
function ScrollSpy (line 1902) | function ScrollSpy(element, options) {
function Plugin (line 2022) | function Plugin(option) {
function next (line 2131) | function next() {
function Plugin (line 2177) | function Plugin(option) {
function Plugin (line 2334) | function Plugin(option) {
FILE: static/js/d3.layout.cloud.js
function s (line 1) | function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&re...
function step (line 57) | function step() {
function getContext (line 89) | function getContext(canvas) {
function place (line 102) | function place(board, tag, bounds) {
function cloudText (line 208) | function cloudText(d) {
function cloudFont (line 212) | function cloudFont() {
function cloudFontNormal (line 216) | function cloudFontNormal() {
function cloudFontSize (line 220) | function cloudFontSize(d) {
function cloudRotate (line 224) | function cloudRotate() {
function cloudPadding (line 228) | function cloudPadding() {
function cloudSprite (line 234) | function cloudSprite(contextAndRatio, d, data, di) {
function cloudCollide (line 322) | function cloudCollide(tag, board, sw) {
function cloudBounds (line 343) | function cloudBounds(bounds, d) {
function collideRects (line 352) | function collideRects(a, b) {
function archimedeanSpiral (line 356) | function archimedeanSpiral(size) {
function rectangularSpiral (line 363) | function rectangularSpiral(size) {
function zeroArray (line 382) | function zeroArray(n) {
function cloudCanvas (line 389) | function cloudCanvas() {
function functor (line 393) | function functor(d) {
function dispatch (line 412) | function dispatch() {
function Dispatch (line 420) | function Dispatch(_) {
function parseTypenames (line 424) | function parseTypenames(typenames, types) {
function get (line 474) | function get(type, name) {
function set (line 482) | function set(type, name, callback) {
FILE: static/js/explorer.js
class ByteView (line 1) | class ByteView {
method constructor (line 2) | constructor(buffer) {
method getFloat32 (line 8) | getFloat32() {
method getFloat64 (line 14) | getFloat64() {
method getBytes (line 20) | getBytes(n) {
method getByte (line 26) | getByte() {
method getUint8 (line 30) | getUint8() {
method getInt8 (line 36) | getInt8() {
method getUint16 (line 42) | getUint16() {
method getInt16 (line 48) | getInt16() {
method getUint32 (line 54) | getUint32() {
method getInt32 (line 60) | getInt32() {
function nextPow2 (line 68) | function nextPow2(_v) {
constant FORMAT_CHARS (line 80) | const FORMAT_CHARS = {
constant SEARCHABLE_FIELDS (line 99) | const SEARCHABLE_FIELDS = new Set([
function sizeOfFormat (line 109) | function sizeOfFormat(format) {
class Table (line 123) | class Table {
method constructor (line 124) | constructor(name) {
method update (line 133) | update(headers, data) {
class LazyCSVReader (line 184) | class LazyCSVReader {
method constructor (line 185) | constructor(url) {
method decodeMetadataSize (line 205) | decodeMetadataSize(view) {
method decodeHeaders (line 214) | decodeHeaders(view) {
method decodeFormatString (line 228) | decodeFormatString(view) {
method decodeSymbols (line 241) | decodeSymbols(view) {
method decodeNumberOfRows (line 263) | decodeNumberOfRows(view) {
method decodeRows (line 269) | decodeRows(view) {
method fetchNext (line 340) | async fetchNext() {
FILE: static/js/highlight.pack.js
function n (line 2) | function n(e){return e.replace(/&/g,"&").replace(/</g,"<").replac...
function t (line 2) | function t(e){return e.nodeName.toLowerCase()}
function r (line 2) | function r(e,n){var t=e&&e.exec(n);return t&&0===t.index}
function a (line 2) | function a(e){return k.test(e)}
function i (line 2) | function i(e){var n,t,r,i,o=e.className+" ";if(o+=e.parentNode?e.parentN...
function o (line 2) | function o(e){var n,t={},r=Array.prototype.slice.call(arguments,1);for(n...
function u (line 2) | function u(e){var n=[];return function r(e,a){for(var i=e.firstChild;i;i...
function c (line 2) | function c(e,r,a){function i(){return e.length&&r.length?e[0].offset!==r...
function l (line 2) | function l(e){return e.v&&!e.cached_variants&&(e.cached_variants=e.v.map...
function s (line 2) | function s(e){function n(e){return e&&e.source||e}function t(t,r){return...
function f (line 2) | function f(e,t,a,i){function o(e,n){var t,a;for(t=0,a=n.c.length;a>t;t++...
function g (line 2) | function g(e,t){t=t||I.languages||x(y);var r={r:0,value:n(e)},a=r;return...
function p (line 2) | function p(e){return I.tabReplace||I.useBR?e.replace(M,function(e,n){ret...
function h (line 2) | function h(e,n,t){var r=n?L[n]:t,a=[e.trim()];return e.match(/\bhljs\b/)...
function d (line 2) | function d(e){var n,t,r,o,l,s=i(e);a(s)||(I.useBR?(n=document.createElem...
function b (line 2) | function b(e){I=o(I,e)}
function v (line 2) | function v(){if(!v.called){v.called=!0;var e=document.querySelectorAll("...
function m (line 2) | function m(){addEventListener("DOMContentLoaded",v,!1),addEventListener(...
function N (line 2) | function N(n,t){var r=y[n]=t(e);r.aliases&&r.aliases.forEach(function(e)...
function R (line 2) | function R(){return x(y)}
function w (line 2) | function w(e){return e=(e||"").toLowerCase(),y[e]||y[L[e]]}
FILE: static/js/search.js
function compare (line 21) | function compare(a, b) {
function editDistance (line 27) | function editDistance(s1, s2) {
function search (line 55) | function search(query, idx) {
function inArray (line 86) | function inArray(obj, arr){
function createList (line 101) | function createList(parent, arr, name) {
function autocomplete (line 126) | function autocomplete(inp) {
FILE: tests/test_data_integrity.py
class TestDataIntegrity (line 7) | class TestDataIntegrity(unittest.TestCase):
method setUp (line 9) | def setUp(self):
method test_all_trackers_have_db_entry (line 13) | def test_all_trackers_have_db_entry(self):
method test_all_companies_have_db_entry (line 25) | def test_all_companies_have_db_entry(self):
FILE: tests/test_db_integrity.py
class TestDbIntegrity (line 6) | class TestDbIntegrity(unittest.TestCase):
method setUp (line 8) | def setUp(self):
method test_all_trackers_have_domains (line 11) | def test_all_trackers_have_domains(self):
FILE: tests/test_db_validity.py
class ValidateTrackerDatabase (line 7) | class ValidateTrackerDatabase(unittest.TestCase):
method setUpClass (line 12) | def setUpClass(cls):
method test_db_has_trackers (line 15) | def test_db_has_trackers(self):
method test_db_has_companies (line 21) | def test_db_has_companies(self):
method test_db_has_tracker_domains (line 27) | def test_db_has_tracker_domains(self):
method test_no_trackers_without_domain (line 33) | def test_no_trackers_without_domain(self):
method test_tracker_ids_ascii (line 44) | def test_tracker_ids_ascii(self):
method test_company_ids_ascii (line 54) | def test_company_ids_ascii(self):
method test_domains_ascii (line 64) | def test_domains_ascii(self):
FILE: tests/test_site_categories.py
function iterate_site_categories (line 18) | def iterate_site_categories():
class TestSitesData (line 27) | class TestSitesData(unittest.TestCase):
method test_all_categories_are_valid (line 29) | def test_all_categories_are_valid(self):
method test_no_repeated_sites (line 36) | def test_no_repeated_sites(self):
FILE: tests/test_sites_data.py
class TestSitesData (line 5) | class TestSitesData(unittest.TestCase):
method test_all_sites_have_category (line 7) | def test_all_sites_have_category(self):
FILE: update_trackers_preview.py
function download_privacy_score (line 26) | def download_privacy_score(bucket_name, bucket_prefix):
function list_known_categories (line 49) | def list_known_categories(privacy_score):
function generate_trackers_preview (line 55) | def generate_trackers_preview(privacy_score):
function write_json (line 106) | def write_json(data, path):
FILE: whotracksme/data/assets/trackerdb.sql
type categories (line 4) | CREATE TABLE categories(
type companies (line 19) | CREATE TABLE companies (
type tracker_domains (line 2597) | CREATE TABLE tracker_domains (
type trackers (line 7634) | CREATE TABLE trackers (
type tracker_domain_pair (line 11139) | CREATE UNIQUE INDEX tracker_domain_pair ON tracker_domains (tracker, dom...
FILE: whotracksme/data/db.py
function asset_string (line 10) | def asset_string(name):
function load_tracker_db (line 21) | def load_tracker_db(loc=':memory:'):
function import_trackers (line 27) | def import_trackers(connection):
function create_tracker_map (line 31) | def create_tracker_map(db, with_iab_vendors=False):
function get_iab_vendorlist (line 111) | def get_iab_vendorlist():
function get_column_type (line 118) | def get_column_type(col):
class WhoTracksMeDB (line 173) | class WhoTracksMeDB:
method __init__ (line 227) | def __init__(self):
method _get_existing_tables (line 256) | def _get_existing_tables(self):
method get_file_checksum (line 259) | def get_file_checksum(self, filename):
method update_file_checksum (line 267) | def update_file_checksum(self, filename, checksum):
method load_data (line 271) | def load_data(self, name, region, month):
FILE: whotracksme/data/loader.py
function asset_exists (line 9) | def asset_exists(name):
function asset_stream (line 19) | def asset_stream(name):
function list_available_months (line 32) | def list_available_months(region="global"):
class DataSource (line 42) | class DataSource:
method __init__ (line 43) | def __init__(self, region="global", populate=True):
method __enter__ (line 76) | def __enter__(self):
method __exit__ (line 79) | def __exit__(self, *args):
method normalize_url (line 83) | def normalize_url(url_substring):
method url_for (line 86) | def url_for(self, entity, id, path_to_root='.'):
method get_company_name (line 96) | def get_company_name(self, id):
function parse_date (line 102) | def parse_date(date_string):
function is_valid_date (line 105) | def is_valid_date(date_string):
class SQLDataLoader (line 116) | class SQLDataLoader:
method __init__ (line 118) | def __init__(self, data_months, name, db, region='global', id_column=N...
method get_data_query (line 131) | def get_data_query(self):
method sort_by (line 142) | def sort_by(self, metric="reach", descending=True):
method get_snapshot (line 151) | def get_snapshot(self, month=None):
method iter (line 158) | def iter(self):
method get_datapoint (line 162) | def get_datapoint(self, id, month=None):
method dump (line 171) | def dump(self):
class Trackers (line 178) | class Trackers(SQLDataLoader):
method __init__ (line 180) | def __init__(self, data_months, db, region='global', populate=True):
method summary_stats (line 203) | def summary_stats(self):
method get_tracker (line 230) | def get_tracker(self, id):
method get_name (line 283) | def get_name(self, id):
method get_rank (line 286) | def get_rank(self, id):
method get_rank_label (line 289) | def get_rank_label(self, id):
method get_tracking_methods (line 309) | def get_tracking_methods(self, id):
method get_reach (line 328) | def get_reach(self, id):
method get_presence_by_site_category (line 350) | def get_presence_by_site_category(self, id):
method similar_trackers (line 369) | def similar_trackers(self, id, n=4):
method get_domains (line 401) | def get_domains(self, id):
method iter_sites (line 410) | def iter_sites(self, id):
class Sites (line 417) | class Sites(SQLDataLoader):
method __init__ (line 418) | def __init__(self, data_months, db, region='global', populate=True):
method summary_stats (line 430) | def summary_stats(self):
method get_name (line 458) | def get_name(self, id):
method get_tracker_list (line 462) | def get_tracker_list(self, site, month=None):
method trackers_on_site (line 482) | def trackers_on_site(self, site, month=None):
method mean_trackers_timeseries (line 504) | def mean_trackers_timeseries(self, id):
method get_site_tracker_categories (line 514) | def get_site_tracker_categories(self, site, month=None):
class SitesTrackers (line 530) | class SitesTrackers(SQLDataLoader):
method __init__ (line 532) | def __init__(self, data_months, db, region='global', populate=True):
class Companies (line 537) | class Companies(SQLDataLoader):
method __init__ (line 539) | def __init__(self, data_months, db, region='global', populate=True):
FILE: whotracksme/data/pack.py
function get_minimal_int_type (line 32) | def get_minimal_int_type(value):
function get_minimal_float_type (line 52) | def get_minimal_float_type(value):
function guess_type (line 56) | def guess_type(value):
class ByteView (line 82) | class ByteView:
method __init__ (line 83) | def __init__(self, buffer=b""):
method get_string (line 87) | def get_string(self, length):
method get (line 90) | def get(self, f):
method set_string (line 96) | def set_string(self, string):
method set (line 101) | def set(self, f, *values):
function unpack_rows (line 106) | def unpack_rows(buffer):
function pack_rows (line 144) | def pack_rows(rows, fields):
FILE: whotracksme/main.py
class objectview (line 32) | class objectview:
method __init__ (line 42) | def __init__(self, d):
function website (line 46) | def website(args):
function main (line 54) | def main():
FILE: whotracksme/qa/todo.py
function create_task_files (line 5) | def create_task_files(needqa_folder, **kwargs):
function upgrade_to_https (line 20) | def upgrade_to_https(tracker_db):
FILE: whotracksme/qa/utils.py
function fetch (line 7) | async def fetch(session, url):
function fetch_all (line 24) | async def fetch_all(session, urls, loop):
function retrieve_status (line 31) | def retrieve_status(urls):
function write_to_file (line 46) | def write_to_file(filepath, json_output):
FILE: whotracksme/website/api/meta.py
function get_trackerdb_version (line 6) | def get_trackerdb_version():
function get_api_meta_data (line 19) | def get_api_meta_data(trackerdb_version,
function update_api_meta_data (line 35) | def update_api_meta_data():
FILE: whotracksme/website/build/blog.py
function parse_blogpost (line 23) | def parse_blogpost(filepath):
function load_blog_posts (line 67) | def load_blog_posts():
function build_blogpost_list (line 77) | def build_blogpost_list(data, blog_posts):
function build_blogpost_pages (line 88) | def build_blogpost_pages(blog_posts):
function get_human_date (line 107) | def get_human_date(date):
function build_rss_feeds (line 118) | def build_rss_feeds(blog_posts):
FILE: whotracksme/website/build/companies.py
function get_company (line 15) | def get_company(companies, company_id):
function company_data (line 19) | def company_data(companies, company_id):
function get_company_name (line 27) | def get_company_name(company_dict):
function website_doughnout (line 34) | def website_doughnout(site, data):
function tracker_map_data (line 42) | def tracker_map_data(site_id, data):
function company_reach (line 88) | def company_reach(companies, n=10):
function company_page (line 93) | def company_page(template, company_data, data):
function build_company_pages (line 111) | def build_company_pages(data):
function build_company_reach_chart_page (line 120) | def build_company_reach_chart_page(data):
FILE: whotracksme/website/build/data.py
function build_tracker_db (line 7) | def build_tracker_db():
function build_tracker_json (line 14) | def build_tracker_json(tracker_id, data):
function build_api (line 36) | def build_api(data):
function build_tracker_api_batch (line 49) | def build_tracker_api_batch(batch):
function build_website_api_batch (line 57) | def build_website_api_batch(batch):
FILE: whotracksme/website/build/explorer.py
function build_packed_data (line 69) | def build_packed_data(data):
function table_to_csv (line 88) | def table_to_csv(table, file):
function build_explorer (line 97) | def build_explorer():
FILE: whotracksme/website/build/home.py
function build_home (line 9) | def build_home(data):
function build_privacy_policy (line 49) | def build_privacy_policy(data):
function build_imprint (line 58) | def build_imprint(data):
FILE: whotracksme/website/build/trackers.py
function recent_tracker_reach (line 14) | def recent_tracker_reach(reach):
function tag_cloud_data (line 29) | def tag_cloud_data(tracker_id, data):
function build_trackers_list (line 75) | def build_trackers_list(data):
function tracker_page_data (line 97) | def tracker_page_data(tracker_id, tracker, data):
function tracker_page (line 121) | def tracker_page(template, data):
function build_tracker_pages (line 139) | def build_tracker_pages(data):
function build_tracker_page_batch (line 147) | def build_tracker_page_batch(batch):
FILE: whotracksme/website/build/websites.py
function build_website_list (line 21) | def build_website_list(data):
function website_page (line 53) | def website_page(template, site, rank, data):
function build_website_pages (line 111) | def build_website_pages(data):
function build_website_pages_batch (line 120) | def build_website_pages_batch(batch):
FILE: whotracksme/website/builder.py
class BlockingSingleThreadExecutor (line 62) | class BlockingSingleThreadExecutor(concurrent.futures._base.Executor):
method submit (line 70) | def submit(self, fn, *args, **kwargs):
method shutdown (line 75) | def shutdown(self, wait=True, *, cancel_futures=False):
class Builder (line 79) | class Builder:
method __init__ (line 80) | def __init__(self):
method build (line 84) | def build(self):
method on_explorer_folder_change (line 87) | def on_explorer_folder_change(self):
method on_data_folder_change (line 90) | def on_data_folder_change(self):
method on_templates_folder_change (line 93) | def on_templates_folder_change(self):
method on_static_folder_change (line 96) | def on_static_folder_change(self):
method on_blog_folder_change (line 99) | def on_blog_folder_change(self):
method _create_executor (line 102) | def _create_executor(self):
method feed_event (line 109) | def feed_event(self, event):
FILE: whotracksme/website/plotting/colors.py
function palette (line 66) | def palette(color1, color2, number_of_shades):
function random_color (line 72) | def random_color():
FILE: whotracksme/website/plotting/companies.py
function overview_bars (line 9) | def overview_bars(companies, highlight=2, height=None):
function overview_reach (line 45) | def overview_reach(companies):
FILE: whotracksme/website/plotting/plots.py
function profile_doughnut (line 15) | def profile_doughnut(values, labels, name, color_palette=False):
function doughnut_chart (line 58) | def doughnut_chart(values, labels, name, color_palette=False):
function hbar (line 115) | def hbar(label, color=wtm_colors["blue"], **kwargs):
function scatter (line 136) | def scatter(x, y, name, color, fill=True, line_style="solid"):
function line (line 155) | def line(x, y, color, line_style="solid", mode='lines'):
function treemap (line 175) | def treemap():
FILE: whotracksme/website/plotting/sankey.py
function sankey_plot (line 4) | def sankey_plot(sndata):
FILE: whotracksme/website/plotting/trackers.py
function tracker_cfh (line 10) | def tracker_cfh(https, fingerprinting, cookies):
function tracker_reach_ts (line 57) | def tracker_reach_ts(ts1, ts2, t):
function ts_trend (line 95) | def ts_trend(ts, t, percent=True):
function site_tree_map (line 159) | def site_tree_map(sites):
function tracker_map (line 174) | def tracker_map(app, site_values, rectangles):
FILE: whotracksme/website/plotting/utils.py
function set_category_colors (line 17) | def set_category_colors(tracker_labels):
function set_margins (line 21) | def set_margins(l=60, r=60, b=40, t=0, pad=5):
function set_line_style (line 31) | def set_line_style(color, width=3, line_style="solid"):
function annotation (line 52) | def annotation(text, x, y, background_color, shift_x=-1, text_size=12, c...
function overview_label (line 78) | def overview_label(text, x, y, text_size=12, shift_x=-1, color=wtm_color...
function div_output (line 103) | def div_output(fig, display_mode_bar=False, height="100%"):
FILE: whotracksme/website/serve.py
function watch (line 14) | def watch(builder):
function serve_site (line 60) | def serve_site(port):
function serve (line 71) | def serve(builder):
FILE: whotracksme/website/templates.py
function site_to_json (line 21) | def site_to_json(data_source, blog_posts):
function copy_custom_error_pages (line 106) | def copy_custom_error_pages(data):
function generate_sitemap (line 118) | def generate_sitemap(blog_posts):
function get_template (line 131) | def get_template(data_source, name, render_markdown=False, path_to_root=...
function render_template (line 166) | def render_template(template, path_to_root='.', **context):
function create_site_structure (line 187) | def create_site_structure(static_path):
FILE: whotracksme/website/utils.py
function write_json (line 7) | def write_json(path, **data):
function without_keys (line 18) | def without_keys(d, keys):
function print_progress (line 22) | def print_progress(text, default_space=40):
Copy disabled (too large)
Download .json
Condensed preview — 179 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (19,677K chars).
[
{
"path": ".github/workflows/test.yml",
"chars": 1057,
"preview": "name: Tests\n\non:\n push:\n branches: [master]\n pull_request:\n branches: [master]\n\njobs:\n test:\n runs-on: ubunt"
},
{
"path": ".gitignore",
"chars": 139,
"preview": "*.pyc\n.cache/\n.sass-cache/\n__pycache__/\n_site/\ndist/\nwhotracksme.egg-info/\n.DS_Store\nvenv/\nwhotracksme/data/assets/**/*."
},
{
"path": ".tool-versions",
"chars": 14,
"preview": "python 3.11.6\n"
},
{
"path": "Dockerfile",
"chars": 806,
"preview": "# Set base image to build upon\nFROM python:3.11-slim\n\n# Set arg and env\nARG VERSION\nARG UID=1000\nARG GID=1000\nARG USER=j"
},
{
"path": "Jenkinsfile",
"chars": 2239,
"preview": "\ndef testReport = 'test-report.xml'\ndef stagingBucket = 'internal.clyqz.com'\ndef stagingPrefix = '/docs/whotracksme'\ndef"
},
{
"path": "LICENSE.md",
"chars": 1083,
"preview": "MIT License\n\nCopyright (c) 2017 - to present Ghostery GmbH\n\nPermission is hereby granted, free of charge, to any person "
},
{
"path": "README.md",
"chars": 4175,
"preview": " \n\n<p align=\"center\">\n <img src=\"https://raw.githubusercontent.com/ghostery/whotracks.me/master/static/img/who-tra"
},
{
"path": "RIGHT_TO_AMEND.md",
"chars": 1183,
"preview": "# Right to Amend\nA Guideline for 3rd parties wanting to suggest corrections to their data\n\n\nwhotracks.me has already gro"
},
{
"path": "blog/adblockers_performance_study.md",
"chars": 20474,
"preview": "title: Adblockers Performance Study\nsubtitle:\ndescription: A detailed comparison of popular adblockers with tips to help"
},
{
"path": "blog/block-third-party-cookies.md",
"chars": 18954,
"preview": "title: Third-party cookies - the guests who won't leave\nsubtitle: How the web ecosystem is preventing us from reverting "
},
{
"path": "blog/cookie-consent.md",
"chars": 5405,
"preview": "title: Improving Cookie Consent\nsubtitle: Cliqz' new feature to make consent fairer\nauthor: privacy team\ntype: article\np"
},
{
"path": "blog/cookies.md",
"chars": 2880,
"preview": "title: Cookies\nsubtitle: A small piece of data sent from a website, meant to 'help', used to track.\nauthor: privacy team"
},
{
"path": "blog/dexie_transaction_bug.md",
"chars": 7461,
"preview": "title: A quantum bug in Firefox Quantum\nsubtitle: DevTools - how we tracked down an observant-dependent bug.\nauthor: pri"
},
{
"path": "blog/fingerprinting.md",
"chars": 2103,
"preview": "title: Fingerprinting\nsubtitle: Let me tell you what's unique about your device. \nauthor: privacy team\ntype: primer\npubl"
},
{
"path": "blog/gdpr-what-happened.md",
"chars": 13259,
"preview": "title: GDPR - What happened?\nsubtitle: The tracking landscape post GDPR, adverse effects on competition and a market for"
},
{
"path": "blog/generating_adblocker_filters.md",
"chars": 10147,
"preview": "title: Generating Ad-Blocker filters from whotracks.me data\nsubtitle: Let's never miss a new tracker again.\nauthor: remu"
},
{
"path": "blog/google_domains.md",
"chars": 3487,
"preview": "title: The end of google.{your country}?\nsubtitle: Google's move to keep their cookies.\nauthor: privacy team\ntype: artic"
},
{
"path": "blog/government_websites_september.md",
"chars": 6507,
"preview": "title: Government websites\nsubtitle: If you are not the product, you're the taxpayer\nauthor: privacy team\ntype: article\n"
},
{
"path": "blog/how_cliqz_antitracking_protects_users.md",
"chars": 12407,
"preview": "title: How Cliqz anti-tracking protects users\nsubtitle: Using an algorithmic, data-driven approach to remove unique iden"
},
{
"path": "blog/how_facebook_knows_exactly_what_turns_you_on.md",
"chars": 16654,
"preview": "title: How facebook knows exactly what turns you on\nsubtitle: A technical analysis of the methods used to track users as"
},
{
"path": "blog/manifest_v3_privacy.md",
"chars": 16270,
"preview": "title: Chrome's Manifest V3 - Improving Privacy?\nsubtitle: How Chrome's changes will reduce user privacy\nauthor: privacy"
},
{
"path": "blog/private_analytics.md",
"chars": 7815,
"preview": "title: Tracking visits without tracking people\nsubtitle: A privacy-by-design approach.\nauthor: privacy team\ntype: articl"
},
{
"path": "blog/static_site.md",
"chars": 3928,
"preview": "title: Building whotracks.me\nsubtitle: Adding search, data, plots and blog to 1000+ pages of tracker profiles and top do"
},
{
"path": "blog/static_site_blog.md",
"chars": 2869,
"preview": "title: Building whotracks.me - Blog (part 3)\nsubtitle: Adding search, data, plots and blog to 1000+ pages of tracker pro"
},
{
"path": "blog/static_site_generation.md",
"chars": 6767,
"preview": "title: Building whotracks.me - Generating a static site (part 1)\nsubtitle: Adding search, data, plots and blog to 1000+ "
},
{
"path": "blog/static_site_visualization.md",
"chars": 12260,
"preview": "title: Building whotracks.me - Visualization (part 2)\nsubtitle: Adding search, data, plots and blog to 1000+ pages of tr"
},
{
"path": "blog/tracker-tax.md",
"chars": 12885,
"preview": "title: Tracker Tax\nsubtitle: The impact of third-party trackers on website speed in the United States.\nauthor: privacy t"
},
{
"path": "blog/tracker_categories.md",
"chars": 1640,
"preview": "title: Tracker Categories\nsubtitle: Definitions for different types of trackers \nauthor: privacy team\ntype: primer\npubli"
},
{
"path": "blog/trackers-who-steal.md",
"chars": 35250,
"preview": "title: The Trackers Who Steal\nsubtitle: How WhoTracks.Me caught the trail of the MageCart hackers\nauthor: privacy team\nt"
},
{
"path": "blog/trackers_in_your_favorite_site.md",
"chars": 9116,
"preview": "title: Making sense of the trackers on Reddit\nsubtitle: Using whotracks.me data and sankey diagrams to dissect trackers\n"
},
{
"path": "blog/tracking_and_ux.md",
"chars": 10577,
"preview": "title: The Impact of Tracking on User Experience\nsubtitle: Beyond privacy - a survey of hidden and visible effects of tr"
},
{
"path": "blog/tracking_pixel.md",
"chars": 1435,
"preview": "title: Tracking Pixel\nsubtitle: So, ... did you read my email?\nauthor: privacy team\ntype: primer\npublish: True\ndate: 201"
},
{
"path": "blog/update_apr_2018.md",
"chars": 7448,
"preview": "title: April Update - Preparing for Internationalisation\nsubtitle: A new data format to ease access to tracker data.\naut"
},
{
"path": "blog/update_dec_2017.md",
"chars": 4326,
"preview": "title: WhoTracks.me December Update\nsubtitle: New data and trackers in our monthly update.\nauthor: privacy team\ntype: ar"
},
{
"path": "blog/update_feb_2018.md",
"chars": 8504,
"preview": "title: February Update - The Tracking Shell Game\nsubtitle: How mergers and acquisitions are hiding who actually is track"
},
{
"path": "blog/update_jan_2018.md",
"chars": 3726,
"preview": "title: WhoTracks.me January Update\nsubtitle: New data and trackers in our monthly update.\nauthor: privacy team\ntype: art"
},
{
"path": "blog/update_jun_2018.md",
"chars": 11291,
"preview": "title: June Update - Do you consent?\nsubtitle: The rushed, rough and rogue UX of services getting you to opt-in.\nauthor:"
},
{
"path": "blog/update_may_2018.md",
"chars": 6034,
"preview": "title: May Update - Countdown to GDPR\nsubtitle: Facebook's \"Tough\" month and Google's recommendation\nauthor: privacy tea"
},
{
"path": "blog/updating_our_tracking_prevalence_metrics.md",
"chars": 6179,
"preview": "title: Updating our tracking prevalence metrics\nsubtitle: Metrics that make more sense.\nauthor: privacy team\ntype: artic"
},
{
"path": "blog/what_is_a_tracker.md",
"chars": 3967,
"preview": "title: What are trackers?\nsubtitle:\ndescription: What is tracker profiling? Find out what trackers are profiling you and"
},
{
"path": "blog/where_is_the_data_from.md",
"chars": 2930,
"preview": "title: Where does the data come from?\nsubtitle: An explanation of the origin of the data in this site, and why its colle"
},
{
"path": "contrib/generating_adblocker_filters.py",
"chars": 1839,
"preview": "\nfrom collections import defaultdict, Counter\nfrom whotracksme.data import load_tracker_db, load_apps\n\n# Categories to t"
},
{
"path": "contrib/tracker_map_notebook.ipynb",
"chars": 4755890,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Making sense of the trackers on y"
},
{
"path": "contrib/wtm_april_update.ipynb",
"chars": 5556919,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Whotracks.me April Update\\n\",\n "
},
{
"path": "contrib/wtm_may_update.ipynb",
"chars": 5099807,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"# Whotracks.me May Update\\n\",\n \""
},
{
"path": "deploy_to_s3.py",
"chars": 9388,
"preview": "\"\"\"\nModule to deploy WhoTracksMe site to an s3 bucket.\n\nUsage:\n deploy_to_s3 <bucket_name> [<prefix>] [--production] "
},
{
"path": "docs/local-build.md",
"chars": 1127,
"preview": "# Local development\n\n## Building the website (the static HTML based version) and the internal API\n\nThe code to build the"
},
{
"path": "pyproject.toml",
"chars": 1073,
"preview": "[project]\nname = \"whotracks\"\nversion = \"2026.4.1\"\ndescription = \"Learn about tracking technologies, market structure and"
},
{
"path": "static/font-awesome-4.7.0/HELP-US-OUT.txt",
"chars": 323,
"preview": "I hope you love Font Awesome. If you've found it useful, please do me a favor and check out my latest project,\nFort Awes"
},
{
"path": "static/font-awesome-4.7.0/css/font-awesome.css",
"chars": 37414,
"preview": "/*!\n * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "static/font-awesome-4.7.0/less/animated.less",
"chars": 713,
"preview": "// Animated Icons\n// --------------------------\n\n.@{fa-css-prefix}-spin {\n -webkit-animation: fa-spin 2s infinite linea"
},
{
"path": "static/font-awesome-4.7.0/less/bordered-pulled.less",
"chars": 585,
"preview": "// Bordered & Pulled\n// -------------------------\n\n.@{fa-css-prefix}-border {\n padding: .2em .25em .15em;\n border: sol"
},
{
"path": "static/font-awesome-4.7.0/less/core.less",
"chars": 452,
"preview": "// Base Class Definition\n// -------------------------\n\n.@{fa-css-prefix} {\n display: inline-block;\n font: normal norma"
},
{
"path": "static/font-awesome-4.7.0/less/fixed-width.less",
"chars": 119,
"preview": "// Fixed Width Icons\n// -------------------------\n.@{fa-css-prefix}-fw {\n width: (18em / 14);\n text-align: center;\n}\n"
},
{
"path": "static/font-awesome-4.7.0/less/font-awesome.less",
"chars": 495,
"preview": "/*!\n * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "static/font-awesome-4.7.0/less/icons.less",
"chars": 49712,
"preview": "/* Font Awesome uses the Unicode Private Use Area (PUA) to ensure screen\n readers do not read off random characters th"
},
{
"path": "static/font-awesome-4.7.0/less/larger.less",
"chars": 370,
"preview": "// Icon Sizes\n// -------------------------\n\n/* makes the font 33% larger relative to the icon container */\n.@{fa-css-pre"
},
{
"path": "static/font-awesome-4.7.0/less/list.less",
"chars": 377,
"preview": "// List Icons\n// -------------------------\n\n.@{fa-css-prefix}-ul {\n padding-left: 0;\n margin-left: @fa-li-width;\n lis"
},
{
"path": "static/font-awesome-4.7.0/less/mixins.less",
"chars": 1603,
"preview": "// Mixins\n// --------------------------\n\n.fa-icon() {\n display: inline-block;\n font: normal normal normal @fa-font-siz"
},
{
"path": "static/font-awesome-4.7.0/less/path.less",
"chars": 771,
"preview": "/* FONT PATH\n * -------------------------- */\n\n@font-face {\n font-family: 'FontAwesome';\n src: url('@{fa-font-path}/fo"
},
{
"path": "static/font-awesome-4.7.0/less/rotated-flipped.less",
"chars": 622,
"preview": "// Rotated & Flipped Icons\n// -------------------------\n\n.@{fa-css-prefix}-rotate-90 { .fa-icon-rotate(90deg, 1); }\n.@"
},
{
"path": "static/font-awesome-4.7.0/less/screen-reader.less",
"chars": 118,
"preview": "// Screen Readers\n// -------------------------\n\n.sr-only { .sr-only(); }\n.sr-only-focusable { .sr-only-focusable(); }\n"
},
{
"path": "static/font-awesome-4.7.0/less/stacked.less",
"chars": 476,
"preview": "// Stacked Icons\n// -------------------------\n\n.@{fa-css-prefix}-stack {\n position: relative;\n display: inline-block;\n"
},
{
"path": "static/font-awesome-4.7.0/less/variables.less",
"chars": 22563,
"preview": "// Variables\n// --------------------------\n\n@fa-font-path: \"../fonts\";\n@fa-font-size-base: 14px;\n@fa-line-heigh"
},
{
"path": "static/font-awesome-4.7.0/scss/_animated.scss",
"chars": 715,
"preview": "// Spinning Icons\n// --------------------------\n\n.#{$fa-css-prefix}-spin {\n -webkit-animation: fa-spin 2s infinite line"
},
{
"path": "static/font-awesome-4.7.0/scss/_bordered-pulled.scss",
"chars": 592,
"preview": "// Bordered & Pulled\n// -------------------------\n\n.#{$fa-css-prefix}-border {\n padding: .2em .25em .15em;\n border: so"
},
{
"path": "static/font-awesome-4.7.0/scss/_core.scss",
"chars": 459,
"preview": "// Base Class Definition\n// -------------------------\n\n.#{$fa-css-prefix} {\n display: inline-block;\n font: normal norm"
},
{
"path": "static/font-awesome-4.7.0/scss/_fixed-width.scss",
"chars": 120,
"preview": "// Fixed Width Icons\n// -------------------------\n.#{$fa-css-prefix}-fw {\n width: (18em / 14);\n text-align: center;\n}\n"
},
{
"path": "static/font-awesome-4.7.0/scss/_icons.scss",
"chars": 50498,
"preview": "/* Font Awesome uses the Unicode Private Use Area (PUA) to ensure screen\n readers do not read off random characters th"
},
{
"path": "static/font-awesome-4.7.0/scss/_larger.scss",
"chars": 375,
"preview": "// Icon Sizes\n// -------------------------\n\n/* makes the font 33% larger relative to the icon container */\n.#{$fa-css-pr"
},
{
"path": "static/font-awesome-4.7.0/scss/_list.scss",
"chars": 378,
"preview": "// List Icons\n// -------------------------\n\n.#{$fa-css-prefix}-ul {\n padding-left: 0;\n margin-left: $fa-li-width;\n li"
},
{
"path": "static/font-awesome-4.7.0/scss/_mixins.scss",
"chars": 1637,
"preview": "// Mixins\n// --------------------------\n\n@mixin fa-icon() {\n display: inline-block;\n font: normal normal normal #{$fa-"
},
{
"path": "static/font-awesome-4.7.0/scss/_path.scss",
"chars": 783,
"preview": "/* FONT PATH\n * -------------------------- */\n\n@font-face {\n font-family: 'FontAwesome';\n src: url('#{$fa-font-path}/f"
},
{
"path": "static/font-awesome-4.7.0/scss/_rotated-flipped.scss",
"chars": 672,
"preview": "// Rotated & Flipped Icons\n// -------------------------\n\n.#{$fa-css-prefix}-rotate-90 { @include fa-icon-rotate(90deg, "
},
{
"path": "static/font-awesome-4.7.0/scss/_screen-reader.scss",
"chars": 134,
"preview": "// Screen Readers\n// -------------------------\n\n.sr-only { @include sr-only(); }\n.sr-only-focusable { @include sr-only-f"
},
{
"path": "static/font-awesome-4.7.0/scss/_stacked.scss",
"chars": 482,
"preview": "// Stacked Icons\n// -------------------------\n\n.#{$fa-css-prefix}-stack {\n position: relative;\n display: inline-block;"
},
{
"path": "static/font-awesome-4.7.0/scss/_variables.scss",
"chars": 22644,
"preview": "// Variables\n// --------------------------\n\n$fa-font-path: \"../fonts\" !default;\n$fa-font-size-base: 14px !defau"
},
{
"path": "static/font-awesome-4.7.0/scss/font-awesome.scss",
"chars": 430,
"preview": "/*!\n * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome\n * License - http://fontawesome.io/lice"
},
{
"path": "static/js/bootstrap.js",
"chars": 69707,
"preview": "/*!\n * Bootstrap v3.3.7 (http://getbootstrap.com)\n * Copyright 2011-2016 Twitter, Inc.\n * Licensed under the MIT license"
},
{
"path": "static/js/d3.layout.cloud.js",
"chars": 14613,
"preview": "(function(f){if(typeof exports===\"object\"&&typeof module!==\"undefined\"){module.exports=f()}else if(typeof define===\"func"
},
{
"path": "static/js/explorer.js",
"chars": 9656,
"preview": "class ByteView {\n constructor(buffer) {\n this.buffer = buffer;\n this.view = new DataView(buffer.buffer);\n this"
},
{
"path": "static/js/ghostery.js",
"chars": 298,
"preview": "const navbar = document.querySelector('.navbar-ghostery');\n\nif (navbar) {\n if (window.location.href.indexOf('utm_campai"
},
{
"path": "static/js/highlight.pack.js",
"chars": 46065,
"preview": "/*! highlight.js v9.12.0 | BSD3 License | git.io/hljslicense */\n!function(e){var n=\"object\"==typeof window&&window||\"obj"
},
{
"path": "static/js/search.js",
"chars": 7777,
"preview": "// Indices to search\nlet sites_idx = [];\nlet trackers_idx = [];\nlet blog_idx = [];\n\n\nfetch(`${pathToRoot}/sitemap.json`)"
},
{
"path": "static/scss/_colors.scss",
"chars": 455,
"preview": "$grey-txt-color: #71869E;\n$red-txt-color: #A069AB;\n$light-blue-bg: #F4F7F8;\n$header-bg-color: #253135;\n$link-color: #007"
},
{
"path": "static/scss/blog/card.scss",
"chars": 1535,
"preview": "@import '../colors';\n\n$font-serif: Palatino, \"Palatino Linotype\", \"Palatino LT STD\", \"Book Antiqua\", Georgia, serif;\n\n."
},
{
"path": "static/scss/blog/github.scss",
"chars": 1148,
"preview": "/*\n\ngithub.com style (c) Vasily Polovnyov <vast@whiteants.net>\n\n*/\n\n.hljs {\n display: block;\n overflow-x: auto;\n padd"
},
{
"path": "static/scss/blog/post.scss",
"chars": 2638,
"preview": "@import '../colors';\n\n$font-serif: Palatino, \"Palatino Linotype\", \"Palatino LT STD\", \"Book Antiqua\", Georgia, serif;\n\n\n"
},
{
"path": "static/scss/bootstrap.min.scss",
"chars": 121200,
"preview": "/*!\n * Bootstrap v3.3.7 (http://getbootstrap.com)\n * Copyright 2011-2016 Twitter, Inc.\n * Licensed under MIT (https://gi"
},
{
"path": "static/scss/companies/reach-chart.scss",
"chars": 834,
"preview": "@import '../colors';\n\n.home-plot {\n color: $grey-txt-color;\n\n h4 {\n font-size: 11px;\n color: $grey-t"
},
{
"path": "static/scss/custom.scss",
"chars": 15362,
"preview": "@import 'colors';\n\nhtml {\n height: 100%;\n box-sizing: border-box;\n}\n\n@font-face {\n font-family: \"Open Sans\";\n src: u"
},
{
"path": "static/scss/datatables.colReorder.min.scss",
"chars": 178,
"preview": "table.DTCR_clonedTable.dataTable{position:absolute !important;background-color:rgba(255,255,255,0.7);z-index:202}div.DTC"
},
{
"path": "static/scss/datatables.min.scss",
"chars": 13900,
"preview": "table.dataTable{width:100%;margin:0 auto;clear:both;border-collapse:separate;border-spacing:0}table.dataTable thead th,t"
},
{
"path": "static/scss/explorer/table.scss",
"chars": 670,
"preview": "@import '../colors';\n\n#wtm{\n margin-top: 30px;\n margin-bottom: 30px;\n}\n.nav-tabs{\n margin-bottom: 20px;\n}\n#expo"
},
{
"path": "static/scss/home/index.scss",
"chars": 4917,
"preview": "@import '../colors';\n\n$font-serif: Palatino, \"Palatino Linotype\", \"Palatino LT STD\", \"Book Antiqua\", Georgia, serif;\n\n/"
},
{
"path": "static/scss/trackers/list.scss",
"chars": 2072,
"preview": "@import '../colors';\n\n.trackers-list {\n\n &.dark-page {\n h1 {\n color: #fff;\n font-family: 'FiraMono-Bold', "
},
{
"path": "static/scss/trackers/profile.scss",
"chars": 3496,
"preview": "@import '../colors';\n\n#tracker-page {\n #overview-header {\n background-color: $dark-header;\n\n .tracker-rank {\n "
},
{
"path": "static/scss/websites/overview.scss",
"chars": 536,
"preview": "@import '../colors';\n\n.websites-page {\n\n .info-box {\n margin-top: 40px;\n margin-bottom: 10px;\n }\n\n hr {\n bor"
},
{
"path": "static/scss/websites/profile.scss",
"chars": 842,
"preview": "@import '../colors';\n/* WEBSITES */\n#website-page .overview-header-box {\n\n #doughnut {\n height: 200px;\n }\n\n .dough"
},
{
"path": "templates/base.html",
"chars": 1791,
"preview": "<!doctype html>\n<!--[if lt IE 7]>\n<html class=\"no-js lt-ie9 lt-ie8 lt-ie7\" lang=\"\"> <![endif]-->\n<!--[if IE 7]>\n<html cl"
},
{
"path": "templates/blog-page.html",
"chars": 2392,
"preview": "{% extends \"base.html\" %}\n{% block title %}\n <title>Blog | {{blog_post.title}}</title>\n <meta name=\"description\" c"
},
{
"path": "templates/blog.html",
"chars": 1507,
"preview": "{% extends \"base.html\" %}\n{% block title %}\n <title>WhoTracks.me | Blog</title>\n <meta name=\"description\" content="
},
{
"path": "templates/company-page.html",
"chars": 1291,
"preview": "{% extends \"base.html\" %}\n\n<!-- Setting some vars used by breadcrumbs and navbar -->\n{% set active_page='companies' %}\n{"
},
{
"path": "templates/components/blog-card.html",
"chars": 473,
"preview": "<div class=\"card\">\n <div class=\"thumbnail\">\n <div class=\"card-media\">\n <img class=\"img-fit-cover\" s"
},
{
"path": "templates/components/breadcrumb.html",
"chars": 530,
"preview": "{% if active_page %}\n {% if not active_page==\"home\" %}\n\n <div class=\"breadcrumb-bar\">\n <div class=\"containe"
},
{
"path": "templates/components/category-item.html",
"chars": 158,
"preview": "<ul class=\"doughnut-legend\">\n{% for c in tracker_categories %}\n <li>\n {% include 'components/trackers/category"
},
{
"path": "templates/components/company-card.html",
"chars": 1064,
"preview": "<div class=\"panel panel-body\">\n {% if demographics.logo is not none %}\n <div class=\"left\">\n <img cl"
},
{
"path": "templates/components/cookies.html",
"chars": 385,
"preview": "<img src=\"{{ PATHS.static }}/img/cookies.svg\">\n<p class=\"tracking-method\">\n <span class=\"highlight\">COOKIES</span> are "
},
{
"path": "templates/components/fingerprinting.html",
"chars": 380,
"preview": "<img src=\"{{ PATHS.static }}/img/fingerprinting.svg\">\n<p class=\"tracking-method\">\n <span class=\"highlight\">FINGERPRINT"
},
{
"path": "templates/components/footer.html",
"chars": 1936,
"preview": "<!-- FOOTER -->\n<footer>\n <div class=\"container\">\n <div class=\"row\">\n <div class=\"col-md-3 col-sm-3 col-xs-12 m"
},
{
"path": "templates/components/home/header.html",
"chars": 1276,
"preview": "<div class=\"home-header\">\n <div class=\"container\">\n <div class=\"row\">\n <div class=\"col-md-5 col-sm-"
},
{
"path": "templates/components/navbar.html",
"chars": 4734,
"preview": "<nav class=\"navbar-ghostery\">\n <div class=\"navbar-ghostery-header\">\n <img src=\"{{PATHS.static}}/img/logo-ghost"
},
{
"path": "templates/components/tag_cloud.html",
"chars": 4946,
"preview": "<script src=\"{{PATHS.static}}/js/d3.min.js\"></script>\n<script src=\"{{PATHS.static}}/js/d3.layout.cloud.js\"></script>\n\n<s"
},
{
"path": "templates/components/top-5-info-box.html",
"chars": 609,
"preview": "<div class=\"info-box\">\n <div class=\"row\">\n <div class=\"col-md-8\">\n <span class=\"percentage\">\n {{"
},
{
"path": "templates/components/top-5-trackers.html",
"chars": 1252,
"preview": "<div class=\"row top-5-trackers\">\n <div class=\"col-md-6 col-sm-6\">\n <ul class=\"top-5-list\">\n {% for "
},
{
"path": "templates/components/tracker-list.html",
"chars": 1850,
"preview": "\n<ul class=\"nav nav-tabs\">\n <li class=\"active\"><a data-toggle=\"tab\" href=\"#popularity\">Popularity</a></li>\n <l"
},
{
"path": "templates/components/trackers/category.html",
"chars": 150,
"preview": "<span class=\"cat-item\">\n <i class=\"fa fa-circle\" style=\"color: {{ TRACKER_CATEGORIES[c] }}\" aria-hidden=\"true\"></i> \n "
},
{
"path": "templates/components/trackers/header.html",
"chars": 4322,
"preview": "<div class=\"col-md-6 col-sm-6\">\n <h1>{{ profile.name }}</h1>\n {% if app.company_id %}\n <p class=\"header\">Owned by {"
},
{
"path": "templates/components/tracking-methods.html",
"chars": 762,
"preview": "<h3 style=\"margin-left:15px\">TRACKING METHODS</h3>\n{% if site.overview.cookies > 0.4 and site.overview.bad_qs > 0.1 %}\n "
},
{
"path": "templates/components/unified-ui-tracker-list.html",
"chars": 2199,
"preview": "<ul class=\"nav nav-tabs\">\n <li class=\"active\"><a data-toggle=\"tab\" href=\"#popularity\">Popularity</a></li>\n <li"
},
{
"path": "templates/components/website-list.html",
"chars": 1861,
"preview": "<ul class=\"nav nav-tabs\">\n <li class=\"active\"><a data-toggle=\"tab\" href=\"#popularity\">Popularity</a></li>\n <li><a data"
},
{
"path": "templates/components/websites/header.html",
"chars": 3196,
"preview": "<div class=\"container\">\n <div class=\"row overview-info\">\n <div class=\"col-md-6\">\n {% if profile.web"
},
{
"path": "templates/components/websites/tracker-list.html",
"chars": 2119,
"preview": "<ul class=\"nav nav-tabs\">\n <li class=\"active\"><a data-toggle=\"tab\" href=\"#frequency\">Frequency</a></li>\n <li><"
},
{
"path": "templates/explorer.html",
"chars": 3258,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n<title>WhoTracks.me | Explorer</title>\n<meta name=\"description\" content=\"Tr"
},
{
"path": "templates/imprint.html",
"chars": 1784,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n<title>WhoTracks.me, operated by Ghostery - Privacy Policy</title>\n<meta na"
},
{
"path": "templates/index.html",
"chars": 9921,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me - Bringing Transparency to Online Tracking</title>\n"
},
{
"path": "templates/not-found.html",
"chars": 1149,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me | Page Not Found </title>\n <meta name=\"descripti"
},
{
"path": "templates/privacy-policy.html",
"chars": 26380,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n<title>WhoTracks.me - Bringing Transparency to Online Tracking</title>\n<met"
},
{
"path": "templates/reach-chart-page.html",
"chars": 1487,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me | Who Tracks the Most</title>\n <meta name=\"descr"
},
{
"path": "templates/tracker-not-found.html",
"chars": 1276,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me | Tracker Not Found </title>\n <meta name=\"descri"
},
{
"path": "templates/tracker-page.html",
"chars": 5249,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>Tracker | {{ app.tracker | get_app_name }}</title>\n <meta nam"
},
{
"path": "templates/trackers.html",
"chars": 1417,
"preview": "{% extends \"base.html\" %}\n{% block title %}\n <title>WhoTracks.me | Trackers</title>\n <meta name=\"description\" cont"
},
{
"path": "templates/website-not-found.html",
"chars": 1220,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me | Website Profile Not Found</title>\n <meta name="
},
{
"path": "templates/website-page.html",
"chars": 2178,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>Website | {{ profile.name }}</title>\n <meta name=\"descriptio"
},
{
"path": "templates/websites.html",
"chars": 2177,
"preview": "{% extends \"base.html\" %}\n\n{% block title %}\n <title>WhoTracks.me | Websites</title>\n <meta name=\"description\" con"
},
{
"path": "tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/test_data_integrity.py",
"chars": 1665,
"preview": "from operator import itemgetter\nimport unittest\n\nfrom whotracksme.data import load_tracker_db, DataSource\n\n\nclass TestDa"
},
{
"path": "tests/test_db_integrity.py",
"chars": 1337,
"preview": "import unittest\n\nfrom whotracksme.data import load_tracker_db\n\n\nclass TestDbIntegrity(unittest.TestCase):\n\n def setUp"
},
{
"path": "tests/test_db_validity.py",
"chars": 2428,
"preview": "import sqlite3\nimport unittest\n\nfrom whotracksme.data import load_tracker_db\n\n\nclass ValidateTrackerDatabase(unittest.Te"
},
{
"path": "tests/test_site_categories.py",
"chars": 1086,
"preview": "import unittest\nimport csv\n\nVALID_CATEGORIES = set([\n \"Adult\",\n \"Banking\",\n \"Business\",\n \"E-Commerce\",\n \""
},
{
"path": "tests/test_sites_data.py",
"chars": 366,
"preview": "import unittest\n\nfrom whotracksme.data import DataSource\n\nclass TestSitesData(unittest.TestCase):\n\n def test_all_site"
},
{
"path": "update_trackerdb.sh",
"chars": 1113,
"preview": "#!/bin/bash\n#\n# Downloads the latest trackerdb release (from https://github.com/ghostery/trackerdb),\n# and updates \"whot"
},
{
"path": "update_trackers_preview.py",
"chars": 4130,
"preview": "\"\"\"\nHelper script to update the trackers-preview.json file.\n\nThe raw input data (aka \"privacy score\") is computed each m"
},
{
"path": "whotracksme/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "whotracksme/data/Readme.md",
"chars": 21775,
"preview": "# Data\n\nThe data for the whotracks.me site is provided here as JSON files, with a SQL database containing tracker inform"
},
{
"path": "whotracksme/data/__init__.py",
"chars": 133,
"preview": "\nfrom whotracksme.data.loader import (\n load_tracker_db,\n DataSource,\n)\n\n__all__ = [\n \"load_tracker_db\",\n \"D"
},
{
"path": "whotracksme/data/assets/trackerdb.sql",
"chars": 1854308,
"preview": "-- Generated from https://github.com/ghostery/trackerdb/releases/download/202604281015/trackerdb.db\nPRAGMA foreign_keys="
},
{
"path": "whotracksme/data/assets/trackers-preview.json",
"chars": 395749,
"preview": "{\"trackers\":{\"google.com\":[4,1,0,0,0,4,0,0,1,0,1],\"fiverr.com\":[1,0,0,0,0,2,0,0,3,0,3],\"youtube.com\":[3,1,0,0,1,6,0,0,0,"
},
{
"path": "whotracksme/data/db.py",
"chars": 11047,
"preview": "import json\nimport requests\nimport sqlite3\nimport importlib.resources\nimport itertools\nimport io\nimport csv\nfrom hashlib"
},
{
"path": "whotracksme/data/loader.py",
"chars": 19460,
"preview": "from datetime import datetime\nfrom urllib.parse import quote_plus\nimport io\nimport importlib.resources\nfrom collections "
},
{
"path": "whotracksme/data/pack.py",
"chars": 6740,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nfrom struct import Struct, unpack_from, pack, calcsize\nimport collections"
},
{
"path": "whotracksme/main.py",
"chars": 2052,
"preview": "#! /usr/bin/env python\n# -*- coding: utf-8 -*-\n\n\n\"\"\"\nWhotracks.me website development tool.\n\nUsage:\n whotracksme webs"
},
{
"path": "whotracksme/qa/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "whotracksme/qa/todo.py",
"chars": 1719,
"preview": "import sqlite3\nfrom whotracksme.qa.utils import retrieve_status, write_to_file\n\n\ndef create_task_files(needqa_folder, **"
},
{
"path": "whotracksme/qa/utils.py",
"chars": 1611,
"preview": "import aiohttp\nimport asyncio\nimport async_timeout\nimport json\n\n\nasync def fetch(session, url):\n with async_timeout.t"
},
{
"path": "whotracksme/website/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "whotracksme/website/api/meta.py",
"chars": 1583,
"preview": "import os\nfrom datetime import datetime, timezone\nimport json\nimport re\n\ndef get_trackerdb_version():\n trackerdb_sql "
},
{
"path": "whotracksme/website/build/__init__.py",
"chars": 1,
"preview": "\n"
},
{
"path": "whotracksme/website/build/blog.py",
"chars": 4934,
"preview": "import os\nimport calendar\nfrom datetime import datetime\nfrom whotracksme.data.loader import DataSource\nfrom whotracksme."
},
{
"path": "whotracksme/website/build/companies.py",
"chars": 4111,
"preview": "from collections import defaultdict\nfrom markupsafe import Markup\n\nfrom whotracksme.website.utils import print_progress,"
},
{
"path": "whotracksme/website/build/data.py",
"chars": 2627,
"preview": "import json\nfrom pathlib import Path\nfrom whotracksme.data.loader import DataSource\nfrom whotracksme.data.db import load"
},
{
"path": "whotracksme/website/build/explorer.py",
"chars": 3325,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nimport json\nimport shutil\nimport csv\nfrom pathlib import Path\nfrom dateti"
},
{
"path": "whotracksme/website/build/home.py",
"chars": 2246,
"preview": "from markupsafe import Markup\nfrom whotracksme.website.plotting.companies import overview_bars\nfrom whotracksme.website."
},
{
"path": "whotracksme/website/build/trackers.py",
"chars": 5491,
"preview": "from collections import defaultdict\nfrom markupsafe import Markup\n\nfrom whotracksme.data.loader import DataSource\nfrom w"
},
{
"path": "whotracksme/website/build/websites.py",
"chars": 4197,
"preview": "#! /usr/bin/env python\n# -*- coding: utf-8 -*-\n\nfrom operator import itemgetter\nfrom markupsafe import Markup\n\nfrom whot"
},
{
"path": "whotracksme/website/builder.py",
"chars": 8952,
"preview": "#! /usr/bin/env python\n# -*- coding: utf-8 -*-\n\n\nimport concurrent.futures\nimport os\n\nfrom pathlib import Path\nfrom whot"
},
{
"path": "whotracksme/website/plotting/.vscode/settings.json",
"chars": 127,
"preview": "version https://git-lfs.github.com/spec/v1\noid sha256:048f53e6ca01ac583b48784cd2f6f7d248e0534849955b144e75f017f73188a3\ns"
},
{
"path": "whotracksme/website/plotting/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "whotracksme/website/plotting/colors.py",
"chars": 1820,
"preview": "from random import randint\nimport colour\n\nwtm_colors = {\n \"purple\": \"#A069AB\",\n \"blue\": \"#00AEF0\",\n \"black\": \"#"
},
{
"path": "whotracksme/website/plotting/companies.py",
"chars": 2347,
"preview": "from datetime import datetime\nimport plotly.graph_objs as go\n\nfrom whotracksme.website.plotting.utils import set_margins"
},
{
"path": "whotracksme/website/plotting/plots.py",
"chars": 6006,
"preview": "import plotly.graph_objs as go\nimport squarify\n\nfrom whotracksme.website.plotting.utils import (\n WTMFonts,\n div_o"
},
{
"path": "whotracksme/website/plotting/sankey.py",
"chars": 1126,
"preview": "from whotracksme.website.plotting.utils import div_output, set_margins\n\n\ndef sankey_plot(sndata):\n data_trace = dict("
},
{
"path": "whotracksme/website/plotting/trackers.py",
"chars": 5945,
"preview": "from statistics import mean\nimport plotly.graph_objs as go\nimport squarify\n\nfrom whotracksme.website.plotting.utils impo"
},
{
"path": "whotracksme/website/plotting/utils.py",
"chars": 2458,
"preview": "from collections import namedtuple\nfrom plotly.io import to_html\nfrom plotly.graph_objs.layout import Margin\n\nfrom whotr"
},
{
"path": "whotracksme/website/serve.py",
"chars": 2054,
"preview": "#! /usr/bin/env python\n# -*- coding: utf-8 -*-\n\n\nfrom multiprocessing import Process\nimport os\nimport time\n\nfrom sanic i"
},
{
"path": "whotracksme/website/templates.py",
"chars": 8640,
"preview": "from collections import defaultdict\nimport json\nimport os\nimport shutil\nimport subprocess\nfrom datetime import date\nfrom"
},
{
"path": "whotracksme/website/utils.py",
"chars": 634,
"preview": "import datetime\nimport simplejson\nimport pathlib\nimport os\n\n\ndef write_json(path, **data):\n def myconverter(o):\n "
}
]
// ... and 2 more files (download for full content)
About this extraction
This page contains the full source code of the cliqz-oss/whotracks.me GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 179 files (17.9 MB), approximately 4.7M tokens, and a symbol index with 302 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.