From 933aa6159da9dbe7025f6294e98a6d3e326b43a3 Mon Sep 17 00:00:00 2001
From: Massimo Gismondi
Date: Tue, 7 Jan 2025 11:02:29 +0100
Subject: [PATCH 01/12] Implementing htaccess generation

---
 .htaccess                 |  3 +++
 code/robots.py            | 22 +++++++++++++++++++++-
 code/test_files/.htaccess |  3 +++
 code/tests.py             |  8 +++++++-
 4 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 .htaccess
 create mode 100644 code/test_files/.htaccess

diff --git a/.htaccess b/.htaccess
new file mode 100644
index 0000000..31ba5f7
--- /dev/null
+++ b/.htaccess
@@ -0,0 +1,3 @@
+RewriteEngine On
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteRule .* - [F,L]
\ No newline at end of file
diff --git a/code/robots.py b/code/robots.py
index cf44e8e..d35d74b 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -132,10 +132,26 @@ def json_to_table(robots_json):
     return table
 
 
+def json_to_htaccess(robot_json):
+    htaccess = "RewriteEngine On\n"
+    htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
+
+    robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
+    htaccess += "|".join(robots)
+    htaccess += ").*$ [NC]\n"
+    htaccess += "RewriteRule .* - [F,L]"
+    return htaccess
+
+
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
-    old_content = Path(file_name).read_text(encoding="utf-8")
+    filepath = Path(file_name)
+    if not filepath.exists():
+        filepath.write_text(new_content, encoding="utf-8")
+        print(f"{file_name} has been created.")
+        return
+    old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
@@ -150,6 +166,10 @@ def conversions():
         file_name="./table-of-bot-metrics.md",
         converter=json_to_table,
     )
+    update_file_if_changed(
+        file_name="./.htaccess",
+        converter=json_to_htaccess,
+    )
 
 
 if __name__ == "__main__":
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
new file mode 100644
index 0000000..a34bf55
--- /dev/null
+++ b/code/test_files/.htaccess
@@ -0,0 +1,3 @@
+RewriteEngine On
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteRule .* - [F,L]
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index 9cf35fe..6f778c3 100644
--- a/code/tests.py
+++ b/code/tests.py
@@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest`
 import json
 from pathlib import Path
 
-from robots import json_to_txt, json_to_table
+from robots import json_to_txt, json_to_table, json_to_htaccess
 
 
 def test_robots_txt_creation():
@@ -19,3 +19,9 @@ def test_table_of_bot_metrices_md():
     robots_json = json.loads(Path("test_files/robots.json").read_text())
     robots_table = json_to_table(robots_json)
     assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table
+
+
+def test_htaccess_creation():
+    robots_json = json.loads(Path("test_files/robots.json").read_text())
+    robots_htaccess = json_to_htaccess(robots_json)
+    assert Path("test_files/.htaccess").read_text() == robots_htaccess

From 189e75bbfd06715a5d30972d3aa4c23974aecee0 Mon Sep 17 00:00:00 2001
From: Massimo Gismondi
Date: Fri, 17 Jan 2025 21:25:23 +0100
Subject: [PATCH 02/12] Adding usage instructions

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index b3c2e7c..45c8f3a 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,19 @@ A number of these crawlers have been sourced from [Dark Visitors](https://darkvi
 
 If you'd like to add information about a crawler to the list, please make a pull request with the bot name added to `robots.txt`, `ai.txt`, and any relevant details in `table-of-bot-metrics.md` to help people understand what's crawling.
 
+## Usage
+
+Many visitors will find these files from this repository most useful:
+- `robots.txt`
+- `.htaccess`
+
+The first one tells search engine and AI crawlers which parts of your website should be scanned or avoided. The webpages of your server are returned anyway, but the crawler "pledges" not to use them. By default, the provided `robots.txt` tells every AI crawler not to scan any page in your website. This is not bulletproof, as an evil crawler could simply ignore the `robots.txt` content.
+
+The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent.
+
+We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide an identifiable User Agent by which we can filter them out.
+
+
 ## Contributing
 
 A note about contributing: updates should be added/made to `robots.json`. A GitHub action, courtesy of [Adam](https://github.com/newbold), will then generate the updated `robots.txt` and `table-of-bot-metrics.md`.
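
As a quick illustration of what the `json_to_htaccess` converter introduced in PATCH 01 produces, here is a minimal sketch, run from the `code` directory. The two-entry input dict is invented for the example; real entries in `robots.json` carry operator/function metadata, but the converter only looks at the keys, and the space in "Kangaroo Bot" shows the escaping step.

# Minimal sketch (not part of the patches): feed json_to_htaccess a tiny,
# made-up subset of robots.json to see the escaping and alternation it builds.
from robots import json_to_htaccess  # run from the `code` directory

sample = {
    "GPTBot": {},        # values are ignored by json_to_htaccess; only keys are used
    "Kangaroo Bot": {},  # the space becomes "\ " in the regular expression
}

print(json_to_htaccess(sample))
# Expected output:
# RewriteEngine On
# RewriteCond %{HTTP_USER_AGENT} ^.*(GPTBot|Kangaroo\ Bot).*$ [NC]
# RewriteRule .* - [F,L]
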
From b455af66e7903e76162d43f3e8f0900084fb9539 Mon Sep 17 00:00:00 2001
From: Massimo Gismondi
Date: Fri, 17 Jan 2025 21:42:08 +0100
Subject: [PATCH 03/12] Adding clarification about performance and code comment

---
 README.md      | 3 ++-
 code/robots.py | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 45c8f3a..dd84a16 100644
--- a/README.md
+++ b/README.md
@@ -18,8 +18,9 @@ The first one tells search engine and AI crawlers which parts of your website sh
 
 The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent.
 
-We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide an identifiable User Agent by which we can filter them out.
+Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration.
 
+We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide an identifiable User Agent by which we can filter them out.
 
 ## Contributing
 
diff --git a/code/robots.py b/code/robots.py
index d35d74b..f2ddbb8 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -133,7 +133,9 @@ def json_to_table(robots_json):
 
 
 def json_to_htaccess(robot_json):
-    htaccess = "RewriteEngine On\n"
+    # Creates a .htaccess filter file. It uses a regular expression to filter out
+    #User agents that contain any of the blocked values.
+    htaccess += "RewriteEngine On\n"
     htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
 
     robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())

From 8aee2f24bb03a8d91a2fb17c3a98628411239d40 Mon Sep 17 00:00:00 2001
From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com>
Date: Sat, 18 Jan 2025 12:39:07 +0100
Subject: [PATCH 04/12] Fixed space in comment

Co-authored-by: Glyn Normington
---
 code/robots.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/robots.py b/code/robots.py
index f2ddbb8..0172330 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -134,7 +134,7 @@ def json_to_table(robots_json):
 
 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
-    #User agents that contain any of the blocked values.
+    # User agents that contain any of the blocked values.
htaccess += "RewriteEngine On\n" htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" From 1cc4b59dfc4acd5666478efea658b1adf1af8aee Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Sat, 18 Jan 2025 12:40:03 +0100 Subject: [PATCH 05/12] Shortened htaccess instructions Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd84a16..badd23b 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Many visitors will find these files from this repository most useful: - `robots.txt` - `.htaccess` -The first one tells search engine and AI crawlers which parts of your website should be scanned or avoided. The webpages of your server are returned anyway, but the crawler "pledges" not to use them. By default, the provided `robots.txt` tells every AI crawler not to scan any page in your website. This is not bulletproof, as an evil crawler could simply ignore the `robots.txt` content. +`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. From d65128d10acfd14b714488170b3a261912cc3729 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Sat, 18 Jan 2025 12:41:09 +0100 Subject: [PATCH 06/12] Removed paragraph in favour of future FAQ.md Co-authored-by: Glyn Normington --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index badd23b..505a8dd 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,6 @@ The second one tells your own webserver to return an error page when one of the Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. -We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. ## Contributing From 70fd6c0fb13cdf4f0525bf061556e8e50ca7b8d9 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:25:07 +0100 Subject: [PATCH 07/12] Add mention of htaccess in readme Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 505a8dd..cd8d467 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ## Contributing -A note about contributing: updates should be added/made to `robots.json`. 
+A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`.
 
 ## Subscribe to updates
 

From 013b7abfa1f2126e9320ddbab90ff87af54b092c Mon Sep 17 00:00:00 2001
From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com>
Date: Mon, 20 Jan 2025 06:27:02 +0100
Subject: [PATCH 08/12] Update README.md

Co-authored-by: Glyn Normington
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cd8d467..1417a85 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@ Many visitors will find these files from this repository most useful:
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
-The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent.
+### `.htaccess`
+
+`.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
 
 Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration.
 

From 52241bdca6c9930f7b225264cd862b5f98a2d68f Mon Sep 17 00:00:00 2001
From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com>
Date: Mon, 20 Jan 2025 06:27:56 +0100
Subject: [PATCH 09/12] Update README.md

Co-authored-by: Glyn Normington
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1417a85..bb6558c 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Many visitors will find these files from this repository most useful:
 
 `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
 
-Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration.
+Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist.
 
 
 ## Contributing

From 33c38ee70b3a45343ddb360ae79e743e42bc8f76 Mon Sep 17 00:00:00 2001
From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com>
Date: Mon, 20 Jan 2025 06:28:32 +0100
Subject: [PATCH 10/12] Update README.md

Co-authored-by: Glyn Normington
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bb6558c..648f5ed 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,12 @@ If you'd like to add information about a crawler to the list, please make a pull
 
 ## Usage
 
-Many visitors will find these files from this repository most useful:
+This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 
+### `robots.txt`
+
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
 ### `.htaccess`

From a9956f7825080467adbbda6e41d7dfbaee47210b Mon Sep 17 00:00:00 2001
From: Massimo Gismondi
Date: Mon, 20 Jan 2025 06:50:48 +0100
Subject: [PATCH 11/12] Removed additional sections

---
 README.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/README.md b/README.md
index 648f5ed..065b0b7 100644
--- a/README.md
+++ b/README.md
@@ -14,14 +14,9 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 
-### `robots.txt`
-
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
-### `.htaccess`
-
 `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
-
 Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist.
 
 

From 4f03818280e7979697250ac5d59da12290db2e9f Mon Sep 17 00:00:00 2001
From: Massimo Gismondi
Date: Mon, 20 Jan 2025 06:51:06 +0100
Subject: [PATCH 12/12] Removed if condition and added a little comments

---
 code/robots.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/code/robots.py b/code/robots.py
index 0172330..087b00b 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -135,9 +135,10 @@ def json_to_table(robots_json):
 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
     # User agents that contain any of the blocked values.
-    htaccess += "RewriteEngine On\n"
+    htaccess = "RewriteEngine On\n"
     htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
 
+    # Escape spaces in each User Agent to build the regular expression
     robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
     htaccess += "|".join(robots)
     htaccess += ").*$ [NC]\n"
     htaccess += "RewriteRule .* - [F,L]"
@@ -149,10 +150,8 @@ def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    if not filepath.exists():
-        filepath.write_text(new_content, encoding="utf-8")
-        print(f"{file_name} has been created.")
-        return
+    # "touch" will create the file if it doesn't exist yet
+    filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
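
Once the series is applied and the generated `.htaccess` is deployed, a quick manual check of the end result is to request a page with a blocked User Agent and with an ordinary browser-style one, then compare the status codes. The sketch below is illustrative only: `https://example.com/` is a placeholder for a host that actually serves the generated `.htaccess` through Apache httpd, and the blocked name used here (GPTBot) is one of the entries in `robots.json`.

# Manual check (not part of the patch series): the [F] flag in the generated
# .htaccess should make Apache answer 403 Forbidden to any listed User Agent.
import urllib.error
import urllib.request

def fetch_status(url, user_agent):
    # Send one GET request with the given User-Agent and return the HTTP status code.
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    try:
        with urllib.request.urlopen(request) as response:
            return response.status
    except urllib.error.HTTPError as error:
        return error.code

site = "https://example.com/"  # placeholder: a host serving the generated .htaccess
print(fetch_status(site, "GPTBot"))       # expected: 403 (listed in robots.json)
print(fetch_status(site, "Mozilla/5.0"))  # expected: 200 (not a listed crawler)
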