From da85207314724c02d151a7bdfcdca3ef3fd056a1 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:27:09 +0100 Subject: [PATCH 1/4] Implement new function "json_to_nginx" which outputs an Nginx configuration snippet --- code/robots.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/code/robots.py b/code/robots.py index 6bf7920..f58f2b8 100755 --- a/code/robots.py +++ b/code/robots.py @@ -152,6 +152,12 @@ def json_to_htaccess(robot_json): htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess +def json_to_nginx(robot_json): + # Creates an Nginx config snippet for inclusion in server{} blocks: a request whose User-Agent matches (~*, case-insensitive) the pattern that list_to_pcre builds from the keys of robot_json gets a 403. + # NOTE(review): unlike the rule in json_to_htaccess, /robots.txt is not exempted here, so matched bots are refused it too - confirm this is intended. + config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" + return config + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -178,6 +184,10 @@ def conversions(): file_name="./.htaccess", converter=json_to_htaccess, ) + update_file_if_changed( + file_name="./nginx-block-ai-bots.conf", + converter=json_to_nginx, + ) if __name__ == "__main__": From 5a312c5f4d1fcd89c17f4d6cb360ad7230857402 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:11 +0100 Subject: [PATCH 2/4] Mention Nginx config feature in README --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 30a85da..b984672 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,19 @@ If you'd like to add information about a crawler to the list, please make a pull This repository provides the following files: - `robots.txt` - `.htaccess` +- `nginx-block-ai-bots.conf` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). 
`.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. +`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. + ## Contributing -A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. +A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess`, and `nginx-block-ai-bots.conf`. You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing: ```console From 4f3f4cd0dd0f421c2787b1336d37b8da06998882 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:50 +0100 Subject: [PATCH 3/4] Add assembled version of nginx-block-ai-bots.conf file --- nginx-block-ai-bots.conf | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 nginx-block-ai-bots.conf diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf new file mode 100644 index 0000000..ce30520 --- /dev/null +++ b/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { + return 403; +} \ No newline at end of file From 7c3b5a2cb21f5404cf4e2af1acf8689ba77d7b06 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 16:12:18 +0100 Subject: [PATCH 4/4] Add tests for Nginx config generator --- code/test_files/nginx-block-ai-bots.conf | 3 +++ code/tests.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 code/test_files/nginx-block-ai-bots.conf diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf new file mode 100644 index 0000000..d1b559e --- /dev/null +++ b/code/test_files/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { + return 403; +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index 94cbb47..61d69b4 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -50,6 +50,16 @@ class 
TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_htaccess = json_to_htaccess(self.robots_dict) self.assertEqualsFile("test_files/.htaccess", robots_htaccess) +class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_nginx_generation(self): + robots_nginx = json_to_nginx(self.robots_dict) + self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) + if __name__ == "__main__": import os