From 50e739dd738bb821018a863491b770dd8ee61155 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 08:42:52 +0200 Subject: [PATCH 1/3] HAProxy converter added. --- README.md | 9 +++++++ code/robots.py | 9 +++++++ haproxy-block-ai-bots.txt | 57 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 haproxy-block-ai-bots.txt diff --git a/README.md b/README.md index b984672..1f1eff6 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. +`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; +1. Add the file to the config directory of HAProxy +2. Add the following lines in the `frontend` section; + ``` + acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt + http-request deny if ai_robot + ``` + (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) ## Contributing diff --git a/code/robots.py b/code/robots.py index 8a06b55..da157c1 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,6 +178,11 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config +def json_to_haproxy(robots_json): + # Creates a source file for HAProxy. Follow instructions in the README to implement it. + txt = "\n".join(f"{k}" for k in robots_json.keys()) + return txt + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -208,6 +213,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./haproxy-block-ai-bots.txt", + converter=json_to_haproxy, + ) if __name__ == "__main__": diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..3c326bd --- /dev/null +++ b/haproxy-block-ai-bots.txt @@ -0,0 +1,57 @@ +AI2Bot +Ai2Bot-Dolma +aiHitBot +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Brightbot 1.0 +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +cohere-training-data-crawler +Cotoyogi +Crawlspace +Diffbot +DuckAssistBot +FacebookBot +Factset_spyderbot +FirecrawlAgent +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +imgproxy +ISSCyberRiskCrawler +Kangaroo Bot +meta-externalagent +Meta-ExternalAgent +meta-externalfetcher +Meta-ExternalFetcher +NovaAct +OAI-SearchBot +omgili +omgilibot +Operator +PanguBot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +SemrushBot-OCOB +SemrushBot-SWA +Sidetrade indexer bot +TikTokSpider +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot \ No newline at end of file From 66da70905f503239faeb0e49204776f508928048 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:09:40 +0200 Subject: [PATCH 2/3] Fixed incorrect English sentence. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f1eff6..ff124e3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt http-request deny if ai_robot ``` - (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) + (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) ## Contributing From a4a9f2ac2b9116d104789664231af4017d3828a7 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:30:26 +0200 Subject: [PATCH 3/3] Tests for HAProxy file added. --- code/test_files/haproxy-block-ai-bots.txt | 47 +++++++++++++++++++++++ code/tests.py | 12 +++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 code/test_files/haproxy-block-ai-bots.txt diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..5ed6939 --- /dev/null +++ b/code/test_files/haproxy-block-ai-bots.txt @@ -0,0 +1,47 @@ +AI2Bot +Ai2Bot-Dolma +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +Diffbot +FacebookBot +facebookexternalhit +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +ISSCyberRiskCrawler +Kangaroo Bot +Meta-ExternalAgent +Meta-ExternalFetcher +OAI-SearchBot +omgili +omgilibot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +Sidetrade indexer bot +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot +crawler.with.dots +star***crawler +Is this a crawler? +a[mazing]{42}(robot) +2^32$ +curl|sudo bash \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index f58b445..e179c44 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) +class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_haproxy_generation(self): + robots_haproxy = json_to_haproxy(self.robots_dict) + self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy) + class TestRobotsNameCleaning(unittest.TestCase): def test_clean_name(self): from robots import clean_robot_name