diff --git a/README.md b/README.md index ff124e3..b984672 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,6 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` -- `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -23,14 +22,6 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. -`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; -1. Add the file to the config directory of HAProxy -2. Add the following lines in the `frontend` section; - ``` - acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt - http-request deny if ai_robot - ``` - (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) ## Contributing diff --git a/code/robots.py b/code/robots.py index da157c1..8a06b55 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,11 +178,6 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config -def json_to_haproxy(robots_json): - # Creates a source file for HAProxy. Follow instructions in the README to implement it. - txt = "\n".join(f"{k}" for k in robots_json.keys()) - return txt - def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -213,10 +208,6 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) - update_file_if_changed( - file_name="./haproxy-block-ai-bots.txt", - converter=json_to_haproxy, - ) if __name__ == "__main__": diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt deleted file mode 100644 index 5ed6939..0000000 --- a/code/test_files/haproxy-block-ai-bots.txt +++ /dev/null @@ -1,47 +0,0 @@ -AI2Bot -Ai2Bot-Dolma -Amazonbot -anthropic-ai -Applebot -Applebot-Extended -Bytespider -CCBot -ChatGPT-User -Claude-Web -ClaudeBot -cohere-ai -Diffbot -FacebookBot -facebookexternalhit -FriendlyCrawler -Google-Extended -GoogleOther -GoogleOther-Image -GoogleOther-Video -GPTBot -iaskspider/2.0 -ICC-Crawler -ImagesiftBot -img2dataset -ISSCyberRiskCrawler -Kangaroo Bot -Meta-ExternalAgent -Meta-ExternalFetcher -OAI-SearchBot -omgili -omgilibot -Perplexity-User -PerplexityBot -PetalBot -Scrapy -Sidetrade indexer bot -Timpibot -VelenPublicWebCrawler -Webzio-Extended -YouBot -crawler.with.dots -star***crawler -Is this a crawler? -a[mazing]{42}(robot) -2^32$ -curl|sudo bash \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index e179c44..f58b445 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -60,16 +60,6 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) -class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): - maxDiff = 8192 - - def setUp(self): - self.robots_dict = self.loadJson("test_files/robots.json") - - def test_haproxy_generation(self): - robots_haproxy = json_to_haproxy(self.robots_dict) - self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy) - class TestRobotsNameCleaning(unittest.TestCase): def test_clean_name(self): from robots import clean_robot_name diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt deleted file mode 100644 index 3c326bd..0000000 --- a/haproxy-block-ai-bots.txt +++ /dev/null @@ -1,57 +0,0 @@ -AI2Bot -Ai2Bot-Dolma -aiHitBot -Amazonbot -anthropic-ai -Applebot -Applebot-Extended -Brightbot 1.0 -Bytespider -CCBot -ChatGPT-User -Claude-Web -ClaudeBot -cohere-ai -cohere-training-data-crawler -Cotoyogi -Crawlspace -Diffbot -DuckAssistBot -FacebookBot -Factset_spyderbot -FirecrawlAgent -FriendlyCrawler -Google-Extended -GoogleOther -GoogleOther-Image -GoogleOther-Video -GPTBot -iaskspider/2.0 -ICC-Crawler -ImagesiftBot -img2dataset -imgproxy -ISSCyberRiskCrawler -Kangaroo Bot -meta-externalagent -Meta-ExternalAgent -meta-externalfetcher -Meta-ExternalFetcher -NovaAct -OAI-SearchBot -omgili -omgilibot -Operator -PanguBot -Perplexity-User -PerplexityBot -PetalBot -Scrapy -SemrushBot-OCOB -SemrushBot-SWA -Sidetrade indexer bot -TikTokSpider -Timpibot -VelenPublicWebCrawler -Webzio-Extended -YouBot \ No newline at end of file