2025-05-18 08:13:12 +00:00
5 changed files with 1 additions and 133 deletions
--- a/README.md
+++ b/README.md
@ -14,7 +14,6 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
 - `haproxy-block-ai-bots.txt`
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
@ -23,14 +22,6 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 `haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
 1. Add the file to the config directory of HAProxy
 2. Add the following lines to the `frontend` section:
   ```
   acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
   http-request deny if ai_robot
   ```
   (Note that the path to `haproxy-block-ai-bots.txt` may be different in your environment.)
 ## Contributing
--- a/code/robots.py
+++ b/code/robots.py
@ -178,11 +178,6 @@ def json_to_nginx(robot_json):
    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
    return config
 def json_to_haproxy(robots_json):
    """Return the HAProxy source file text: every key of robots_json on its own line (usage is described in the README)."""
    names = [f"{name}" for name in robots_json]
    return "\n".join(names)
 def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
@ -213,10 +208,6 @@ def conversions():
        file_name="./nginx-block-ai-bots.conf",
        converter=json_to_nginx,
    )
    update_file_if_changed(
        file_name="./haproxy-block-ai-bots.txt",
        converter=json_to_haproxy,
    )
 if __name__ == "__main__":
--- a/code/test_files/haproxy-block-ai-bots.txt
+++ b/code/test_files/haproxy-block-ai-bots.txt
@ -1,47 +0,0 @@
 AI2Bot
 Ai2Bot-Dolma
 Amazonbot
 anthropic-ai
 Applebot
 Applebot-Extended
 Bytespider
 CCBot
 ChatGPT-User
 Claude-Web
 ClaudeBot
 cohere-ai
 Diffbot
 FacebookBot
 facebookexternalhit
 FriendlyCrawler
 Google-Extended
 GoogleOther
 GoogleOther-Image
 GoogleOther-Video
 GPTBot
 iaskspider/2.0
 ICC-Crawler
 ImagesiftBot
 img2dataset
 ISSCyberRiskCrawler
 Kangaroo Bot
 Meta-ExternalAgent
 Meta-ExternalFetcher
 OAI-SearchBot
 omgili
 omgilibot
 Perplexity-User
 PerplexityBot
 PetalBot
 Scrapy
 Sidetrade indexer bot
 Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
 YouBot
 crawler.with.dots
 star***crawler
 Is this a crawler?
 a[mazing]{42}(robot)
 2^32$
 curl|sudo bash
--- a/code/tests.py
+++ b/code/tests.py
@ -4,7 +4,7 @@
 import json
 import unittest
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
 class RobotsUnittestExtensions:
    def loadJson(self, pathname):
@ -60,16 +60,6 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
        robots_nginx = json_to_nginx(self.robots_dict)
        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
    """Compare the output of json_to_haproxy with the stored expected file."""

    # unittest's cap on the length of diffs shown in failure messages.
    maxDiff = 8192

    def setUp(self):
        """Load the robots JSON fixture that serves as the converter input."""
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_haproxy_generation(self):
        """The generated text must match test_files/haproxy-block-ai-bots.txt."""
        generated = json_to_haproxy(self.robots_dict)
        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", generated)
 class TestRobotsNameCleaning(unittest.TestCase):
    def test_clean_name(self):
        from robots import clean_robot_name
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@ -1,57 +0,0 @@
 AI2Bot
 Ai2Bot-Dolma
 aiHitBot
 Amazonbot
 anthropic-ai
 Applebot
 Applebot-Extended
 Brightbot 1.0
 Bytespider
 CCBot
 ChatGPT-User
 Claude-Web
 ClaudeBot
 cohere-ai
 cohere-training-data-crawler
 Cotoyogi
 Crawlspace
 Diffbot
 DuckAssistBot
 FacebookBot
 Factset_spyderbot
 FirecrawlAgent
 FriendlyCrawler
 Google-Extended
 GoogleOther
 GoogleOther-Image
 GoogleOther-Video
 GPTBot
 iaskspider/2.0
 ICC-Crawler
 ImagesiftBot
 img2dataset
 imgproxy
 ISSCyberRiskCrawler
 Kangaroo Bot
 meta-externalagent
 Meta-ExternalAgent
 meta-externalfetcher
 Meta-ExternalFetcher
 NovaAct
 OAI-SearchBot
 omgili
 omgilibot
 Operator
 PanguBot
 Perplexity-User
 PerplexityBot
 PetalBot
 Scrapy
 SemrushBot-OCOB
 SemrushBot-SWA
 Sidetrade indexer bot
 TikTokSpider
 Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
 YouBot