Merge branch 'main' into main

2025-05-19 08:43:11 +00:00 · 2025-04-30 22:13:51 +02:00 · 2025-04-30 22:13:51 +02:00 · 6baa7725b3
commit 6baa7725b3
parent df6d7f9723 91a88e2fa8
10 changed files with 160 additions and 7 deletions
--- a/.htaccess
+++ b/.htaccess
@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
--- a/README.md
+++ b/README.md
@ -15,6 +15,7 @@ This repository provides the following files:
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
 - `Caddyfile`
+- `haproxy-block-ai-bots.txt`

 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).

@ -25,6 +26,15 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/

 `Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile, the rejection can then be handeld as followed `abort @aibots`

+`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
+1. Add the file to HAProxy's configuration directory.
+2. Add the following lines to the `frontend` section:
+   ```
+   acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
+   http-request deny if ai_robot
+   ```
+   (Note that the path to `haproxy-block-ai-bots.txt` may be different in your environment.)
+
 ## Contributing

 A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`.
--- a/code/robots.py
+++ b/code/robots.py
@ -178,12 +178,19 @@ def json_to_nginx(robot_json):
    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
    return config

+
 def json_to_caddy(robot_json):
    caddyfile = "@aibots {\n    "
    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
    caddyfile += "\n}"
    return caddyfile

+def json_to_haproxy(robots_json):
+    """Return the robot names, one per line, as a pattern file for HAProxy (see the README)."""
+    # Iterating the mapping yields its keys (the user-agent names); no trailing newline is added.
+    return "\n".join(str(name) for name in robots_json)
+
+

 def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
@ -217,6 +224,10 @@ def conversions():
    update_file_if_changed(
        file_name="./Caddyfile",
        converter=json_to_caddy
+    )
+    update_file_if_changed(
+        file_name="./haproxy-block-ai-bots.txt",
+        converter=json_to_haproxy,
    )


--- a/code/test_files/haproxy-block-ai-bots.txt
+++ b/code/test_files/haproxy-block-ai-bots.txt
@ -0,0 +1,47 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
+crawler.with.dots
+star***crawler
+Is this a crawler?
+a[mazing]{42}(robot)
+2^32$
+curl|sudo bash
--- a/code/tests.py
+++ b/code/tests.py
@ -4,7 +4,7 @@
 import json
 import unittest

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_caddy
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy

 class RobotsUnittestExtensions:
    def loadJson(self, pathname):
@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
        robots_nginx = json_to_nginx(self.robots_dict)
        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)

+class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192  # raise unittest's cap on the length of reported diffs
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")  # robots fixture used by the test
+
+    def test_haproxy_generation(self):
+        # The generated HAProxy pattern list is compared against the stored fixture file.
+        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", json_to_haproxy(self.robots_dict))
+
 class TestRobotsNameCleaning(unittest.TestCase):
    def test_clean_name(self):
        from robots import clean_robot_name
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@ -0,0 +1,57 @@
+AI2Bot
+Ai2Bot-Dolma
+aiHitBot
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Brightbot 1.0
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+cohere-training-data-crawler
+Cotoyogi
+Crawlspace
+Diffbot
+DuckAssistBot
+FacebookBot
+Factset_spyderbot
+FirecrawlAgent
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+imgproxy
+ISSCyberRiskCrawler
+Kangaroo Bot
+meta-externalagent
+Meta-ExternalAgent
+meta-externalfetcher
+Meta-ExternalFetcher
+NovaAct
+OAI-SearchBot
+omgili
+omgilibot
+Operator
+PanguBot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+SemrushBot-OCOB
+SemrushBot-SWA
+Sidetrade indexer bot
+TikTokSpider
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
    return 403;
 }
--- a/robots.json
+++ b/robots.json
@ -244,13 +244,27 @@
        "frequency": "Unclear at this time.",
        "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot"
    },
-    "Meta-ExternalAgent": {
+    "meta-externalagent": {
        "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)",
-        "respect": "Yes.",
+        "respect": "Yes",
        "function": "Used to train models and improve products.",
        "frequency": "No information.",
        "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
    },
+    "Meta-ExternalAgent": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Data Scrapers",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent"
+    },
+    "meta-externalfetcher": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Assistants",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
+    },
    "Meta-ExternalFetcher": {
        "operator": "Unclear at this time.",
        "respect": "Unclear at this time.",
--- a/robots.txt
+++ b/robots.txt
@ -33,7 +33,9 @@ User-agent: img2dataset
 User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
+User-agent: meta-externalagent
 User-agent: Meta-ExternalAgent
+User-agent: meta-externalfetcher
 User-agent: Meta-ExternalFetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@ -35,7 +35,9 @@
 | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent |
+| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |