From 1e92364c13de8a79ad3c1dff11665fe8a48267c1 Mon Sep 17 00:00:00 2001 From: Julian Beittel Date: Wed, 23 Apr 2025 09:41:18 +0200 Subject: [PATCH] Added Test and Docu --- Caddyfile | 56 +-------------------------------------- README.md | 2 ++ code/robots.py | 2 +- code/test_files/Caddyfile | 3 +++ code/tests.py | 13 ++++++++- 5 files changed, 19 insertions(+), 57 deletions(-) create mode 100644 code/test_files/Caddyfile diff --git a/Caddyfile b/Caddyfile index 679e5c2..1857d75 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,57 +1,3 @@ @aibots { - header User-Agent "*AI2Bot*" - header User-Agent "*Ai2Bot-Dolma*" - header User-Agent "*aiHitBot*" - header User-Agent "*Amazonbot*" - header User-Agent "*anthropic-ai*" - header User-Agent "*Applebot*" - header User-Agent "*Applebot-Extended*" - header User-Agent "*Brightbot 1.0*" - header User-Agent "*Bytespider*" - header User-Agent "*CCBot*" - header User-Agent "*ChatGPT-User*" - header User-Agent "*Claude-Web*" - header User-Agent "*ClaudeBot*" - header User-Agent "*cohere-ai*" - header User-Agent "*cohere-training-data-crawler*" - header User-Agent "*Cotoyogi*" - header User-Agent "*Crawlspace*" - header User-Agent "*Diffbot*" - header User-Agent "*DuckAssistBot*" - header User-Agent "*FacebookBot*" - header User-Agent "*Factset_spyderbot*" - header User-Agent "*FirecrawlAgent*" - header User-Agent "*FriendlyCrawler*" - header User-Agent "*Google-Extended*" - header User-Agent "*GoogleOther*" - header User-Agent "*GoogleOther-Image*" - header User-Agent "*GoogleOther-Video*" - header User-Agent "*GPTBot*" - header User-Agent "*iaskspider/2.0*" - header User-Agent "*ICC-Crawler*" - header User-Agent "*ImagesiftBot*" - header User-Agent "*img2dataset*" - header User-Agent "*imgproxy*" - header User-Agent "*ISSCyberRiskCrawler*" - header User-Agent "*Kangaroo Bot*" - header User-Agent "*Meta-ExternalAgent*" - header User-Agent "*Meta-ExternalFetcher*" - header User-Agent "*NovaAct*" - header User-Agent "*OAI-SearchBot*" - 
header User-Agent "*omgili*" - header User-Agent "*omgilibot*" - header User-Agent "*Operator*" - header User-Agent "*PanguBot*" - header User-Agent "*Perplexity-User*" - header User-Agent "*PerplexityBot*" - header User-Agent "*PetalBot*" - header User-Agent "*Scrapy*" - header User-Agent "*SemrushBot-OCOB*" - header User-Agent "*SemrushBot-SWA*" - header User-Agent "*Sidetrade indexer bot*" - header User-Agent "*TikTokSpider*" - header User-Agent "*Timpibot*" - header User-Agent "*VelenPublicWebCrawler*" - header User-Agent "*Webzio-Extended*" - header User-Agent "*YouBot*" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/README.md b/README.md index b984672..b3c7717 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `Caddyfile` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -22,6 +23,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. 
+`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile, the rejection can then be handled as follows: `abort @aibots` ## Contributing diff --git a/code/robots.py b/code/robots.py index 710568d..1fe20cf 100755 --- a/code/robots.py +++ b/code/robots.py @@ -180,7 +180,7 @@ def json_to_nginx(robot_json): def json_to_caddy(robot_json): caddyfile = "@aibots {\n " - caddyfile += "\n ".join(f'header User-Agent "*{k}*"' for k in robot_json.keys()) + caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"' caddyfile += "\n}" return caddyfile diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile new file mode 100644 index 0000000..82f365a --- /dev/null +++ b/code/test_files/Caddyfile @@ -0,0 +1,3 @@ +@aibots { + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)" +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index f58b445..9659203 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_caddy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -66,6 +66,17 @@ class TestRobotsNameCleaning(unittest.TestCase):
self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") +class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_caddyfile_generation(self): + robots_caddyfile = json_to_caddy(self.robots_dict) + self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile) + + if __name__ == "__main__": import os os.chdir(os.path.dirname(__file__))