Added a Caddyfile converter (#110)

Co-authored-by: Julian Beittel <julian@beittel.net>
Co-authored-by: Glyn Normington <work@underlap.org>
Crazyroostereye 2025-05-01 12:21:32 +02:00 committed by GitHub
parent 91a88e2fa8
commit 1310dbae46
5 changed files with 33 additions and 1 deletion

Caddyfile (new file)

@@ -0,0 +1,3 @@
@aibots {
header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
}

README.md

@@ -14,6 +14,7 @@ This repository provides the following files:
- `robots.txt`
- `.htaccess`
- `nginx-block-ai-bots.conf`
- `Caddyfile`
- `haproxy-block-ai-bots.txt`

`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
@@ -23,6 +24,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
`Caddyfile` provides a `header_regexp` matcher group that you can copy or import into your own Caddyfile; matching requests can then be rejected with `abort @aibots`, as in the sketch below.
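For example, a minimal site block might look like this (a hedged sketch, not part of this commit; the domain and the import path are placeholders):

```
example.com {
	# Inline the generated @aibots matcher definition.
	import /etc/caddy/ai-bots.caddy
	# Drop matching requests without sending a response.
	abort @aibots
	respond "Hello, world!"
}
```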
`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
1. Add the file to the config directory of HAProxy.
2. Add the following lines in the `frontend` section:

robots.py

@@ -178,12 +178,20 @@ def json_to_nginx(robot_json):
    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
    return config
def json_to_caddy(robot_json):
    # Creates a Caddyfile snippet: a named matcher covering every listed bot.
    caddyfile = "@aibots {\n "
    caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
    caddyfile += "\n}"
    return caddyfile
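# Illustrative only (not part of the commit): assuming list_to_pcre() joins
# the escaped bot names with "|" inside parentheses, as the generated files
# suggest, a two-entry dict such as {"GPTBot": {}, "CCBot": {}} would yield
# roughly:
#   @aibots {
#     header_regexp User-Agent "(GPTBot|CCBot)"
#   }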
def json_to_haproxy(robots_json):
    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
    txt = "\n".join(f"{k}" for k in robots_json.keys())
    return txt
def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    new_content = converter(load_robots_json())
@@ -213,6 +221,10 @@ def conversions():
        file_name="./nginx-block-ai-bots.conf",
        converter=json_to_nginx,
    )
    update_file_if_changed(
        file_name="./Caddyfile",
        converter=json_to_caddy,
    )
    update_file_if_changed(
        file_name="./haproxy-block-ai-bots.txt",
        converter=json_to_haproxy,

test_files/Caddyfile (new file)

@@ -0,0 +1,3 @@
@aibots {
header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)"
}

tests.py

@@ -4,7 +4,7 @@
import json
import unittest

from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
class RobotsUnittestExtensions:
    def loadJson(self, pathname):
@@ -76,6 +76,17 @@ class TestRobotsNameCleaning(unittest.TestCase):
        self.assertEqual(clean_robot_name("PerplexityUser"), "Perplexity-User")
class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_caddyfile_generation(self):
        robots_caddyfile = json_to_caddy(self.robots_dict)
        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
if __name__ == "__main__":
    import os
    os.chdir(os.path.dirname(__file__))
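# Illustrative (not part of the diff): assuming this module is named tests.py
# and ends with the usual unittest.main() call, the new test can be run with:
#   python tests.py TestCaddyfileGeneration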