diff --git a/Caddyfile b/Caddyfile new file mode 100644 index 0000000..679e5c2 --- /dev/null +++ b/Caddyfile @@ -0,0 +1,57 @@ +@aibots { + header User-Agent "*AI2Bot*" + header User-Agent "*Ai2Bot-Dolma*" + header User-Agent "*aiHitBot*" + header User-Agent "*Amazonbot*" + header User-Agent "*anthropic-ai*" + header User-Agent "*Applebot*" + header User-Agent "*Applebot-Extended*" + header User-Agent "*Brightbot 1.0*" + header User-Agent "*Bytespider*" + header User-Agent "*CCBot*" + header User-Agent "*ChatGPT-User*" + header User-Agent "*Claude-Web*" + header User-Agent "*ClaudeBot*" + header User-Agent "*cohere-ai*" + header User-Agent "*cohere-training-data-crawler*" + header User-Agent "*Cotoyogi*" + header User-Agent "*Crawlspace*" + header User-Agent "*Diffbot*" + header User-Agent "*DuckAssistBot*" + header User-Agent "*FacebookBot*" + header User-Agent "*Factset_spyderbot*" + header User-Agent "*FirecrawlAgent*" + header User-Agent "*FriendlyCrawler*" + header User-Agent "*Google-Extended*" + header User-Agent "*GoogleOther*" + header User-Agent "*GoogleOther-Image*" + header User-Agent "*GoogleOther-Video*" + header User-Agent "*GPTBot*" + header User-Agent "*iaskspider/2.0*" + header User-Agent "*ICC-Crawler*" + header User-Agent "*ImagesiftBot*" + header User-Agent "*img2dataset*" + header User-Agent "*imgproxy*" + header User-Agent "*ISSCyberRiskCrawler*" + header User-Agent "*Kangaroo Bot*" + header User-Agent "*Meta-ExternalAgent*" + header User-Agent "*Meta-ExternalFetcher*" + header User-Agent "*NovaAct*" + header User-Agent "*OAI-SearchBot*" + header User-Agent "*omgili*" + header User-Agent "*omgilibot*" + header User-Agent "*Operator*" + header User-Agent "*PanguBot*" + header User-Agent "*Perplexity-User*" + header User-Agent "*PerplexityBot*" + header User-Agent "*PetalBot*" + header User-Agent "*Scrapy*" + header User-Agent "*SemrushBot-OCOB*" + header User-Agent "*SemrushBot-SWA*" + header User-Agent "*Sidetrade indexer bot*" + header User-Agent "*TikTokSpider*" + header User-Agent "*Timpibot*" + header User-Agent "*VelenPublicWebCrawler*" + header User-Agent "*Webzio-Extended*" + header User-Agent "*YouBot*" +} \ No newline at end of file diff --git a/code/robots.py b/code/robots.py index 8a06b55..710568d 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,6 +178,12 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config +def json_to_caddy(robot_json): + caddyfile = "@aibots {\n " + caddyfile += "\n ".join(f'header User-Agent "*{k}*"' for k in robot_json.keys()) + caddyfile += "\n}" + return caddyfile + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -208,6 +214,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./Caddyfile", + converter=json_to_caddy + ) if __name__ == "__main__": diff --git a/robots.json b/robots.json index 698b31e..df9dcda 100644 --- a/robots.json +++ b/robots.json @@ -384,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file