Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git, synced 2025-05-19 08:43:11 +00:00
Added Test and Docu
parent e519f9f94c
commit 1e92364c13
5 changed files with 19 additions and 57 deletions
Caddyfile (56 changes: 1 addition, 55 deletions)
@@ -1,57 +1,3 @@
 @aibots {
-    header User-Agent "*AI2Bot*"
-    header User-Agent "*Ai2Bot-Dolma*"
-    header User-Agent "*aiHitBot*"
-    header User-Agent "*Amazonbot*"
-    header User-Agent "*anthropic-ai*"
-    header User-Agent "*Applebot*"
-    header User-Agent "*Applebot-Extended*"
-    header User-Agent "*Brightbot 1.0*"
-    header User-Agent "*Bytespider*"
-    header User-Agent "*CCBot*"
-    header User-Agent "*ChatGPT-User*"
-    header User-Agent "*Claude-Web*"
-    header User-Agent "*ClaudeBot*"
-    header User-Agent "*cohere-ai*"
-    header User-Agent "*cohere-training-data-crawler*"
-    header User-Agent "*Cotoyogi*"
-    header User-Agent "*Crawlspace*"
-    header User-Agent "*Diffbot*"
-    header User-Agent "*DuckAssistBot*"
-    header User-Agent "*FacebookBot*"
-    header User-Agent "*Factset_spyderbot*"
-    header User-Agent "*FirecrawlAgent*"
-    header User-Agent "*FriendlyCrawler*"
-    header User-Agent "*Google-Extended*"
-    header User-Agent "*GoogleOther*"
-    header User-Agent "*GoogleOther-Image*"
-    header User-Agent "*GoogleOther-Video*"
-    header User-Agent "*GPTBot*"
-    header User-Agent "*iaskspider/2.0*"
-    header User-Agent "*ICC-Crawler*"
-    header User-Agent "*ImagesiftBot*"
-    header User-Agent "*img2dataset*"
-    header User-Agent "*imgproxy*"
-    header User-Agent "*ISSCyberRiskCrawler*"
-    header User-Agent "*Kangaroo Bot*"
-    header User-Agent "*Meta-ExternalAgent*"
-    header User-Agent "*Meta-ExternalFetcher*"
-    header User-Agent "*NovaAct*"
-    header User-Agent "*OAI-SearchBot*"
-    header User-Agent "*omgili*"
-    header User-Agent "*omgilibot*"
-    header User-Agent "*Operator*"
-    header User-Agent "*PanguBot*"
-    header User-Agent "*Perplexity-User*"
-    header User-Agent "*PerplexityBot*"
-    header User-Agent "*PetalBot*"
-    header User-Agent "*Scrapy*"
-    header User-Agent "*SemrushBot-OCOB*"
-    header User-Agent "*SemrushBot-SWA*"
-    header User-Agent "*Sidetrade indexer bot*"
-    header User-Agent "*TikTokSpider*"
-    header User-Agent "*Timpibot*"
-    header User-Agent "*VelenPublicWebCrawler*"
-    header User-Agent "*Webzio-Extended*"
-    header User-Agent "*YouBot*"
+    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
README.md (2 additions)
@@ -14,6 +14,7 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
+- `Caddyfile`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
@@ -22,6 +23,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
+`Caddyfile` includes a header regex matcher group you can copy or import into your Caddyfile; the rejection can then be handled as follows: `abort @aibots`.
 
 ## Contributing
 
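As a usage sketch (the site address and snippet path below are hypothetical, not part of this commit): save the generated matcher to a file, import it into a site block, and abort the matching requests:

    example.com {
        import /etc/caddy/ai-bots.caddy  # hypothetical path to the generated @aibots snippet
        abort @aibots
        file_server
    }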
code/robots.py (1 addition, 1 deletion)
@@ -180,7 +180,7 @@ def json_to_nginx(robot_json):
 
 def json_to_caddy(robot_json):
     caddyfile = "@aibots {\n    "
-    caddyfile += "\n    ".join(f'header User-Agent "*{k}*"' for k in robot_json.keys())
+    caddyfile += f'header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
     caddyfile += "\n}"
     return caddyfile
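`list_to_pcre` itself is not shown in this hunk. Judging from the escaped names in the test fixture below (for example `Kangaroo\ Bot` and `iaskspider/2\.0`), a plausible minimal sketch is that it escapes each robot name and joins the results into a single alternation:

    import re

    def list_to_pcre(names):
        # Escape regex metacharacters in each robot name (re.escape also
        # escapes spaces, which matches the fixture's "Kangaroo\ Bot") and
        # join the results into one (a|b|c) alternation group.
        return "(" + "|".join(map(re.escape, names)) + ")"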
code/test_files/Caddyfile (new file, 3 additions)
@@ -0,0 +1,3 @@
+@aibots {
+    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)"
+}
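Note the tail of the fixture's name list (`crawler\.with\.dots`, `star\*\*\*crawler`, `curl\|sudo\ bash`, and so on): each of these names contains a regex metacharacter, so comparing against this file exercises the escaping in `list_to_pcre`, not just the concatenation.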
code/tests.py (12 additions, 1 deletion)
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_caddy
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -66,6 +66,17 @@ class TestRobotsNameCleaning(unittest.TestCase):
         self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
+
+class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_caddyfile_generation(self):
+        robots_caddyfile = json_to_caddy(self.robots_dict)
+        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
+
 
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
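Because the suite changes into its own directory before running, the relative `test_files/...` paths resolve regardless of the invoking directory; presumably running `python3 code/tests.py` (or an equivalent unittest invocation) picks up the new `TestCaddyfileGeneration` case alongside the existing ones.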