mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-05-18 08:13:12 +00:00
Compare commits
No commits in common. "91a88e2fa8086b2f0236da0fdc4ed6788784acaa" and "c6c7f1748f1e28053184539a70a6a08f5aeabc37" have entirely different histories.
91a88e2fa8
...
c6c7f1748f
5 changed files with 1 additions and 133 deletions
|
@ -14,7 +14,6 @@ This repository provides the following files:
|
||||||
- `robots.txt`
|
- `robots.txt`
|
||||||
- `.htaccess`
|
- `.htaccess`
|
||||||
- `nginx-block-ai-bots.conf`
|
- `nginx-block-ai-bots.conf`
|
||||||
- `haproxy-block-ai-bots.txt`
|
|
||||||
|
|
||||||
`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
|
`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
|
||||||
|
|
||||||
|
@ -23,14 +22,6 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
|
||||||
|
|
||||||
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
|
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
|
||||||
|
|
||||||
`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
|
|
||||||
1. Add the file to the config directory of HAProxy
|
|
||||||
2. Add the following lines in the `frontend` section:
|
|
||||||
```
|
|
||||||
acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
|
|
||||||
http-request deny if ai_robot
|
|
||||||
```
|
|
||||||
(Note that the path to `haproxy-block-ai-bots.txt` may be different in your environment.)
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|
|
@ -178,11 +178,6 @@ def json_to_nginx(robot_json):
|
||||||
config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
|
config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
|
||||||
return config
|
return config
|
||||||
|
|
||||||
def json_to_haproxy(robots_json):
|
|
||||||
# Creates a source file for HAProxy. Follow instructions in the README to implement it.
|
|
||||||
txt = "\n".join(f"{k}" for k in robots_json.keys())
|
|
||||||
return txt
|
|
||||||
|
|
||||||
|
|
||||||
def update_file_if_changed(file_name, converter):
|
def update_file_if_changed(file_name, converter):
|
||||||
"""Update files if newer content is available and log the (in)actions."""
|
"""Update files if newer content is available and log the (in)actions."""
|
||||||
|
@ -213,10 +208,6 @@ def conversions():
|
||||||
file_name="./nginx-block-ai-bots.conf",
|
file_name="./nginx-block-ai-bots.conf",
|
||||||
converter=json_to_nginx,
|
converter=json_to_nginx,
|
||||||
)
|
)
|
||||||
update_file_if_changed(
|
|
||||||
file_name="./haproxy-block-ai-bots.txt",
|
|
||||||
converter=json_to_haproxy,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,47 +0,0 @@
|
||||||
AI2Bot
|
|
||||||
Ai2Bot-Dolma
|
|
||||||
Amazonbot
|
|
||||||
anthropic-ai
|
|
||||||
Applebot
|
|
||||||
Applebot-Extended
|
|
||||||
Bytespider
|
|
||||||
CCBot
|
|
||||||
ChatGPT-User
|
|
||||||
Claude-Web
|
|
||||||
ClaudeBot
|
|
||||||
cohere-ai
|
|
||||||
Diffbot
|
|
||||||
FacebookBot
|
|
||||||
facebookexternalhit
|
|
||||||
FriendlyCrawler
|
|
||||||
Google-Extended
|
|
||||||
GoogleOther
|
|
||||||
GoogleOther-Image
|
|
||||||
GoogleOther-Video
|
|
||||||
GPTBot
|
|
||||||
iaskspider/2.0
|
|
||||||
ICC-Crawler
|
|
||||||
ImagesiftBot
|
|
||||||
img2dataset
|
|
||||||
ISSCyberRiskCrawler
|
|
||||||
Kangaroo Bot
|
|
||||||
Meta-ExternalAgent
|
|
||||||
Meta-ExternalFetcher
|
|
||||||
OAI-SearchBot
|
|
||||||
omgili
|
|
||||||
omgilibot
|
|
||||||
Perplexity-User
|
|
||||||
PerplexityBot
|
|
||||||
PetalBot
|
|
||||||
Scrapy
|
|
||||||
Sidetrade indexer bot
|
|
||||||
Timpibot
|
|
||||||
VelenPublicWebCrawler
|
|
||||||
Webzio-Extended
|
|
||||||
YouBot
|
|
||||||
crawler.with.dots
|
|
||||||
star***crawler
|
|
||||||
Is this a crawler?
|
|
||||||
a[mazing]{42}(robot)
|
|
||||||
2^32$
|
|
||||||
curl|sudo bash
|
|
|
@ -4,7 +4,7 @@
|
||||||
import json
|
import json
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
|
from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
|
||||||
|
|
||||||
class RobotsUnittestExtensions:
|
class RobotsUnittestExtensions:
|
||||||
def loadJson(self, pathname):
|
def loadJson(self, pathname):
|
||||||
|
@ -60,16 +60,6 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
|
||||||
robots_nginx = json_to_nginx(self.robots_dict)
|
robots_nginx = json_to_nginx(self.robots_dict)
|
||||||
self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
|
self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
|
||||||
|
|
||||||
class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
|
|
||||||
maxDiff = 8192
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
self.robots_dict = self.loadJson("test_files/robots.json")
|
|
||||||
|
|
||||||
def test_haproxy_generation(self):
|
|
||||||
robots_haproxy = json_to_haproxy(self.robots_dict)
|
|
||||||
self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
|
|
||||||
|
|
||||||
class TestRobotsNameCleaning(unittest.TestCase):
|
class TestRobotsNameCleaning(unittest.TestCase):
|
||||||
def test_clean_name(self):
|
def test_clean_name(self):
|
||||||
from robots import clean_robot_name
|
from robots import clean_robot_name
|
||||||
|
|
|
@ -1,57 +0,0 @@
|
||||||
AI2Bot
|
|
||||||
Ai2Bot-Dolma
|
|
||||||
aiHitBot
|
|
||||||
Amazonbot
|
|
||||||
anthropic-ai
|
|
||||||
Applebot
|
|
||||||
Applebot-Extended
|
|
||||||
Brightbot 1.0
|
|
||||||
Bytespider
|
|
||||||
CCBot
|
|
||||||
ChatGPT-User
|
|
||||||
Claude-Web
|
|
||||||
ClaudeBot
|
|
||||||
cohere-ai
|
|
||||||
cohere-training-data-crawler
|
|
||||||
Cotoyogi
|
|
||||||
Crawlspace
|
|
||||||
Diffbot
|
|
||||||
DuckAssistBot
|
|
||||||
FacebookBot
|
|
||||||
Factset_spyderbot
|
|
||||||
FirecrawlAgent
|
|
||||||
FriendlyCrawler
|
|
||||||
Google-Extended
|
|
||||||
GoogleOther
|
|
||||||
GoogleOther-Image
|
|
||||||
GoogleOther-Video
|
|
||||||
GPTBot
|
|
||||||
iaskspider/2.0
|
|
||||||
ICC-Crawler
|
|
||||||
ImagesiftBot
|
|
||||||
img2dataset
|
|
||||||
imgproxy
|
|
||||||
ISSCyberRiskCrawler
|
|
||||||
Kangaroo Bot
|
|
||||||
meta-externalagent
|
|
||||||
Meta-ExternalAgent
|
|
||||||
meta-externalfetcher
|
|
||||||
Meta-ExternalFetcher
|
|
||||||
NovaAct
|
|
||||||
OAI-SearchBot
|
|
||||||
omgili
|
|
||||||
omgilibot
|
|
||||||
Operator
|
|
||||||
PanguBot
|
|
||||||
Perplexity-User
|
|
||||||
PerplexityBot
|
|
||||||
PetalBot
|
|
||||||
Scrapy
|
|
||||||
SemrushBot-OCOB
|
|
||||||
SemrushBot-SWA
|
|
||||||
Sidetrade indexer bot
|
|
||||||
TikTokSpider
|
|
||||||
Timpibot
|
|
||||||
VelenPublicWebCrawler
|
|
||||||
Webzio-Extended
|
|
||||||
YouBot
|
|
Loading…
Add table
Add a link
Reference in a new issue