mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-05-17 16:03:10 +00:00
HAProxy converter added.
This commit is contained in:
parent
c6c7f1748f
commit
50e739dd73
3 changed files with 75 additions and 0 deletions
|
@ -14,6 +14,7 @@ This repository provides the following files:
|
||||||
- `robots.txt`
|
- `robots.txt`
|
||||||
- `.htaccess`
|
- `.htaccess`
|
||||||
- `nginx-block-ai-bots.conf`
|
- `nginx-block-ai-bots.conf`
|
||||||
|
- `haproxy-block-ai-bots.txt`
|
||||||
|
|
||||||
`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
|
`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
|
||||||
|
|
||||||
|
@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
|
||||||
|
|
||||||
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
|
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
|
||||||
|
|
||||||
|
`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
|
||||||
|
1. Add the file to the HAProxy configuration directory.
|
||||||
|
2. Add the following lines to the `frontend` section:
|
||||||
|
```
|
||||||
|
acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
|
||||||
|
http-request deny if ai_robot
|
||||||
|
```
|
||||||
|
(Note that the path to `haproxy-block-ai-bots.txt` may be different in your environment.)
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
|
|
@ -178,6 +178,11 @@ def json_to_nginx(robot_json):
|
||||||
config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
|
config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
def json_to_haproxy(robots_json):
|
||||||
|
# Creates a source file for HAProxy. Follow instructions in the README to implement it.
|
||||||
|
txt = "\n".join(f"{k}" for k in robots_json.keys())
|
||||||
|
return txt
|
||||||
|
|
||||||
|
|
||||||
def update_file_if_changed(file_name, converter):
|
def update_file_if_changed(file_name, converter):
|
||||||
"""Update files if newer content is available and log the (in)actions."""
|
"""Update files if newer content is available and log the (in)actions."""
|
||||||
|
@ -208,6 +213,10 @@ def conversions():
|
||||||
file_name="./nginx-block-ai-bots.conf",
|
file_name="./nginx-block-ai-bots.conf",
|
||||||
converter=json_to_nginx,
|
converter=json_to_nginx,
|
||||||
)
|
)
|
||||||
|
update_file_if_changed(
|
||||||
|
file_name="./haproxy-block-ai-bots.txt",
|
||||||
|
converter=json_to_haproxy,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
57
haproxy-block-ai-bots.txt
Normal file
57
haproxy-block-ai-bots.txt
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
AI2Bot
|
||||||
|
Ai2Bot-Dolma
|
||||||
|
aiHitBot
|
||||||
|
Amazonbot
|
||||||
|
anthropic-ai
|
||||||
|
Applebot
|
||||||
|
Applebot-Extended
|
||||||
|
Brightbot 1.0
|
||||||
|
Bytespider
|
||||||
|
CCBot
|
||||||
|
ChatGPT-User
|
||||||
|
Claude-Web
|
||||||
|
ClaudeBot
|
||||||
|
cohere-ai
|
||||||
|
cohere-training-data-crawler
|
||||||
|
Cotoyogi
|
||||||
|
Crawlspace
|
||||||
|
Diffbot
|
||||||
|
DuckAssistBot
|
||||||
|
FacebookBot
|
||||||
|
Factset_spyderbot
|
||||||
|
FirecrawlAgent
|
||||||
|
FriendlyCrawler
|
||||||
|
Google-Extended
|
||||||
|
GoogleOther
|
||||||
|
GoogleOther-Image
|
||||||
|
GoogleOther-Video
|
||||||
|
GPTBot
|
||||||
|
iaskspider/2.0
|
||||||
|
ICC-Crawler
|
||||||
|
ImagesiftBot
|
||||||
|
img2dataset
|
||||||
|
imgproxy
|
||||||
|
ISSCyberRiskCrawler
|
||||||
|
Kangaroo Bot
|
||||||
|
meta-externalagent
|
||||||
|
Meta-ExternalAgent
|
||||||
|
meta-externalfetcher
|
||||||
|
Meta-ExternalFetcher
|
||||||
|
NovaAct
|
||||||
|
OAI-SearchBot
|
||||||
|
omgili
|
||||||
|
omgilibot
|
||||||
|
Operator
|
||||||
|
PanguBot
|
||||||
|
Perplexity-User
|
||||||
|
PerplexityBot
|
||||||
|
PetalBot
|
||||||
|
Scrapy
|
||||||
|
SemrushBot-OCOB
|
||||||
|
SemrushBot-SWA
|
||||||
|
Sidetrade indexer bot
|
||||||
|
TikTokSpider
|
||||||
|
Timpibot
|
||||||
|
VelenPublicWebCrawler
|
||||||
|
Webzio-Extended
|
||||||
|
YouBot
|
Loading…
Add table
Add a link
Reference in a new issue