Compare commits


4 commits

Author SHA1 Message Date
Glyn Normington
91a88e2fa8 Merge pull request #113 from rwijnen-um/feature/haproxy 2025-04-28 09:00:16 +01:00
Rik Wijnen
a4a9f2ac2b Tests for HAProxy file added. 2025-04-28 09:30:26 +02:00
Rik Wijnen
66da70905f Fixed incorrect English sentence. 2025-04-28 09:09:40 +02:00
Rik Wijnen
50e739dd73 HAProxy converter added. 2025-04-28 08:51:02 +02:00
5 changed files with 133 additions and 1 deletion

README.md

@@ -14,6 +14,7 @@ This repository provides the following files:
- `robots.txt`
- `.htaccess`
- `nginx-block-ai-bots.conf`
- `haproxy-block-ai-bots.txt`
`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
@@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
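For example, a minimal sketch of that `include` usage (the server details and snippet path are placeholders, not part of this repository):

```
server {
    listen 80;
    server_name example.com;

    # Placeholder path; use wherever nginx-block-ai-bots.conf is installed.
    include /etc/nginx/snippets/nginx-block-ai-bots.conf;
}
```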
`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
1. Add the file to HAProxy's configuration directory.
2. Add the following lines to the `frontend` section:
```
acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
http-request deny if ai_robot
```
(Note that the path to `haproxy-block-ai-bots.txt` may differ in your environment.)
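For context, a minimal sketch of a complete `frontend` section with these lines in place (the frontend name, bind address, and backend name are placeholders):

```
frontend web
    bind *:80
    # Deny any request whose User-Agent contains a line from the blocklist.
    acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
    http-request deny if ai_robot
    default_backend app_servers
```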
## Contributing

robots.py

@@ -178,6 +178,11 @@ def json_to_nginx(robot_json):
    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
    return config

def json_to_haproxy(robots_json):
    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
    txt = "\n".join(f"{k}" for k in robots_json.keys())
    return txt
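The converter simply emits one user-agent substring per line, which is the format HAProxy's `hdr_sub ... -f` lookup expects. A quick illustrative sketch (the sample dict is hypothetical; the real `robots.json` maps each bot name to a metadata dict):

```python
from robots import json_to_haproxy  # assumes robots.py is importable

sample = {"GPTBot": {}, "CCBot": {}}  # stand-in for robots.json contents
print(json_to_haproxy(sample))
# GPTBot
# CCBot
```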
def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
@@ -208,6 +213,10 @@ def conversions():
        file_name="./nginx-block-ai-bots.conf",
        converter=json_to_nginx,
    )
    update_file_if_changed(
        file_name="./haproxy-block-ai-bots.txt",
        converter=json_to_haproxy,
    )
if __name__ == "__main__":

test_files/haproxy-block-ai-bots.txt (new file)

@@ -0,0 +1,47 @@
AI2Bot
Ai2Bot-Dolma
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
Diffbot
FacebookBot
facebookexternalhit
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
ISSCyberRiskCrawler
Kangaroo Bot
Meta-ExternalAgent
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
Perplexity-User
PerplexityBot
PetalBot
Scrapy
Sidetrade indexer bot
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot
crawler.with.dots
star***crawler
Is this a crawler?
a[mazing]{42}(robot)
2^32$
curl|sudo bash

tests.py

@@ -4,7 +4,7 @@
import json
import unittest
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy

class RobotsUnittestExtensions:
    def loadJson(self, pathname):
@@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
        robots_nginx = json_to_nginx(self.robots_dict)
        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_haproxy_generation(self):
        robots_haproxy = json_to_haproxy(self.robots_dict)
        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
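`assertEqualsFile` is provided by `RobotsUnittestExtensions` and its definition is not shown in this diff; presumably it compares generated text against a fixture file, along these lines (a hedged sketch, not the project's actual helper):

```python
# Hypothetical sketch of the mixin helper used above.
def assertEqualsFile(self, pathname, content):
    # Read the fixture and diff it against the generated text.
    with open(pathname, "rt") as f:
        self.assertMultiLineEqual(f.read(), content)
```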
class TestRobotsNameCleaning(unittest.TestCase):
    def test_clean_name(self):
        from robots import clean_robot_name

haproxy-block-ai-bots.txt (new file)

@@ -0,0 +1,57 @@
AI2Bot
Ai2Bot-Dolma
aiHitBot
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Brightbot 1.0
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
cohere-training-data-crawler
Cotoyogi
Crawlspace
Diffbot
DuckAssistBot
FacebookBot
Factset_spyderbot
FirecrawlAgent
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
imgproxy
ISSCyberRiskCrawler
Kangaroo Bot
meta-externalagent
Meta-ExternalAgent
meta-externalfetcher
Meta-ExternalFetcher
NovaAct
OAI-SearchBot
omgili
omgilibot
Operator
PanguBot
Perplexity-User
PerplexityBot
PetalBot
Scrapy
SemrushBot-OCOB
SemrushBot-SWA
Sidetrade indexer bot
TikTokSpider
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot