Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git, synced 2025-04-04 19:13:57 +00:00
Merge pull request #94 from ThomasLeister/feature/implement-nginx-configuration-snippet-export

Implement Nginx configuration snippet export

Commit dba03d809c · 5 changed files with 31 additions and 2 deletions
README.md

@@ -13,16 +13,19 @@ If you'd like to add information about a crawler to the list, please make a pull
 This repository provides the following files:
 
 - `robots.txt`
 - `.htaccess`
+- `nginx-block-ai-bots.conf`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
 `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
 Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist.
 
+`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
+
 ## Contributing
 
-A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`.
+A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess`, and `nginx-block-ai-bots.conf`.
 
 You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing:
 ```console
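The new README paragraph above describes including the snippet via the `include` directive. As a concrete sketch, a virtual host using it might look like the following; the snippet path and server name are illustrative assumptions, not part of this change:

```nginx
server {
    listen 80;
    server_name example.com;  # placeholder host

    # Path assumes the generated file was copied into the Nginx config tree.
    include /etc/nginx/snippets/nginx-block-ai-bots.conf;

    location / {
        root /var/www/html;
    }
}
```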
code/robots.py

@@ -152,6 +152,12 @@ def json_to_htaccess(robot_json):
     htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
 
+def json_to_nginx(robot_json):
+    # Creates an Nginx config file. This config snippet can be included in
+    # nginx server{} blocks to block AI bots.
+    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
+    return config
+
 
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
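`json_to_nginx` depends on a `list_to_pcre` helper that predates this diff and is not shown. Judging from the escaping visible in the generated test fixture later in this diff (dashes, dots, and spaces are backslash-escaped, and all names are joined into one alternation group), a minimal sketch could look like this; it is a reconstruction under those assumptions, not necessarily the repository's actual implementation:

```python
import re


def list_to_pcre(names):
    # Escape regex metacharacters in each user-agent name so it matches
    # literally, then join the names into a single alternation group, e.g.
    # ["GPTBot", "Kangaroo Bot"] -> r"(GPTBot|Kangaroo\ Bot)"
    return "(" + "|".join(re.escape(name) for name in names) + ")"
```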
@@ -178,6 +184,10 @@ def conversions():
         file_name="./.htaccess",
         converter=json_to_htaccess,
     )
+    update_file_if_changed(
+        file_name="./nginx-block-ai-bots.conf",
+        converter=json_to_nginx,
+    )
 
 
 if __name__ == "__main__":
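Only the docstring and call sites of `update_file_if_changed` appear in this diff. A minimal sketch of the pattern they imply follows; the body details (data loading, log wording) are assumptions for illustration only:

```python
import json
from pathlib import Path


def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    # Assumption: all outputs are generated from the repository's robots.json.
    robot_json = json.loads(Path("./robots.json").read_text())
    new_content = converter(robot_json)
    path = Path(file_name)
    if path.exists() and path.read_text() == new_content:
        print(f"{file_name} is already up to date.")
    else:
        path.write_text(new_content)
        print(f"{file_name} has been updated.")
```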
code/test_files/nginx-block-ai-bots.conf (new file, 3 lines)

@@ -0,0 +1,3 @@
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
+    return 403;
+}
code/tests.py

@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -50,6 +50,16 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
 
+
+class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_nginx_generation(self):
+        robots_nginx = json_to_nginx(self.robots_dict)
+        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
 
 if __name__ == "__main__":
     import os
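`assertEqualsFile` is provided by `RobotsUnittestExtensions` and is not part of this diff. Given how it is called above, and the `maxDiff` override that suggests a multi-line comparison, a plausible sketch would be the following (an assumption, not the repository's code):

```python
class RobotsUnittestExtensions:
    # Sketch of the fixture-comparison helper used by the tests above:
    # compare generated output against a checked-in fixture file;
    # assertMultiLineEqual prints a readable diff on mismatch.
    def assertEqualsFile(self, pathname, content):
        with open(pathname, "r", encoding="utf-8") as f:
            self.assertMultiLineEqual(f.read(), content)
```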
nginx-block-ai-bots.conf (new file, 3 lines)

@@ -0,0 +1,3 @@
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+    return 403;
+}
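After copying the snippet into place and reloading Nginx, one way to confirm the block works is to request a page with one of the listed user agents; the host below is a placeholder, and `403` is the expected status given the `return 403;` directive above:

```console
$ curl -s -o /dev/null -w "%{http_code}\n" -A "GPTBot" https://example.com/
403
```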