diff --git a/.htaccess b/.htaccess new file mode 100644 index 0000000..31ba5f7 --- /dev/null +++ b/.htaccess @@ -0,0 +1,3 @@ +RewriteEngine On +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteRule .* - [F,L] \ No newline at end of file diff --git a/README.md b/README.md index b3c2e7c..065b0b7 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,21 @@ A number of these crawlers have been sourced from [Dark Visitors](https://darkvi If you'd like to add information about a crawler to the list, please make a pull request with the bot name added to `robots.txt`, `ai.txt`, and any relevant details in `table-of-bot-metrics.md` to help people understand what's crawling. +## Usage + +This repository provides the following files: +- `robots.txt` +- `.htaccess` + +`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). + +`.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. +Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. + + ## Contributing -A note about contributing: updates should be added/made to `robots.json`. A GitHub action, courtesy of [Adam](https://github.com/newbold), will then generate the updated `robots.txt` and `table-of-bot-metrics.md`. +A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. ## Subscribe to updates diff --git a/code/robots.py b/code/robots.py index cf44e8e..087b00b 100644 --- a/code/robots.py +++ b/code/robots.py @@ -132,10 +132,27 @@ def json_to_table(robots_json): return table +def json_to_htaccess(robot_json): + # Creates a .htaccess filter file. It uses a regular expression to filter out + # User agents that contain any of the blocked values. + htaccess = "RewriteEngine On\n" + htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" + + # Escape spaces in each User Agent to build the regular expression + robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) + htaccess += "|".join(robots) + htaccess += ").*$ [NC]\n" + htaccess += "RewriteRule .* - [F,L]" + return htaccess + + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) - old_content = Path(file_name).read_text(encoding="utf-8") + filepath = Path(file_name) + # "touch" will create the file if it doesn't exist yet + filepath.touch() + old_content = filepath.read_text(encoding="utf-8") if old_content == new_content: print(f"{file_name} is already up to date.") else: @@ -150,6 +167,10 @@ def conversions(): file_name="./table-of-bot-metrics.md", converter=json_to_table, ) + update_file_if_changed( + file_name="./.htaccess", + converter=json_to_htaccess, + ) if __name__ == "__main__": diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess new file mode 100644 index 0000000..a34bf55 --- /dev/null +++ b/code/test_files/.htaccess @@ -0,0 +1,3 @@ +RewriteEngine On +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteRule .* - [F,L] \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index 9cf35fe..6f778c3 100644 --- a/code/tests.py +++ b/code/tests.py @@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest` import json from pathlib import Path -from robots import json_to_txt, json_to_table +from robots import json_to_txt, json_to_table, json_to_htaccess def test_robots_txt_creation(): @@ -19,3 +19,9 @@ def test_table_of_bot_metrices_md(): robots_json = json.loads(Path("test_files/robots.json").read_text()) robots_table = json_to_table(robots_json) assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table + + +def test_htaccess_creation(): + robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_htaccess = json_to_htaccess(robots_json) + assert Path("test_files/.htaccess").read_text() == robots_htaccess