From b455af66e7903e76162d43f3e8f0900084fb9539 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Fri, 17 Jan 2025 21:42:08 +0100 Subject: [PATCH] Adding clarification about performance and code comment --- README.md | 3 ++- code/robots.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 45c8f3a..dd84a16 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,9 @@ The first one tells search engine and AI crawlers which parts of your website sh The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. -We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. +Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. +We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide an identifiable User Agent by which we can filter them out.
## Contributing diff --git a/code/robots.py b/code/robots.py index d35d74b..f2ddbb8 100644 --- a/code/robots.py +++ b/code/robots.py @@ -133,7 +133,9 @@ def json_to_table(robots_json): def json_to_htaccess(robot_json): - htaccess = "RewriteEngine On\n" + # Creates a .htaccess filter file. It uses a regular expression to filter out + # User agents that contain any of the blocked values. + htaccess = "RewriteEngine On\n" htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())