From a884a2afb9dbc7338b0faa24b3c10308adbc48e4 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Mon, 17 Feb 2025 21:00:57 +0100
Subject: [PATCH] .htaccess: Make regex in RewriteCond safe

Improve the regular expression by removing unneeded anchors and
escaping special characters (not just space) to prevent false positives
or a misbehaving rewrite rule.
---
 .htaccess                 |  2 +-
 code/robots.py            | 19 ++++++++++---------
 code/test_files/.htaccess |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/.htaccess b/.htaccess
index c42f99e..2313293 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/robots.py b/code/robots.py
index bb18e70..a8a674d 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -1,8 +1,9 @@
 import json
+import re
 from pathlib import Path
 
 import requests
 from bs4 import BeautifulSoup
 
 
 def load_robots_json():
@@ -99,7 +100,6 @@ def updated_robots_json(soup):
 
 
 def ingest_darkvisitors():
-
     old_robots_json = load_robots_json()
     soup = get_agent_soup()
     if soup:
@@ -132,16 +132,17 @@ def json_to_table(robots_json):
     return table
 
 
+def list_to_pcre(lst):
+    # Python re is not 100% identical to PCRE which is used by Apache, but it
+    # should probably be close enough in the real world for re.escape to work.
+    return f"({'|'.join(map(re.escape, lst))})"
+
+
 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
     # User agents that contain any of the blocked values.
     htaccess = "RewriteEngine On\n"
-    htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
-
-    # Escape spaces in each User Agent to build the regular expression
-    robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
-    htaccess += "|".join(robots)
-    htaccess += ").*$ [NC]\n"
+    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
     htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
 
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index 2e78674..90ddcf2 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]