Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-04-04 19:13:57 +00:00)
.htaccess: Make regex in RewriteCond safe
Improve the regular expression by removing unneeded anchors and escaping special characters (not just space) to prevent false positives or a misbehaving rewrite rule.
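As an illustration of the false positives the commit guards against: in the old pattern each name was inserted verbatim, so the unescaped "." in a name like "iaskspider/2.0" matches any character. A rough sketch using Python's re module as a stand-in for Apache's PCRE engine (the user-agent string is made up for the example):

    import re

    name = "iaskspider/2.0"
    ua = "Mozilla/5.0 (compatible; iaskspider/2X0)"  # hypothetical UA, not a real crawler

    # Old behaviour: the name goes into the pattern verbatim, so "." acts as a wildcard.
    print(bool(re.search(name, ua, re.IGNORECASE)))             # True  (false positive)

    # New behaviour: re.escape() backslashes ".", "-" and spaces, so only the literal name matches.
    print(bool(re.search(re.escape(name), ua, re.IGNORECASE)))  # False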
parent c0d418cd87
commit a884a2afb9

3 changed files with 12 additions and 11 deletions
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
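Dropping the "^.*(" and ").*$" wrapper does not change which requests are blocked: a RewriteCond pattern is not implicitly anchored, so the bare alternation already matches anywhere inside the User-Agent header. A minimal check, using Python's re.search (case-insensitive, like the [NC] flag) as an approximation of Apache's matcher and only a few entries from the alternation above:

    import re

    # Hypothetical full User-Agent header that merely contains one of the blocked names.
    ua = "Mozilla/5.0 AppleWebKit/537.36 (compatible; GPTBot/1.2; +https://example.com/bot)"

    # Trimmed version of the new, unanchored alternation.
    pattern = r"(GPTBot|ClaudeBot|Brightbot\ 1\.0)"

    # re.search scans the whole string, just like an unanchored CondPattern, so the
    # leading ^.* and trailing .*$ of the old rule added nothing.
    print(bool(re.search(pattern, ua, re.IGNORECASE)))  # True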
@@ -1,8 +1,9 @@
 import json
-from pathlib import Path
+import re
 
 import requests
 
 from bs4 import BeautifulSoup
+from pathlib import Path
 
 
 def load_robots_json():
@@ -99,7 +100,6 @@ def updated_robots_json(soup):
 
 
 def ingest_darkvisitors():
-
     old_robots_json = load_robots_json()
     soup = get_agent_soup()
     if soup:
@@ -132,16 +132,17 @@ def json_to_table(robots_json):
     return table
 
 
+def list_to_pcre(lst):
+    # Python re is not 100% identical to PCRE which is used by Apache, but it
+    # should probably be close enough in the real world for re.escape to work.
+    return f"({"|".join(map(re.escape, lst))})"
+
+
 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
     # User agents that contain any of the blocked values.
     htaccess = "RewriteEngine On\n"
-    htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
-
-    # Escape spaces in each User Agent to build the regular expression
-    robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
-    htaccess += "|".join(robots)
-    htaccess += ").*$ [NC]\n"
+    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
     htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
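Taken together, the two helpers added above generate the blocklist shown in the .htaccess diff that follows. A small self-contained sketch (the sample dictionary is a made-up subset of robots.json, and the inner quotes of the f-string are switched to single quotes so the snippet also runs on Python versions before 3.12):

    import re

    def list_to_pcre(lst):
        # re.escape backslashes ".", "-" and spaces, which is close enough to PCRE here.
        return f"({'|'.join(map(re.escape, lst))})"

    def json_to_htaccess(robot_json):
        # Same logic as the diff above: one escaped alternation, no ^.* / .*$ wrapper.
        htaccess = "RewriteEngine On\n"
        htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
        htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
        return htaccess

    sample = {"GPTBot": {}, "Brightbot 1.0": {}, "iaskspider/2.0": {}}
    print(json_to_htaccess(sample), end="")
    # RewriteEngine On
    # RewriteCond %{HTTP_USER_AGENT} (GPTBot|Brightbot\ 1\.0|iaskspider/2\.0) [NC]
    # RewriteRule !^/?robots\.txt$ - [F,L]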
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]