Implementing htaccess generation

This commit is contained in:
Massimo Gismondi 2025-01-07 11:02:29 +01:00
parent b7f908e305
commit 933aa6159d
4 changed files with 34 additions and 2 deletions

3
.htaccess Normal file
View file

@ -0,0 +1,3 @@
RewriteEngine On
RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
RewriteRule .* - [F,L]

View file

@ -132,10 +132,26 @@ def json_to_table(robots_json):
return table
def json_to_htaccess(robot_json):
htaccess = "RewriteEngine On\n"
htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
htaccess += "|".join(robots)
htaccess += ").*$ [NC]\n"
htaccess += "RewriteRule .* - [F,L]"
return htaccess
def update_file_if_changed(file_name, converter):
"""Update files if newer content is available and log the (in)actions."""
new_content = converter(load_robots_json())
old_content = Path(file_name).read_text(encoding="utf-8")
filepath = Path(file_name)
if not filepath.exists():
filepath.write_text(new_content, encoding="utf-8")
print(f"{file_name} has been created.")
return
old_content = filepath.read_text(encoding="utf-8")
if old_content == new_content:
print(f"{file_name} is already up to date.")
else:
@ -150,6 +166,10 @@ def conversions():
file_name="./table-of-bot-metrics.md",
converter=json_to_table,
)
update_file_if_changed(
file_name="./.htaccess",
converter=json_to_htaccess,
)
if __name__ == "__main__":

View file

@ -0,0 +1,3 @@
RewriteEngine On
RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
RewriteRule .* - [F,L]

View file

@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest`
import json
from pathlib import Path
from robots import json_to_txt, json_to_table
from robots import json_to_txt, json_to_table, json_to_htaccess
def test_robots_txt_creation():
@ -19,3 +19,9 @@ def test_table_of_bot_metrices_md():
robots_json = json.loads(Path("test_files/robots.json").read_text())
robots_table = json_to_table(robots_json)
assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table
def test_htaccess_creation():
robots_json = json.loads(Path("test_files/robots.json").read_text())
robots_htaccess = json_to_htaccess(robots_json)
assert Path("test_files/.htaccess").read_text() == robots_htaccess