Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git, synced 2025-04-04 19:13:57 +00:00
Merge pull request #68 from MassiminoilTrace/main
Implementing automatic htaccess generation
Commit 81cc81b35e
5 changed files with 48 additions and 3 deletions
.htaccess (new file, 3 lines)

@@ -0,0 +1,3 @@
+RewriteEngine On
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteRule .* - [F,L]
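These three directives make Apache reject any request whose User-Agent header contains one of the listed names: [NC] makes the match case-insensitive and [F] answers with 403 Forbidden. As a rough illustration only (Apache, not Python, evaluates the real rule), the matching behaviour can be approximated like this, with a shortened bot list and a made-up User-Agent string:

    import re

    # Sketch of the RewriteCond above: case-insensitive, matches anywhere in the
    # User-Agent value. The three names and the sample header are placeholders.
    blocked = re.compile(r"^.*(GPTBot|ClaudeBot|Kangaroo\ Bot).*$", re.IGNORECASE)
    ua = "Mozilla/5.0 (compatible; gptbot/1.2; +https://openai.com/gptbot)"
    print(bool(blocked.match(ua)))  # True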
README.md (14 changes)

@@ -8,9 +8,21 @@ A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com/)
 
 If you'd like to add information about a crawler to the list, please make a pull request with the bot name added to `robots.txt`, `ai.txt`, and any relevant details in `table-of-bot-metrics.md` to help people understand what's crawling.
 
+## Usage
+
+This repository provides the following files:
+
+- `robots.txt`
+- `.htaccess`
+
+`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
+
+`.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
+Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist.
+
 ## Contributing
 
-A note about contributing: updates should be added/made to `robots.json`. A GitHub action, courtesy of [Adam](https://github.com/newbold), will then generate the updated `robots.txt` and `table-of-bot-metrics.md`.
+A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`.
 
 ## Subscribe to updates
 
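For anyone who wants to regenerate the outputs without waiting for the GitHub action, the conversion functions can in principle be called directly. The snippet below is a minimal sketch under stated assumptions: it runs from the directory containing `robots.py`, and `load_robots_json` (already used by `update_file_if_changed` in the diff below) locates `robots.json` on its own.

    # Minimal sketch: regenerate .htaccess locally from robots.json.
    # Assumes robots.py is importable from the current directory and that
    # load_robots_json() resolves the path to robots.json itself.
    from robots import load_robots_json, json_to_htaccess

    with open(".htaccess", "w", encoding="utf-8") as fh:
        fh.write(json_to_htaccess(load_robots_json()))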
@@ -132,10 +132,27 @@ def json_to_table(robots_json):
     return table
 
 
+def json_to_htaccess(robot_json):
+    # Creates a .htaccess filter file. It uses a regular expression to filter out
+    # User agents that contain any of the blocked values.
+    htaccess = "RewriteEngine On\n"
+    htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
+
+    # Escape spaces in each User Agent to build the regular expression
+    robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
+    htaccess += "|".join(robots)
+    htaccess += ").*$ [NC]\n"
+    htaccess += "RewriteRule .* - [F,L]"
+    return htaccess
+
+
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
-    old_content = Path(file_name).read_text(encoding="utf-8")
+    filepath = Path(file_name)
+    # "touch" will create the file if it doesn't exist yet
+    filepath.touch()
+    old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:

@@ -150,6 +167,10 @@ def conversions():
         file_name="./table-of-bot-metrics.md",
         converter=json_to_table,
     )
+    update_file_if_changed(
+        file_name="./.htaccess",
+        converter=json_to_htaccess,
+    )
 
 
 if __name__ == "__main__":
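As a quick usage illustration of `json_to_htaccess` and its space-escaping step, consider the following sketch. The two-entry dict is invented for the example and merely stands in for the parsed `robots.json`; the import assumes `robots.py` is on the import path, as it is for the tests.

    from robots import json_to_htaccess

    # Hypothetical two-bot input standing in for the parsed robots.json contents.
    sample = {"GPTBot": {}, "Kangaroo Bot": {}}

    print(json_to_htaccess(sample))
    # Expected output (note the escaped space in "Kangaroo\ Bot"):
    # RewriteEngine On
    # RewriteCond %{HTTP_USER_AGENT} ^.*(GPTBot|Kangaroo\ Bot).*$ [NC]
    # RewriteRule .* - [F,L]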
code/test_files/.htaccess (new file, 3 lines)

@@ -0,0 +1,3 @@
+RewriteEngine On
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteRule .* - [F,L]
@@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest`
 import json
 from pathlib import Path
 
-from robots import json_to_txt, json_to_table
+from robots import json_to_txt, json_to_table, json_to_htaccess
 
 
 def test_robots_txt_creation():

@@ -19,3 +19,9 @@ def test_table_of_bot_metrices_md():
     robots_json = json.loads(Path("test_files/robots.json").read_text())
     robots_table = json_to_table(robots_json)
     assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table
+
+
+def test_htaccess_creation():
+    robots_json = json.loads(Path("test_files/robots.json").read_text())
+    robots_htaccess = json_to_htaccess(robots_json)
+    assert Path("test_files/.htaccess").read_text() == robots_htaccess