Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-04-16 15:27:46 +00:00)
237 lines
8.2 KiB
Python
Executable file
#!/usr/bin/env python3

import json
import re

import requests

from bs4 import BeautifulSoup
from pathlib import Path


def load_robots_json():
    """Load the robots.json contents into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve current known agents from darkvisitors.com"""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
    except requests.exceptions.ConnectionError:
        print(
            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
        )
        return
    return BeautifulSoup(response.text, "html.parser")
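# Note: get_agent_soup() returns None when the site is unreachable, so callers
# must check the result before using it (see ingest_darkvisitors below).

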
def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = [
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        # "Archivers",
        # "Developer Helpers",
        # "Fetchers",
        # "Intelligence Gatherers",
        # "Scrapers",
        # "Search Engine Crawlers",
        # "SEO Crawlers",
        # "Uncategorized",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            name = clean_robot_name(name)
            desc = agent.find("p").get_text().strip()

            default_values = {
                "Unclear at this time.",
                "No information provided.",
                "No information.",
                "No explicit frequency provided.",
            }
            default_value = "Unclear at this time."

            # Parse the operator information from the description if possible
            operator = default_value
            if "operated by " in desc:
                try:
                    operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
                except Exception as e:
                    print(f"Error: {e}")
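            # e.g. a description like "FooBot is a web crawler operated by
            # Example Corp. It collects training data." yields
            # operator == "Example Corp" (name and text illustrative).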

            def consolidate(field: str, value: str) -> str:
                # New entry
                if name not in existing_content:
                    return value
                # New field
                if field not in existing_content[name]:
                    return value
                # Unclear value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
                ):
                    return value
                # Existing value
                return existing_content[name][field]
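            # Precedence: values already in robots.json win; scraped data only
            # fills new entries, new fields, or placeholder ("unclear") values.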

            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
                    f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}",
                ),
            }

    print(f"Total: {len(existing_content)}")
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    sorted_robots = {k: existing_content[k] for k in sorted_keys}
    return sorted_robots


def clean_robot_name(name):
    """Clean the robot name by replacing characters mangled by HTML rendering."""
    # This was specifically spotted in "Perplexity‑User", where a non-breaking
    # hyphen (U+2011) was introduced by the HTML rendering software. The source
    # page for Perplexity (https://docs.perplexity.ai/guides/bots) lists the
    # bot several times as "Perplexity-User" with a normal hyphen; only the
    # row heading has the special hyphen.
    #
    # Technically, there's no reason a bot couldn't someday use an actual
    # non-breaking hyphen in its name, but that seems unlikely, so this
    # replacement should be fine for now.
    return re.sub(r"\u2011", "-", name)
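# e.g. clean_robot_name("Perplexity\u2011User") -> "Perplexity-User"

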
def ingest_darkvisitors():
    """Refresh robots.json from darkvisitors.com and report whether it changed."""
    old_robots_json = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print(
            "robots.json is unchanged."
            if robots_json == old_robots_json
            else "robots.json got updates."
        )
        Path("./robots.json").write_text(
            json.dumps(robots_json, indent=4), encoding="utf-8"
        )


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
    robots_txt += "\nDisallow: /\n"
    return robots_txt
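# Example robots.txt output (bot names illustrative); the single Disallow
# applies to the whole group of user agents above it:
#   User-agent: ExampleBot
#   User-agent: Other-Bot
#   Disallow: /

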
def escape_md(s):
    """Escape markdown special characters so bot names render literally."""
    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
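# e.g. escape_md("Meta-ExternalAgent") -> "Meta\-ExternalAgent"

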
def json_to_table(robots_json):
    """Compose a markdown table with the information in robots.json"""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

    return table


def list_to_pcre(lst):
    """Build a PCRE-style alternation group from a list of bot names."""
    # Python re is not 100% identical to PCRE, which is used by Apache, but it
    # should be close enough in the real world for re.escape to work.
    formatted = "|".join(map(re.escape, lst))
    return f"({formatted})"
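# e.g. list_to_pcre(["ExampleBot", "Other-Bot"]) -> "(ExampleBot|Other\-Bot)"

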
def json_to_htaccess(robot_json):
    # Creates a .htaccess filter file. It uses a regular expression to filter
    # out user agents that contain any of the blocked values.
    htaccess = "RewriteEngine On\n"
    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
    return htaccess
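# Example .htaccess output (bot names illustrative); matching requests get a
# 403 for everything except robots.txt itself:
#   RewriteEngine On
#   RewriteCond %{HTTP_USER_AGENT} (ExampleBot|Other\-Bot) [NC]
#   RewriteRule !^/?robots\.txt$ - [F,L]

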
def json_to_nginx(robot_json):
    # Creates an Nginx config file. This config snippet can be included in
    # nginx server{} blocks to block AI bots.
    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
    return config
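# Example nginx snippet (bot names illustrative); ~* matches case-insensitively:
#   if ($http_user_agent ~* "(ExampleBot|Other\-Bot)") {
#       return 403;
#   }

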
def update_file_if_changed(file_name, converter):
    """Update files if newer content is available and log the (in)actions."""
    new_content = converter(load_robots_json())
    filepath = Path(file_name)
    # "touch" will create the file if it doesn't exist yet
    filepath.touch()
    old_content = filepath.read_text(encoding="utf-8")
    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        filepath.write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    """Triggers the conversions from the json file."""
    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
    update_file_if_changed(
        file_name="./table-of-bot-metrics.md",
        converter=json_to_table,
    )
    update_file_if_changed(
        file_name="./.htaccess",
        converter=json_to_htaccess,
    )
    update_file_if_changed(
        file_name="./nginx-block-ai-bots.conf",
        converter=json_to_nginx,
    )


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        prog="ai-robots",
        description="Collects and updates information about web scrapers of AI companies.",
        epilog="One of the flags must be set.\n",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file with data from darkvisitors.com/agents",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Create the robots.txt and markdown table from robots.json",
    )
    args = parser.parse_args()
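
    # Typical invocations (script file name illustrative):
    #   python3 code.py --update     # refresh robots.json from darkvisitors.com
    #   python3 code.py --convert    # regenerate robots.txt, .htaccess, etc.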

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()
    if args.convert:
        conversions()