Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git
Synced 2025-04-12 05:57:45 +00:00

commit 6ab8fb2d37 (parent 7e2b3ab037)
no more failure when run without network

1 changed file with 60 additions and 9 deletions
```diff
@@ -5,12 +5,27 @@ import requests
 from bs4 import BeautifulSoup
 
 
-def get_updated_robots_json():
-    session = requests.Session()
-    response = session.get("https://darkvisitors.com/agents")
-    soup = BeautifulSoup(response.text, "html.parser")
+def load_robots_json():
+    """Load the robots.json contents into a dictionary."""
+    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))
+
 
-    existing_content = json.loads(Path("./robots.json").read_text())
+def get_agent_soup():
+    """Retrieve current known agents from darkvisitors.com"""
+    session = requests.Session()
+    try:
+        response = session.get("https://darkvisitors.com/agents")
+    except requests.exceptions.ConnectionError:
+        print(
+            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
+        )
+        return
+    return BeautifulSoup(response.text, "html.parser")
+
+
+def updated_robots_json(soup):
+    """Update AI scraper information with data from darkvisitors."""
+    existing_content = load_robots_json()
     to_include = [
         "AI Assistants",
         "AI Data Scrapers",
```
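The substance of the fix is the try/except around `session.get()`: with no network available, requests raises a `ConnectionError`, which `get_agent_soup()` now catches, printing an error and returning `None` instead of crashing the script. A minimal sketch of how that offline path could be exercised with `unittest.mock`; the `dark_visitors` import path is an assumption for illustration, not something this commit specifies:

```python
# Sketch only: the module name "dark_visitors" is assumed for illustration.
from unittest import mock

import requests

from dark_visitors import get_agent_soup  # hypothetical import path


def test_get_agent_soup_returns_none_offline():
    # Simulate running without a network: every Session.get call raises
    # ConnectionError, which the new try/except in get_agent_soup catches.
    with mock.patch(
        "requests.Session.get",
        side_effect=requests.exceptions.ConnectionError,
    ):
        assert get_agent_soup() is None  # prints the ERROR message, no traceback
```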
```diff
@@ -83,13 +98,31 @@ def get_updated_robots_json():
     return sorted_robots
 
 
+def ingest_darkvisitors():
+    old_robots_json = load_robots_json()
+    soup = get_agent_soup()
+    if soup:
+        robots_json = updated_robots_json(soup)
+        print(
+            "robots.json is unchanged."
+            if robots_json == old_robots_json
+            else "robots.json got updates."
+        )
+        Path("./robots.json").write_text(
+            json.dumps(robots_json, indent=4), encoding="utf-8"
+        )
+
+
 def json_to_txt(robots_json):
+    """Compose the robots.txt from the robots.json file."""
     robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
     robots_txt += "\nDisallow: /\n"
     return robots_txt
 
 
 def json_to_table(robots_json):
+    """Compose a markdown table with the information in robots.json"""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|-----|----------|-----------------------|----------|------------------|-------------|\n"
```
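To make `json_to_txt()` concrete, here is what it returns for a toy robots.json with two invented agent names; every key becomes a `User-agent` line, and a single `Disallow: /` closes the group:

```python
# Toy input; real entries in robots.json map agent names to metadata dicts.
robots_json = {"ExampleBot": {}, "OtherBot": {}}

print(json_to_txt(robots_json))
# User-agent: ExampleBot
# User-agent: OtherBot
# Disallow: /
```

Consecutive `User-agent` lines form one robots.txt group, so the single `Disallow: /` applies to every listed agent.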
```diff
@@ -99,8 +132,26 @@ def json_to_table(robots_json):
     return table
 
 
+def update_file_if_changed(file_name, converter):
+    """Update files if newer content is available and log the (in)actions."""
+    new_content = converter(load_robots_json())
+    old_content = Path(file_name).read_text(encoding="utf-8")
+    if old_content == new_content:
+        print(f"{file_name} is already up to date.")
+    else:
+        Path(file_name).write_text(new_content, encoding="utf-8")
+        print(f"{file_name} has been updated.")
+
+
+def conversions():
+    """Triggers the conversions from the json file."""
+    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
+    update_file_if_changed(
+        file_name="./table-of-bot-metrics.md",
+        converter=json_to_table,
+    )
+
+
 if __name__ == "__main__":
-    robots_json = get_updated_robots_json()
-    Path("./robots.json").write_text(json.dumps(robots_json, indent=4))
-    Path("./robots.txt").write_text(json_to_txt(robots_json))
-    Path("./table-of-bot-metrics.md").write_text(json_to_table(robots_json))
+    ingest_darkvisitors()
+    conversions()
```
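`update_file_if_changed()` only rewrites a file when the freshly converted content differs from what is on disk, and logs either way. A small usage sketch, assuming the script's functions are importable and a robots.json sits in the working directory; note the target file must already exist, since it is read before the comparison:

```python
from pathlib import Path

# Seed the target with stale content; read_text() would raise
# FileNotFoundError if the file were missing.
Path("./robots.txt").write_text("stale\n", encoding="utf-8")

update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
# -> "./robots.txt has been updated."
update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
# -> "./robots.txt is already up to date."
```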