mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-19 00:26:01 +00:00
parent
6b0349f37d
commit
5f5a89c38c
12 changed files with 42 additions and 13 deletions
|
@ -50,6 +50,7 @@ def updated_robots_json(soup):
|
|||
continue
|
||||
for agent in section.find_all("a", href=True):
|
||||
name = agent.find("div", {"class": "agent-name"}).get_text().strip()
|
||||
name = clean_robot_name(name)
|
||||
desc = agent.find("p").get_text().strip()
|
||||
|
||||
default_values = {
|
||||
|
@ -101,6 +102,20 @@ def updated_robots_json(soup):
|
|||
return sorted_robots
|
||||
|
||||
|
||||
def clean_robot_name(name):
    """Return *name* with non-breaking hyphens (U+2011) replaced by "-".

    Args:
        name: Robot name string to normalize.

    Returns:
        A copy of ``name`` in which every U+2011 (non-breaking hyphen)
        character is replaced by an ASCII hyphen-minus. All other
        characters are left untouched.
    """
    # This was specifically spotted in "Perplexity-User".
    # It looks like a non-breaking hyphen introduced by the HTML rendering
    # software. Reading the source page for Perplexity
    # (https://docs.perplexity.ai/guides/bots), the bot is listed several
    # times as "Perplexity-User" with a normal hyphen, and it's only the
    # row heading that has the special hyphen.
    #
    # Technically, there's no reason there wouldn't someday be a bot that
    # actually uses a non-breaking hyphen, but that seems unlikely,
    # so this solution should be fine for now.
    #
    # The target is a single literal character, so a plain str.replace is
    # sufficient; no regular expression is needed.
    return name.replace("\u2011", "-")
|
||||
|
||||
|
||||
def ingest_darkvisitors():
|
||||
old_robots_json = load_robots_json()
|
||||
soup = get_agent_soup()
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue