Fix html-mangled hyphen in Perplexity-Users

Fixes: #99
This commit is contained in:
Frederic Barthelemy 2025-04-04 17:34:14 -07:00
parent 6b0349f37d
commit 5f5a89c38c
No known key found for this signature in database
GPG key ID: 6FF43C49A5D473EF
12 changed files with 42 additions and 13 deletions

View file

@@ -50,6 +50,7 @@ def updated_robots_json(soup):
continue
for agent in section.find_all("a", href=True):
name = agent.find("div", {"class": "agent-name"}).get_text().strip()
name = clean_robot_name(name)
desc = agent.find("p").get_text().strip()
default_values = {
@@ -101,6 +102,20 @@ def updated_robots_json(soup):
return sorted_robots
def clean_robot_name(name):
    """Return *name* with non-breaking hyphens replaced by plain hyphens.

    Every U+2011 (NON-BREAKING HYPHEN) character in ``name`` is replaced
    with an ASCII hyphen-minus ("-"); all other characters are returned
    unchanged.
    """
    # This was specifically spotted in "Perplexity-User": the scraped name
    # carried a non-breaking hyphen, apparently introduced by the HTML
    # rendering software.
    # Reading the source page for Perplexity (https://docs.perplexity.ai/guides/bots),
    # the bot is listed several times as "Perplexity-User" with a normal
    # hyphen; only the row heading has the special hyphen.
    #
    # Technically, there's no reason there wouldn't someday be a bot that
    # actually uses a non-breaking hyphen, but that seems unlikely,
    # so this solution should be fine for now.
    #
    # A literal single-character substitution needs no regex; str.replace
    # replaces every occurrence.
    return name.replace("\u2011", "-")
def ingest_darkvisitors():
old_robots_json = load_robots_json()
soup = get_agent_soup()