From 5f5a89c38c27b676c3212f6ea3895d31f315f37e Mon Sep 17 00:00:00 2001
From: Frederic Barthelemy <git@fbartho.com>
Date: Fri, 4 Apr 2025 17:34:14 -0700
Subject: [PATCH] Fix html-mangled hyphen in Perplexity-Users

Fixes: #99
---
 .htaccess                                |  2 +-
 code/robots.py                           | 15 +++++++++++++++
 code/test_files/.htaccess                |  2 +-
 code/test_files/nginx-block-ai-bots.conf |  2 +-
 code/test_files/robots.json              |  7 +++++++
 code/test_files/robots.txt               |  1 +
 code/test_files/table-of-bot-metrics.md  |  1 +
 code/tests.py                            |  5 +++++
 nginx-block-ai-bots.conf                 |  2 +-
 robots.json                              | 14 +++++++-------
 robots.txt                               |  2 +-
 table-of-bot-metrics.md                  |  2 +-
 12 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2f5d0e4..27a7e11 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/robots.py b/code/robots.py
index 90c0e8c..d158b36 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -50,6 +50,7 @@ def updated_robots_json(soup):
             continue
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
+            name = clean_robot_name(name)
             desc = agent.find("p").get_text().strip()
 
             default_values = {
@@ -101,6 +102,20 @@ def updated_robots_json(soup):
     return sorted_robots
 
 
+def clean_robot_name(name):
+    """Clean the robot name by replacing characters mangled by HTML rendering software with their plain equivalents."""
+    # This was specifically spotted in "Perplexity-User"
+    # Looks like a non-breaking hyphen introduced by the HTML rendering software
+    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
+    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
+    # and it's only the Row-Heading that has the special hyphen
+    # 
+    # Technically, there's no reason there wouldn't someday be a bot that 
+    # actually uses a non-breaking hyphen, but that seems unlikely,
+    # so this solution should be fine for now.
+    return re.sub(r"\u2011", "-", name)
+
+
 def ingest_darkvisitors():
     old_robots_json = load_robots_json()
     soup = get_agent_soup()
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index 7e39092..f0d6783 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf
index d1b559e..c569b15 100644
--- a/code/test_files/nginx-block-ai-bots.conf
+++ b/code/test_files/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
     return 403;
 }
\ No newline at end of file
diff --git a/code/test_files/robots.json b/code/test_files/robots.json
index b0cbfbb..385f284 100644
--- a/code/test_files/robots.json
+++ b/code/test_files/robots.json
@@ -223,6 +223,13 @@
         "operator": "[Webz.io](https://webz.io/)",
         "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
     },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
     "PerplexityBot": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)",
diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt
index 03c3c25..ee201f8 100644
--- a/code/test_files/robots.txt
+++ b/code/test_files/robots.txt
@@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher
 User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: Scrapy
diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md
index 88af6c0..9b280aa 100644
--- a/code/test_files/table-of-bot-metrics.md
+++ b/code/test_files/table-of-bot-metrics.md
@@ -32,6 +32,7 @@
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
diff --git a/code/tests.py b/code/tests.py
index 61d69b4..f58b445 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
 if __name__ == "__main__":
     import os
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 72d65ec..0577bd9 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.json b/robots.json
index e907c8b..8fd7572 100644
--- a/robots.json
+++ b/robots.json
@@ -251,6 +251,13 @@
         "frequency": "Unclear at this time.",
         "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot"
     },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
     "PerplexityBot": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[Yes](https://docs.perplexity.ai/guides/bots)",
@@ -258,13 +265,6 @@
         "frequency": "No information.",
         "description": "Crawls sites to surface as results in Perplexity."
     },
-    "Perplexity\u2011User": {
-        "operator": "[Perplexity](https://www.perplexity.ai/)",
-        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
-        "function": "Used to answer queries at the request of users.",
-        "frequency": "Only when prompted by a user.",
-        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
-    },
     "PetalBot": {
         "description": "Operated by Huawei to provide search and AI assistant services.",
         "frequency": "No explicit frequency provided.",
diff --git a/robots.txt b/robots.txt
index 8c79fc2..c531918 100644
--- a/robots.txt
+++ b/robots.txt
@@ -34,8 +34,8 @@ User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
-User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0cc2264..d92df34 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,8 +36,8 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
-| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |