Fix html-mangled hyphen in Perplexity-Users

Fixes: #99
2025-04-09 13:27:46 +00:00 · 2025-04-04 17:34:14 -07:00 · 2025-04-04 17:34:14 -07:00 · 5f5a89c38c
commit 5f5a89c38c
parent 6b0349f37d
12 changed files with 42 additions and 13 deletions
--- a/.htaccess
+++ b/.htaccess
@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
--- a/code/robots.py
+++ b/code/robots.py
@ -50,6 +50,7 @@ def updated_robots_json(soup):
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
+            name = clean_robot_name(name)
            desc = agent.find("p").get_text().strip()

            default_values = {
@ -101,6 +102,20 @@ def updated_robots_json(soup):
    return sorted_robots


+def clean_robot_name(name):
+    """ Clean the robot name by removing some characters that were mangled by html software once. """
+    # This was specifically spotted in "Perplexity-User"
+    # Looks like a non-breaking hyphen introduced by the HTML rendering software
+    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
+    # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, 
+    # and it's only the Row-Heading that has the special hyphen
+    # 
+    # Technically, there's no reason there wouldn't someday be a bot that 
+    # actually uses a non-breaking hyphen, but that seems unlikely,
+    # so this solution should be fine for now.
+    return re.sub(r"\u2011", "-", name)
+
+
 def ingest_darkvisitors():
    old_robots_json = load_robots_json()
    soup = get_agent_soup()
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
--- a/code/test_files/nginx-block-ai-bots.conf
+++ b/code/test_files/nginx-block-ai-bots.conf
@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
    return 403;
 }
--- a/code/test_files/robots.json
+++ b/code/test_files/robots.json
@ -223,6 +223,13 @@
        "operator": "[Webz.io](https://webz.io/)",
        "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
    },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
    "PerplexityBot": {
        "operator": "[Perplexity](https://www.perplexity.ai/)",
        "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)",
--- a/code/test_files/robots.txt
+++ b/code/test_files/robots.txt
@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher
 User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: Scrapy
--- a/code/test_files/table-of-bot-metrics.md
+++ b/code/test_files/table-of-bot-metrics.md
@ -32,6 +32,7 @@
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
--- a/code/tests.py
+++ b/code/tests.py
@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
        robots_nginx = json_to_nginx(self.robots_dict)
        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)

+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")

 if __name__ == "__main__":
    import os
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
    return 403;
 }
--- a/robots.json
+++ b/robots.json
@ -251,6 +251,13 @@
        "frequency": "Unclear at this time.",
        "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot"
    },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
    "PerplexityBot": {
        "operator": "[Perplexity](https://www.perplexity.ai/)",
        "respect": "[Yes](https://docs.perplexity.ai/guides/bots)",
@ -258,13 +265,6 @@
        "frequency": "No information.",
        "description": "Crawls sites to surface as results in Perplexity."
    },
-    "Perplexity\u2011User": {
-        "operator": "[Perplexity](https://www.perplexity.ai/)",
-        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
-        "function": "Used to answer queries at the request of users.",
-        "frequency": "Only when prompted by a user.",
-        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
-    },
    "PetalBot": {
        "description": "Operated by Huawei to provide search and AI assistant services.",
        "frequency": "No explicit frequency provided.",
--- a/robots.txt
+++ b/robots.txt
@ -34,8 +34,8 @@ User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
-User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@ -36,8 +36,8 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
-| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |