mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-09 13:27:46 +00:00
parent
6b0349f37d
commit
5f5a89c38c
12 changed files with 42 additions and 13 deletions
|
@ -1,3 +1,3 @@
|
|||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
|
||||
RewriteRule !^/?robots\.txt$ - [F,L]
|
||||
|
|
|
@ -50,6 +50,7 @@ def updated_robots_json(soup):
|
|||
continue
|
||||
for agent in section.find_all("a", href=True):
|
||||
name = agent.find("div", {"class": "agent-name"}).get_text().strip()
|
||||
name = clean_robot_name(name)
|
||||
desc = agent.find("p").get_text().strip()
|
||||
|
||||
default_values = {
|
||||
|
@ -101,6 +102,20 @@ def updated_robots_json(soup):
|
|||
return sorted_robots
|
||||
|
||||
|
||||
def clean_robot_name(name):
|
||||
""" Clean the robot name by removing some characters that were mangled by html software once. """
|
||||
# This was specifically spotted in "Perplexity-User"
|
||||
# Looks like a non-breaking hyphen introduced by the HTML rendering software
|
||||
# Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
|
||||
# You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen,
|
||||
# and it's only the Row-Heading that has the special hyphen
|
||||
#
|
||||
# Technically, there's no reason there wouldn't someday be a bot that
|
||||
# actually uses a non-breaking hyphen, but that seems unlikely,
|
||||
# so this solution should be fine for now.
|
||||
return re.sub(r"\u2011", "-", name)
|
||||
|
||||
|
||||
def ingest_darkvisitors():
|
||||
old_robots_json = load_robots_json()
|
||||
soup = get_agent_soup()
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
|
||||
RewriteRule !^/?robots\.txt$ - [F,L]
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
|
||||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
|
||||
return 403;
|
||||
}
|
|
@ -223,6 +223,13 @@
|
|||
"operator": "[Webz.io](https://webz.io/)",
|
||||
"respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
|
||||
},
|
||||
"Perplexity-User": {
|
||||
"operator": "[Perplexity](https://www.perplexity.ai/)",
|
||||
"respect": "[No](https://docs.perplexity.ai/guides/bots)",
|
||||
"function": "Used to answer queries at the request of users.",
|
||||
"frequency": "Only when prompted by a user.",
|
||||
"description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
|
||||
},
|
||||
"PerplexityBot": {
|
||||
"operator": "[Perplexity](https://www.perplexity.ai/)",
|
||||
"respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)",
|
||||
|
|
|
@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher
|
|||
User-agent: OAI-SearchBot
|
||||
User-agent: omgili
|
||||
User-agent: omgilibot
|
||||
User-agent: Perplexity-User
|
||||
User-agent: PerplexityBot
|
||||
User-agent: PetalBot
|
||||
User-agent: Scrapy
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
|
||||
| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
|
||||
| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
|
||||
| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
|
||||
| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
|
||||
| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
|
||||
| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
|
||||
|
|
|
@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
|
|||
robots_nginx = json_to_nginx(self.robots_dict)
|
||||
self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
|
||||
|
||||
class TestRobotsNameCleaning(unittest.TestCase):
|
||||
def test_clean_name(self):
|
||||
from robots import clean_robot_name
|
||||
|
||||
self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
|
||||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
|
||||
return 403;
|
||||
}
|
14
robots.json
14
robots.json
|
@ -251,6 +251,13 @@
|
|||
"frequency": "Unclear at this time.",
|
||||
"description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot"
|
||||
},
|
||||
"Perplexity-User": {
|
||||
"operator": "[Perplexity](https://www.perplexity.ai/)",
|
||||
"respect": "[No](https://docs.perplexity.ai/guides/bots)",
|
||||
"function": "Used to answer queries at the request of users.",
|
||||
"frequency": "Only when prompted by a user.",
|
||||
"description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
|
||||
},
|
||||
"PerplexityBot": {
|
||||
"operator": "[Perplexity](https://www.perplexity.ai/)",
|
||||
"respect": "[Yes](https://docs.perplexity.ai/guides/bots)",
|
||||
|
@ -258,13 +265,6 @@
|
|||
"frequency": "No information.",
|
||||
"description": "Crawls sites to surface as results in Perplexity."
|
||||
},
|
||||
"Perplexity\u2011User": {
|
||||
"operator": "[Perplexity](https://www.perplexity.ai/)",
|
||||
"respect": "[No](https://docs.perplexity.ai/guides/bots)",
|
||||
"function": "Used to answer queries at the request of users.",
|
||||
"frequency": "Only when prompted by a user.",
|
||||
"description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
|
||||
},
|
||||
"PetalBot": {
|
||||
"description": "Operated by Huawei to provide search and AI assistant services.",
|
||||
"frequency": "No explicit frequency provided.",
|
||||
|
|
|
@ -34,8 +34,8 @@ User-agent: OAI-SearchBot
|
|||
User-agent: omgili
|
||||
User-agent: omgilibot
|
||||
User-agent: PanguBot
|
||||
User-agent: Perplexity-User
|
||||
User-agent: PerplexityBot
|
||||
User-agent: Perplexity‑User
|
||||
User-agent: PetalBot
|
||||
User-agent: Scrapy
|
||||
User-agent: SemrushBot-OCOB
|
||||
|
|
|
@ -36,8 +36,8 @@
|
|||
| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
|
||||
| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
|
||||
| PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
|
||||
| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
|
||||
| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
|
||||
| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
|
||||
| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
|
||||
| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
|
||||
| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||
|
|
Loading…
Reference in a new issue