From 305188b2e78855d4e7193f29a3e7205f96fa86f6 Mon Sep 17 00:00:00 2001
From: dark-visitors
Date: Fri, 11 Apr 2025 00:55:52 +0000
Subject: [PATCH 01/29] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 4c9f7d7..eff38ac 100644
--- a/robots.json
+++ b/robots.json
@@ -335,4 +335,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From d9f882a9b21170754c4b37ff1bbc237171876684 Mon Sep 17 00:00:00 2001
From: Joshua Sheard
Date: Mon, 14 Apr 2025 15:46:01 +0100
Subject: [PATCH 02/29] Include "AI Agents" from Dark Visitors

---
 code/robots.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code/robots.py b/code/robots.py
index 86ea413..8a06b55 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -30,6 +30,7 @@ def updated_robots_json(soup):
     """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
     to_include = [
+        "AI Agents",
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",

From a96e33098975edf1c05c8d9684b36b9fa31f7ef2 Mon Sep 17 00:00:00 2001
From: dark-visitors
Date: Tue, 15 Apr 2025 00:57:01 +0000
Subject: [PATCH 03/29] Update from Dark Visitors

---
 robots.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/robots.json b/robots.json
index eff38ac..8bba6b2 100644
--- a/robots.json
+++ b/robots.json
@@ -230,6 +230,13 @@
         "frequency": "Unclear at this time.",
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
+    "NovaAct": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact"
+    },
     "OAI-SearchBot": {
         "operator": "[OpenAI](https://openai.com)",
         "respect": "[Yes](https://platform.openai.com/docs/bots)",
@@ -251,6 +258,13 @@
         "operator": "[Webz.io](https://webz.io/)",
         "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
     },
+    "Operator": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator"
+    },
     "PanguBot": {
         "operator": "the Chinese company Huawei",
         "respect": "Unclear at this time.",

From e0cdb278fbd243f554579fe5050850f124b286a8 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt"
Date: Wed, 16 Apr 2025 00:57:11 +0000
Subject: [PATCH 04/29] Update from Dark Visitors

---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 2 ++
 table-of-bot-metrics.md  | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index c0e5fbb..d10e796 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index a6bbfa2..c37cef5 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index de25a56..1e3aa80 100644
--- a/robots.txt
+++ b/robots.txt
@@ -31,9 +31,11 @@ User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: Meta-ExternalAgent
 User-agent: Meta-ExternalFetcher
+User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
+User-agent: Operator
 User-agent: PanguBot
 User-agent: Perplexity-User
 User-agent: PerplexityBot
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index b3e51fe..4c87b41 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -33,9 +33,11 @@
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
+| Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |

From d05ede8fe164c5c3f47acecc1343e6d2ea5c294b Mon Sep 17 00:00:00 2001
From: Glyn Normington
Date: Fri, 18 Apr 2025 17:46:56 +0100
Subject: [PATCH 05/29] Clarify our position on sponsorship

Some firms, including those with .ai domains, have offered to sponsor this
project. So make our position clear.
---
 FAQ.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FAQ.md b/FAQ.md
index 967cf41..487f784 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -55,3 +55,7 @@ That depends on your stack.
 ## How can I contribute?
 
 Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort.
+
+## Can my company sponsor ai.robots.txt?
+
+No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven.

From b1856e6988a93bd834b228f121fa3524d11c7be7 Mon Sep 17 00:00:00 2001
From: Glyn Normington
Date: Fri, 18 Apr 2025 18:40:44 +0100
Subject: [PATCH 06/29] Donations

---
 FAQ.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FAQ.md b/FAQ.md
index 487f784..7264819 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -56,6 +56,10 @@
 
 Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort.
 
+## I'd like to donate money
+
+That's kind of you, but we don't need your money. If you insist, we'd love you to make a donation to the [American Civil Liberties Union](https://www.aclu.org/), the [Disasters Emergency Committee](https://www.dec.org.uk/), or a similar organisation.
+
 ## Can my company sponsor ai.robots.txt?

From 33c5ce1326367abecccad23742779783c10c36a1 Mon Sep 17 00:00:00 2001
From: Dennis Lee
Date: Mon, 21 Apr 2025 18:55:11 +0100
Subject: [PATCH 07/29] Update robots.json

Updated robots list with five new proposed AI bots:

aiHitBot
Cotoyogi
Factset_spyderbot
FirecrawlAgent
TikTokSpider
---
 robots.json | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 8bba6b2..698b31e 100644
--- a/robots.json
+++ b/robots.json
@@ -13,6 +13,13 @@
         "operator": "[Ai2](https://allenai.org/crawler)",
         "respect": "Yes"
     },
+    "aiHitBot": {
+        "operator": "[aiHit](https://www.aihitdata.com/about)",
+        "respect": "Yes",
+        "function": "A massive, artificial intelligence/machine learning, automated system.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI systems."
+    },
     "Amazonbot": {
         "operator": "Amazon",
         "respect": "Yes",
@@ -97,6 +104,13 @@
         "frequency": "Unclear at this time.",
         "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler"
     },
+    "Cotoyogi": {
+        "operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)",
+        "respect": "Yes",
+        "function": "AI LLM Scraper.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI training in Japanese language."
+    },
     "Crawlspace": {
         "operator": "[Crawlspace](https://crawlspace.dev)",
         "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)",
@@ -125,6 +139,20 @@
         "frequency": "Up to 1 page per second",
         "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
     },
+    "Factset_spyderbot": {
+        "operator": "[Factset](https://www.factset.com/ai)",
+        "respect": "Unclear at this time.",
+        "function": "AI model training.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI training."
+    },
+    "FirecrawlAgent": {
+        "operator": "[Firecrawl](https://www.firecrawl.dev/)",
+        "respect": "Yes",
+        "function": "AI scraper and LLM training",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI systems and LLM training."
+    },
     "FriendlyCrawler": {
         "description": "Unclear who the operator is; but data is used for training/machine learning.",
         "frequency": "Unclear at this time.",
@@ -321,6 +349,13 @@
         "operator": "[Sidetrade](https://www.sidetrade.com)",
         "respect": "Unclear at this time."
     },
+    "TikTokSpider": {
+        "operator": "ByteDance",
+        "respect": "Unclear at this time.",
+        "function": "LLM training.",
+        "frequency": "Unclear at this time.",
+        "description": "Downloads data to train LLMs, as per Bytespider."
+    },
     "Timpibot": {
         "operator": "[Timpi](https://timpi.io)",
         "respect": "Unclear at this time.",
@@ -349,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From bbec639c14f3e7258729718dd2c6fc0b1734a9b1 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt"
Date: Tue, 22 Apr 2025 14:50:26 +0000
Subject: [PATCH 08/29] Merge pull request #109 from dennislee1/patch-1

AI bots to consider adding
---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 5 +++++
 table-of-bot-metrics.md  | 5 +++++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index d10e796..b4ab72f 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index c37cef5..090275a 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 1e3aa80..1f8eaf2 100644
--- a/robots.txt
+++ b/robots.txt
@@ -1,5 +1,6 @@
 User-agent: AI2Bot
 User-agent: Ai2Bot-Dolma
+User-agent: aiHitBot
 User-agent: Amazonbot
 User-agent: anthropic-ai
 User-agent: Applebot
@@ -12,10 +13,13 @@ User-agent: Claude-Web
 User-agent: ClaudeBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
+User-agent: Cotoyogi
 User-agent: Crawlspace
 User-agent: Diffbot
 User-agent: DuckAssistBot
 User-agent: FacebookBot
+User-agent: Factset_spyderbot
+User-agent: FirecrawlAgent
 User-agent: FriendlyCrawler
 User-agent: Google-Extended
 User-agent: GoogleOther
@@ -44,6 +48,7 @@ User-agent: Scrapy
 User-agent: SemrushBot-OCOB
 User-agent: SemrushBot-SWA
 User-agent: Sidetrade indexer bot
+User-agent: TikTokSpider
 User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 4c87b41..0249766 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -2,6 +2,7 @@
 |------|----------|-----------------------|----------|------------------|-------------|
 | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
 | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
+| aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. |
 | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
 | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
@@ -14,10 +15,13 @@
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
+| Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |
 | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. |
 | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
 | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
+| Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
+| FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
 | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
@@ -46,6 +50,7 @@
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
+| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMs, as per Bytespider. |
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
 | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |

From 8d25a424d96dbf4a3cb12d4bb51929a764aa8f89 Mon Sep 17 00:00:00 2001
From: dark-visitors
Date: Wed, 23 Apr 2025 00:56:52 +0000
Subject: [PATCH 09/29] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 698b31e..df9dcda 100644
--- a/robots.json
+++ b/robots.json
@@ -384,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 9d846ced45cdd13c7ecc03353c6ec554b5f9015d Mon Sep 17 00:00:00 2001
From: maia
Date: Thu, 24 Apr 2025 04:08:20 +0200
Subject: [PATCH 10/29] Update robots.json

Lowercase meta-external* as that was not technically the UA for the bots,
also removed a period in the "respect" for consistency
---
 robots.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/robots.json b/robots.json
index df9dcda..f2cec91 100644
--- a/robots.json
+++ b/robots.json
@@ -244,14 +244,14 @@
         "frequency": "Unclear at this time.",
         "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot"
     },
-    "Meta-ExternalAgent": {
+    "meta-externalagent": {
         "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)",
-        "respect": "Yes.",
+        "respect": "Yes",
         "function": "Used to train models and improve products.",
         "frequency": "No information.",
         "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
     },
-    "Meta-ExternalFetcher": {
+    "meta-externalfetcher": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
         "function": "AI Assistants",
@@ -384,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 4654e14e9c857a228289d3258835182838202503 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt"
Date: Thu, 24 Apr 2025 07:00:34 +0000
Subject: [PATCH 11/29] Merge pull request #112 from maiavixen/main

Fixed meta-external* being titlecase, and removed period for consistency
---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 4 ++--
 table-of-bot-metrics.md  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.htaccess b/.htaccess
index b4ab72f..a97f98a 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 090275a..3320071 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 1f8eaf2..53291ca 100644
--- a/robots.txt
+++ b/robots.txt
@@ -33,8 +33,8 @@ User-agent: img2dataset
 User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
-User-agent: Meta-ExternalAgent
-User-agent: Meta-ExternalFetcher
+User-agent: meta-externalagent
+User-agent: meta-externalfetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0249766..5c093b8 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -35,8 +35,8 @@
 | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
-| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From 934ac7b31864d0cf0b21bb1580ddea97ec8b4994 Mon Sep 17 00:00:00 2001
From: dark-visitors
Date: Fri, 25 Apr 2025 00:56:57 +0000
Subject: [PATCH 12/29] Update from Dark Visitors

---
 robots.json | 16 +++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index f2cec91..e15a196 100644
--- a/robots.json
+++ b/robots.json
@@ -251,6 +251,13 @@
         "frequency": "No information.",
         "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
     },
+    "Meta-ExternalAgent": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Data Scrapers",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent"
+    },
     "meta-externalfetcher": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
@@ -258,6 +265,13 @@
         "frequency": "Unclear at this time.",
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
+    "Meta-ExternalFetcher": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Assistants",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
+    },
     "NovaAct": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
         "function": "AI Agents",
@@ -384,4 +398,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From c6c7f1748f1e28053184539a70a6a08f5aeabc37 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt"
Date: Sat, 26 Apr 2025 00:55:12 +0000
Subject: [PATCH 13/29] Update from Dark Visitors

---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 2 ++
 table-of-bot-metrics.md  | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index a97f98a..586adab 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 3320071..fc58d61 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 53291ca..232e119 100644
--- a/robots.txt
+++ b/robots.txt
@@ -34,7 +34,9 @@ User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: meta-externalagent
+User-agent: Meta-ExternalAgent
 User-agent: meta-externalfetcher
+User-agent: Meta-ExternalFetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 5c093b8..4dd6076 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,7 +36,9 @@
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent |
 | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual link. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From 50e739dd738bb821018a863491b770dd8ee61155 Mon Sep 17 00:00:00 2001
From: Rik Wijnen
Date: Mon, 28 Apr 2025 08:42:52 +0200
Subject: [PATCH 14/29] HAProxy converter added.

---
 README.md                 |  9 +++++++++
 code/robots.py            |  9 +++++++++
 haproxy-block-ai-bots.txt | 57 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 haproxy-block-ai-bots.txt

diff --git a/README.md b/README.md
index b984672..1f1eff6 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
+- `haproxy-block-ai-bots.txt`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
@@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
+`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
+1. Add the file to the config directory of HAProxy
+2. Add the following lines in the `frontend` section:
+   ```
+   acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
+   http-request deny if ai_robot
+   ```
+   (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.)
 
 ## Contributing
 
diff --git a/code/robots.py b/code/robots.py
index 8a06b55..da157c1 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -178,6 +178,11 @@ def json_to_nginx(robot_json):
     config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
     return config
 
+def json_to_haproxy(robots_json):
+    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
+    txt = "\n".join(f"{k}" for k in robots_json.keys())
+    return txt
+
 
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
@@ -208,6 +213,10 @@ def conversions():
         file_name="./nginx-block-ai-bots.conf",
         converter=json_to_nginx,
     )
+    update_file_if_changed(
+        file_name="./haproxy-block-ai-bots.txt",
+        converter=json_to_haproxy,
+    )
 
 
 if __name__ == "__main__":
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
new file mode 100644
index 0000000..3c326bd
--- /dev/null
+++ b/haproxy-block-ai-bots.txt
@@ -0,0 +1,57 @@
+AI2Bot
+Ai2Bot-Dolma
+aiHitBot
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Brightbot 1.0
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+cohere-training-data-crawler
+Cotoyogi
+Crawlspace
+Diffbot
+DuckAssistBot
+FacebookBot
+Factset_spyderbot
+FirecrawlAgent
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+imgproxy
+ISSCyberRiskCrawler
+Kangaroo Bot
+meta-externalagent
+Meta-ExternalAgent
+meta-externalfetcher
+Meta-ExternalFetcher
+NovaAct
+OAI-SearchBot
+omgili
+omgilibot
+Operator
+PanguBot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+SemrushBot-OCOB
+SemrushBot-SWA
+Sidetrade indexer bot
+TikTokSpider
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
\ No newline at end of file

From 66da70905f503239faeb0e49204776f508928048 Mon Sep 17 00:00:00 2001
From: Rik Wijnen
Date: Mon, 28 Apr 2025 09:09:40 +0200
Subject: [PATCH 15/29] Fixed incorrect English sentence.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1f1eff6..ff124e3 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@
    acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
    http-request deny if ai_robot
    ```
-   (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.)
+   (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
 
 ## Contributing
 

From a4a9f2ac2b9116d104789664231af4017d3828a7 Mon Sep 17 00:00:00 2001
From: Rik Wijnen
Date: Mon, 28 Apr 2025 09:30:26 +0200
Subject: [PATCH 16/29] Tests for HAProxy file added.

---
 code/test_files/haproxy-block-ai-bots.txt | 47 +++++++++++++++++++++++
 code/tests.py                             | 12 +++++-
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 code/test_files/haproxy-block-ai-bots.txt

diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt
new file mode 100644
index 0000000..5ed6939
--- /dev/null
+++ b/code/test_files/haproxy-block-ai-bots.txt
@@ -0,0 +1,47 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
+crawler.with.dots
+star***crawler
+Is this a crawler?
+a[mazing]{42}(robot)
+2^32$
+curl|sudo bash
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index f58b445..e179c44 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
+class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_haproxy_generation(self):
+        robots_haproxy = json_to_haproxy(self.robots_dict)
+        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
+
 class TestRobotsNameCleaning(unittest.TestCase):
     def test_clean_name(self):
         from robots import clean_robot_name

From 1310dbae4656e212ff01e7d8530d78c76dfd5a9f Mon Sep 17 00:00:00 2001
From: Crazyroostereye <63781667+Crazyroostereye1@users.noreply.github.com>
Date: Thu, 1 May 2025 12:21:32 +0200
Subject: [PATCH 17/29] Added a Caddyfile converter (#110)

Co-authored-by: Julian Beittel
Co-authored-by: Glyn Normington
---
 Caddyfile                 |  3 +++
 README.md                 |  3 +++
 code/robots.py            | 12 ++++++++++++
 code/test_files/Caddyfile |  3 +++
 code/tests.py             | 13 ++++++++++++-
 5 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 Caddyfile
 create mode 100644 code/test_files/Caddyfile

diff --git a/Caddyfile b/Caddyfile
new file mode 100644
index 0000000..1857d75
--- /dev/null
+++ b/Caddyfile
@@ -0,0 +1,3 @@
+@aibots {
+    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index ff124e3..8d7bfb1 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
+- `Caddyfile`
 - `haproxy-block-ai-bots.txt`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
@@ -23,6 +24,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
+`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile; the rejection can then be handled as follows: `abort @aibots`
+
 `haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it:
 1. Add the file to the config directory of HAProxy
 2. Add the following lines in the `frontend` section:
    ```
    acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
    http-request deny if ai_robot
    ```
    (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
diff --git a/code/robots.py b/code/robots.py
index da157c1..054c2be 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -178,12 +178,20 @@ def json_to_nginx(robot_json):
     config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
     return config
 
+
+def json_to_caddy(robot_json):
+    caddyfile = "@aibots {\n   "
+    caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
+    caddyfile += "\n}"
+    return caddyfile
+
 def json_to_haproxy(robots_json):
     # Creates a source file for HAProxy. Follow instructions in the README to implement it.
     txt = "\n".join(f"{k}" for k in robots_json.keys())
     return txt
 
+
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
@@ -213,6 +221,10 @@ def conversions():
         file_name="./nginx-block-ai-bots.conf",
         converter=json_to_nginx,
     )
+    update_file_if_changed(
+        file_name="./Caddyfile",
+        converter=json_to_caddy
+
     update_file_if_changed(
         file_name="./haproxy-block-ai-bots.txt",
         converter=json_to_haproxy,
diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile
new file mode 100644
index 0000000..82f365a
--- /dev/null
+++ b/code/test_files/Caddyfile
@@ -0,0 +1,3 @@
+@aibots {
+    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)"
+}
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index e179c44..434406f 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -76,6 +76,17 @@ class TestRobotsNameCleaning(unittest.TestCase):
         self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
 
+class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_caddyfile_generation(self):
+        robots_caddyfile = json_to_caddy(self.robots_dict)
+        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
+
+
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))

From ec995cd686a09b4af1c6a59d95e1ced122f1d5fc Mon Sep 17 00:00:00 2001
From: Glyn Normington
Date: Thu, 1 May 2025 11:27:40 +0100
Subject: [PATCH 18/29] Fix Python syntax error

---
 code/robots.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/code/robots.py b/code/robots.py
index 054c2be..c795649 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -223,7 +223,8 @@ def conversions():
     )
     update_file_if_changed(
file_name="./Caddyfile", - converter=json_to_caddy + converter=json_to_caddy, + ) update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", converter=json_to_haproxy, From 678380727e8685af8c5311bcfa1f55c7aa866d3b Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 1 May 2025 10:29:06 +0000 Subject: [PATCH 19/29] Merge pull request #115 from glyn/syntax Fix Python syntax error --- Caddyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Caddyfile b/Caddyfile index 1857d75..0e10cfa 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file From 36a52a88d8e3832091d73062ef268acb46f6e031 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 12 May 2025 20:20:18 -0700 Subject: [PATCH 20/29] Bing AI opt-out instructions --- README.md | 2 ++ docs/additional-steps/bing.md | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 docs/additional-steps/bing.md diff --git a/README.md b/README.md index 8d7bfb1..28ef743 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +[Bing uses the data it crawls for AI and training; you may opt out by adding a `meta` tag to the `head` of your site.](./docs/additional-steps/bing.md) + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md new file mode 100644 index 0000000..37c60c7 --- /dev/null +++ b/docs/additional-steps/bing.md @@ -0,0 +1,36 @@ +# Bing (bingbot) + +It's not well publicised, but Bing uses the data it crawls for AI and training.
+ +However, the current thinking is that blocking a search engine of this size via `robots.txt` is quite a drastic approach: Bing is second only to Google, and blocking it could significantly impact your website's visibility in search results. + +Additionally, Bing powers a number of search engines such as Yahoo and AOL, and its search results are also used by DuckDuckGo, amongst others. + +Fortunately, Bing supports a relatively simple opt-out method, though it requires an additional step. + +## How to opt out of AI training + +You must add a metatag in the `<head>` of your webpage, and it needs to be added to every page on your website. + +The line you need to add is: + +```plaintext +<meta name="robots" content="noarchive"> +``` + +By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." + +## Will my site be negatively affected? + +Simple answer: no. +The original use of "noarchive" has been retired by all search engines. Google retired its use in 2024. + +The use of this metatag will not impact your site in search engines or in any other meaningful way if you add it to your page(s). + +It is now solely used by a handful of crawlers, such as Bingbot and Amazonbot, to signify to them not to use your data for AI/training. + +## Resources + +Bing Blog AI opt-out announcement: https://blogs.bing.com/webmaster/september-2023/Announcing-new-options-for-webmasters-to-control-usage-of-their-content-in-Bing-Chat + +Bing metatag information, including AI opt-out: https://www.bing.com/webmasters/help/which-robots-metatags-does-bing-support-5198d240 From b4610a725cac409b5c686d0f68383ea3f6daa818 Mon Sep 17 00:00:00 2001 From: Florent Poinsaut Date: Wed, 14 May 2025 14:11:56 +0200 Subject: [PATCH 21/29] Add Traefik plugin --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 8d7bfb1..80de135 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,12 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +### Related + +- [Robots.txt Traefik plugin](https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin): +a middleware plugin for [Traefik](https://traefik.io/traefik/) that automatically adds the rules of the [robots.txt](./robots.txt) +file on the fly. + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`.
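The Caddyfile paragraph added in PATCH 17 stops at `abort @aibots`, so here is a minimal usage sketch of how the generated matcher group can be wired into a site block. The site address and snippet path are illustrative assumptions, not part of any patch:

```
example.com {
	# Splice in the generated matcher; Caddy's import inlines the file's
	# contents, so the @aibots named matcher is defined in this site block.
	import /etc/caddy/ai-robots.caddyfile

	# Close the connection for any request whose User-Agent matches the list.
	abort @aibots

	file_server
}
```

Because `import` inlines the snippet in place, `@aibots` exists before `abort` references it.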
From 9539256cb3116b626439bf79a776ea67b7aa2edd Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 14 May 2025 16:46:32 -0700 Subject: [PATCH 22/29] chore(robots.json): adds QualifiedBot crawler --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index e15a196..99e28d3 100644 --- a/robots.json +++ b/robots.json @@ -335,6 +335,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "QualifiedBot": { + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time.", + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", @@ -398,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 0c56b96fd99bfcc736c4f64c0df9bb87a1fc6075 Mon Sep 17 00:00:00 2001 From: Joe Hoyle Date: Thu, 15 May 2025 11:26:47 -0400 Subject: [PATCH 23/29] Fix JSON syntax error --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 99e28d3..f518037 100644 --- a/robots.json +++ b/robots.json @@ -340,7 +340,7 @@ "frequency": "No explicit frequency provided.", "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time.", + "respect": "Unclear at this time." 
}, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", From 1c470babbefed7b470443f6dd834e721c58481d6 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 15 May 2025 16:12:30 +0000 Subject: [PATCH 24/29] Merge pull request #123 from joehoyle/patch-1 Fix JSON syntax error --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 586adab..de88e50 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 0e10cfa..43ad3bf 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 3c326bd..9770b45 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -46,6 +46,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +QualifiedBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index fc58d61..afbd77c 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 232e119..dea4dd5 100644 --- a/robots.txt +++ b/robots.txt @@ -46,6 +46,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 4dd6076..57469fa 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -48,6 +48,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. 
| Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Huawei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 498aa50760fe3850820b46933bf87b82918e5803 Mon Sep 17 00:00:00 2001 From: Patrick Evans Date: Thu, 15 May 2025 11:10:06 -0500 Subject: [PATCH 25/29] lint robots.json during pull requests --- .github/workflows/run-tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index c98861f..042cc13 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -19,3 +19,10 @@ jobs: - name: Run tests run: | code/tests.py + lint-json: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: JQ Json Lint + run: jq . robots.json From 16d1de70943e9d448d3d5e02e91d86e38dac80d7 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 16 May 2025 00:59:08 +0000 Subject: [PATCH 26/29] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index f518037..f2ed4c2 100644 --- a/robots.json +++ b/robots.json @@ -336,11 +336,11 @@ "respect": "Yes" }, "QualifiedBot": { - "description": "Operated by Qualified as part of their suite of AI product offerings.", - "frequency": "No explicit frequency provided.", - "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", - "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time."
+ "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time." }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", @@ -405,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From ca918a963f735019a0c66343bf8338a9228d94f5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 21:16:49 -0700 Subject: [PATCH 27/29] chore(robots.json): adds Google-CloudVertexBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2ed4c2..b459533 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, + "Google-CloudVertexBot": { + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "Build and manage AI models for businesses employing Vertex AI", + "frequency": "No information.", + "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents." + }, "Google-Extended": { "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", @@ -405,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From dd1ed174b77ca2c0c4a40d6f4bce6beda4a1c296 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 16 May 2025 11:35:15 +0000 Subject: [PATCH 28/29] Merge pull request #129 from ai-robots-txt/google-cloudvertexbot chore(robots.json): adds Google-CloudVertexBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index de88e50..b2204d7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 43ad3bf..36fd20c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 9770b45..7389f10 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ FacebookBot Factset_spyderbot FirecrawlAgent FriendlyCrawler +Google-CloudVertexBot Google-Extended GoogleOther GoogleOther-Image diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index afbd77c..f05f785 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index dea4dd5..a5be10b 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Google-CloudVertexBot User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 57469fa..d8542b3 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this 
time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | From 7a2e6cba52e782ab32552c2335637af761afbe51 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 17 May 2025 00:57:28 +0000 Subject: [PATCH 29/29] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index b459533..1ecfcd8 100644 --- a/robots.json +++ b/robots.json @@ -412,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file
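A closing note on the unusual robot names that open this series (`a[mazing]{42}(robot)`, `2^32$`, `curl|sudo bash`): they are test fixtures chosen to exercise the PCRE escaping behind `list_to_pcre`, which `json_to_nginx` and `json_to_caddy` both call but whose body never appears in these patches. A minimal sketch of the behaviour the fixtures imply, assuming the helper simply escapes each name and joins the results into one alternation group:

```python
import re

def list_to_pcre(names):
    # Escape regex metacharacters in each robot name, then join the names
    # into a single alternation group such as (BotA|Bot\ B|Bot\-C).
    return "(" + "|".join(re.escape(name) for name in names) + ")"

# The deliberately nasty fixture names round-trip to the escaped forms
# visible in code/test_files/Caddyfile:
print(list_to_pcre(["a[mazing]{42}(robot)", "2^32$", "curl|sudo bash"]))
# -> (a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)
```

On Python 3.7+, `re.escape` also escapes spaces and hyphens, which matches forms like `Kangaroo\ Bot` and `Ai2Bot\-Dolma` in the generated configurations.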