diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 042cc13..c98861f 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -19,10 +19,3 @@ jobs:
       - name: Run tests
         run: |
           code/tests.py
-  lint-json:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-      - name: JQ Json Lint
-        run: jq . robots.json
diff --git a/.htaccess b/.htaccess
index b2204d7..b4ab72f 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
deleted file mode 100644
index 36fd20c..0000000
--- a/Caddyfile
+++ /dev/null
@@ -1,3 +0,0 @@
-@aibots {
-    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 232b3ed..b984672 100644
--- a/README.md
+++ b/README.md
@@ -14,8 +14,6 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
-- `Caddyfile`
-- `haproxy-block-ai-bots.txt`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
@@ -24,25 +22,6 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
-`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile, the rejection can then be handled as followed `abort @aibots`
-
-`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it;
-1. Add the file to the config directory of HAProxy
-2. Add the following lines in the `frontend` section;
-   ```
-   acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
-   http-request deny if ai_robot
-   ```
-   (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
-
-
-[Bing uses the data it crawls for AI and training, you may opt out by adding a `meta` tag to the `head` of your site.](./docs/additional-steps/bing.md)
-
-### Related
-
-- [Robots.txt Traefik plugin](https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin):
-middleware plugin for [Traefik](https://traefik.io/traefik/) to automatically add rules of [robots.txt](./robots.txt)
-file on-the-fly.
 
 ## Contributing
 
diff --git a/code/robots.py b/code/robots.py
index c795649..8a06b55 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -179,19 +179,6 @@ def json_to_nginx(robot_json):
     return config
 
 
-def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n "
-    caddyfile += f'   header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
-
-def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
-
-
-
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
@@ -221,15 +208,6 @@ def conversions():
         file_name="./nginx-block-ai-bots.conf",
         converter=json_to_nginx,
     )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
 
 
 if __name__ == "__main__":
diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile
deleted file mode 100644
index 82f365a..0000000
--- a/code/test_files/Caddyfile
+++ /dev/null
@@ -1,3 +0,0 @@
-@aibots {
-    header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)"
-}
\ No newline at end of file
diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt
deleted file mode 100644
index 5ed6939..0000000
--- a/code/test_files/haproxy-block-ai-bots.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-AI2Bot
-Ai2Bot-Dolma
-Amazonbot
-anthropic-ai
-Applebot
-Applebot-Extended
-Bytespider
-CCBot
-ChatGPT-User
-Claude-Web
-ClaudeBot
-cohere-ai
-Diffbot
-FacebookBot
-facebookexternalhit
-FriendlyCrawler
-Google-Extended
-GoogleOther
-GoogleOther-Image
-GoogleOther-Video
-GPTBot
-iaskspider/2.0
-ICC-Crawler
-ImagesiftBot
-img2dataset
-ISSCyberRiskCrawler
-Kangaroo Bot
-Meta-ExternalAgent
-Meta-ExternalFetcher
-OAI-SearchBot
-omgili
-omgilibot
-Perplexity-User
-PerplexityBot
-PetalBot
-Scrapy
-Sidetrade indexer bot
-Timpibot
-VelenPublicWebCrawler
-Webzio-Extended
-YouBot
-crawler.with.dots
-star***crawler
-Is this a crawler?
-a[mazing]{42}(robot)
-2^32$
-curl|sudo bash
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index 434406f..f58b445 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -60,33 +60,12 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
-class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
-    maxDiff = 8192
-
-    def setUp(self):
-        self.robots_dict = self.loadJson("test_files/robots.json")
-
-    def test_haproxy_generation(self):
-        robots_haproxy = json_to_haproxy(self.robots_dict)
-        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
-
 class TestRobotsNameCleaning(unittest.TestCase):
     def test_clean_name(self):
         from robots import clean_robot_name
 
         self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
-class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
-    maxDiff = 8192
-
-    def setUp(self):
-        self.robots_dict = self.loadJson("test_files/robots.json")
-
-    def test_caddyfile_generation(self):
-        robots_caddyfile = json_to_caddy(self.robots_dict)
-        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
-
-
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md
deleted file mode 100644
index 37c60c7..0000000
--- a/docs/additional-steps/bing.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Bing (bingbot)
-
-It's not well publicised, but Bing uses the data it crawls for AI and training.
-
-However, the current thinking is, blocking a search engine of this size using `robots.txt` seems a quite drastic approach as it is second only to Google and could significantly impact your website in search results.
-
-Additionally, Bing powers a number of search engines such as Yahoo and AOL, and its search results are also used in Duck Duck Go, amongst others.
-
-Fortunately, Bing supports a relatively simple opt-out method, requiring an additional step.
-
-## How to opt-out of AI training
-
-You must add a metatag in the `<head>` of your webpage. This also needs to be added to every page on your website.
-
-The line you need to add is:
-
-```plaintext
-<meta name="robots" content="noarchive">
-```
-
-By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models."
-
-## Will my site be negatively affected
-
-Simple answer, no.
-The original use of "noarchive" has been retired by all search engines. Google retired its use in 2024.
-
-The use of this metatag will not impact your site in search engines or in any other meaningful way if you add it to your page(s).
-
-It is now solely used by a handful of crawlers, such as Bingbot and Amazonbot, to signify to them not to use your data for AI/training.
-
-## Resources
-
-Bing Blog AI opt-out announcement: https://blogs.bing.com/webmaster/september-2023/Announcing-new-options-for-webmasters-to-control-usage-of-their-content-in-Bing-Chat
-
-Bing metatag information, including AI opt-out: https://www.bing.com/webmasters/help/which-robots-metatags-does-bing-support-5198d240
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
deleted file mode 100644
index 7389f10..0000000
--- a/haproxy-block-ai-bots.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-AI2Bot
-Ai2Bot-Dolma
-aiHitBot
-Amazonbot
-anthropic-ai
-Applebot
-Applebot-Extended
-Brightbot 1.0
-Bytespider
-CCBot
-ChatGPT-User
-Claude-Web
-ClaudeBot
-cohere-ai
-cohere-training-data-crawler
-Cotoyogi
-Crawlspace
-Diffbot
-DuckAssistBot
-FacebookBot
-Factset_spyderbot
-FirecrawlAgent
-FriendlyCrawler
-Google-CloudVertexBot
-Google-Extended
-GoogleOther
-GoogleOther-Image
-GoogleOther-Video
-GPTBot
-iaskspider/2.0
-ICC-Crawler
-ImagesiftBot
-img2dataset
-imgproxy
-ISSCyberRiskCrawler
-Kangaroo Bot
-meta-externalagent
-Meta-ExternalAgent
-meta-externalfetcher
-Meta-ExternalFetcher
-NovaAct
-OAI-SearchBot
-omgili
-omgilibot
-Operator
-PanguBot
-Perplexity-User
-PerplexityBot
-PetalBot
-QualifiedBot
-Scrapy
-SemrushBot-OCOB
-SemrushBot-SWA
-Sidetrade indexer bot
-TikTokSpider
-Timpibot
-VelenPublicWebCrawler
-Webzio-Extended
-YouBot
\ No newline at end of file
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index f05f785..090275a 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.json b/robots.json
index 1ecfcd8..698b31e 100644
--- a/robots.json
+++ b/robots.json
@@ -160,13 +160,6 @@
         "operator": "Unknown",
         "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)"
     },
-    "Google-CloudVertexBot": {
-        "operator": "Google",
-        "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)",
-        "function": "Build and manage AI models for businesses employing Vertex AI",
-        "frequency": "No information.",
-        "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents."
-    },
     "Google-Extended": {
         "operator": "Google",
         "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)",
@@ -251,27 +244,13 @@
         "frequency": "Unclear at this time.",
         "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot"
     },
-    "meta-externalagent": {
+    "Meta-ExternalAgent": {
         "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)",
-        "respect": "Yes",
+        "respect": "Yes.",
         "function": "Used to train models and improve products.",
         "frequency": "No information.",
         "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
     },
-    "Meta-ExternalAgent": {
-        "operator": "Unclear at this time.",
-        "respect": "Unclear at this time.",
-        "function": "AI Data Scrapers",
-        "frequency": "Unclear at this time.",
-        "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent"
-    },
-    "meta-externalfetcher": {
-        "operator": "Unclear at this time.",
-        "respect": "Unclear at this time.",
-        "function": "AI Assistants",
-        "frequency": "Unclear at this time.",
-        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
-    },
     "Meta-ExternalFetcher": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
@@ -342,13 +321,6 @@
         "operator": "[Huawei](https://huawei.com/)",
         "respect": "Yes"
     },
-    "QualifiedBot": {
-        "description": "Operated by Qualified as part of their suite of AI product offerings.",
-        "frequency": "No explicit frequency provided.",
-        "function": "Company offers AI agents and other related products; usage can be assumed to support said products.",
-        "operator": "[Qualified](https://www.qualified.com)",
-        "respect": "Unclear at this time."
-    },
     "Scrapy": {
         "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"",
         "frequency": "No information.",
@@ -412,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}
diff --git a/robots.txt b/robots.txt
index a5be10b..1f8eaf2 100644
--- a/robots.txt
+++ b/robots.txt
@@ -21,7 +21,6 @@ User-agent: FacebookBot
 User-agent: Factset_spyderbot
 User-agent: FirecrawlAgent
 User-agent: FriendlyCrawler
-User-agent: Google-CloudVertexBot
 User-agent: Google-Extended
 User-agent: GoogleOther
 User-agent: GoogleOther-Image
@@ -34,9 +33,7 @@ User-agent: img2dataset
 User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
-User-agent: meta-externalagent
 User-agent: Meta-ExternalAgent
-User-agent: meta-externalfetcher
 User-agent: Meta-ExternalFetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
@@ -47,7 +44,6 @@ User-agent: PanguBot
 User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
-User-agent: QualifiedBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
 User-agent: SemrushBot-SWA
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index d8542b3..0249766 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -23,7 +23,6 @@
 | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
 | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
-| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. |
 | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
 | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
@@ -36,9 +35,7 @@
 | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
-| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent |
-| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
@@ -49,7 +46,6 @@
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
-| QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |