chore: propagate applebot-extended out

This commit is contained in:
Cory Dransfeldt 2024-06-12 18:53:26 -07:00
parent c546dc048c
commit 1ac25b9135
No known key found for this signature in database
4 changed files with 4 additions and 2 deletions

View file

@ -20,6 +20,7 @@ const botUas = [
'Amazonbot',
'anthropic-ai',
'Applebot',
'Applebot-Extended',
'AwarioRssBot',
'AwarioSmartBot',
'Bytespider',

View file

@ -1,4 +1,4 @@
# for apache2.conf or .htaccess; intended to block via user agent string
RewriteEngine On
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
RewriteRule .* - [F,L]

View file

@ -1,6 +1,6 @@
# for nginx.conf; intended to block via user agent string
# note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
# and contributing if you're able: https://github.com/glyn/nginx_robot_access
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
return 403;
}

View file

@ -4,6 +4,7 @@
|Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
|anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|Applebot | Apple | Yes | Indexes sites to provide answers and search results for Siri users. | Irregular and may be prompted by user queries. | Used to answer queries from users; may included references to the indexed site. |
|Applebot-Extended | | | | | |
|AwarioRssBot | | | | | |
|AwarioSmartBot | | | | | |
|Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |