mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-05 03:17:46 +00:00
chore: propagate applebot-extended out
This commit is contained in:
parent
c546dc048c
commit
1ac25b9135
4 changed files with 4 additions and 2 deletions
|
@ -20,6 +20,7 @@ const botUas = [
|
|||
'Amazonbot',
|
||||
'anthropic-ai',
|
||||
'Applebot',
|
||||
'Applebot-Extended',
|
||||
'AwarioRssBot',
|
||||
'AwarioSmartBot',
|
||||
'Bytespider',
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# for apache2.conf or .htaccess; intended to block via user agent string
|
||||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
|
||||
RewriteRule .* - [F,L]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# for nginx.conf; intended to block via user agent string
|
||||
# note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
|
||||
# and contributing if you're able: https://github.com/glyn/nginx_robot_access
|
||||
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
|
||||
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
|
||||
return 403;
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
|Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
|
||||
|anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||
|Applebot | Apple | Yes | Indexes sites to provide answers and search results for Siri users. | Irregular and may be prompted by user queries. | Used to answer queries from users; may include references to the indexed site. |
|
||||
|Applebot-Extended | | | | | |
|
||||
|AwarioRssBot | | | | | |
|
||||
|AwarioSmartBot | | | | | |
|
||||
|Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMs, including ChatGPT competitors. |
|
||||
|
|
Loading…
Reference in a new issue