diff --git a/edge-functions/block-bots.js b/edge-functions/block-bots.js
index 158e6c1..449a11b 100644
--- a/edge-functions/block-bots.js
+++ b/edge-functions/block-bots.js
@@ -20,6 +20,7 @@ const botUas = [
   'Amazonbot',
   'anthropic-ai',
   'Applebot',
+  'Applebot-Extended',
   'AwarioRssBot',
   'AwarioSmartBot',
   'Bytespider',
diff --git a/servers/apache.conf b/servers/apache.conf
index fc4807f..38f3198 100644
--- a/servers/apache.conf
+++ b/servers/apache.conf
@@ -1,4 +1,4 @@
 # for apache2.conf or .htaccess; intended to block via user agent string
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
 RewriteRule .* - [F,L]
diff --git a/servers/nginx.conf b/servers/nginx.conf
index 099285d..99345d2 100644
--- a/servers/nginx.conf
+++ b/servers/nginx.conf
@@ -1,6 +1,6 @@
 # for nginx.conf; intended to block via user agent string
 # note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
 # and contributing if you're able: https://github.com/glyn/nginx_robot_access
-if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
+if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
     return 403;
 }
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index f64242b..213ac81 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -4,6 +4,7 @@
 |Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
 |anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 |Applebot | Apple | Yes | Indexes sites to provide answers and search results for Siri users. | Irregular and may be prompted by user queries. | Used to answer queries from users; may included references to the indexed site. |
+|Applebot-Extended | | | | | |
 |AwarioRssBot | | | | | |
 |AwarioSmartBot | | | | | |
 |Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
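
For reviewers, a minimal sketch of how a list like `botUas` is typically applied in an edge function, so the effect of adding `Applebot-Extended` is clear. Only the user agent strings come from the diff; the handler shape, runtime (a fetch-style edge runtime such as Cloudflare Workers), case-insensitive matching, and origin passthrough are assumptions, not taken from the actual `block-bots.js`.

```js
// Sketch only: assumes a fetch-style edge runtime; the real block-bots.js may differ.
const botUas = [
  'Amazonbot',
  'anthropic-ai',
  'Applebot',
  'Applebot-Extended',
  'Bytespider',
  // ...remaining user agent strings from the list
];

export default {
  async fetch(request) {
    const ua = (request.headers.get('user-agent') || '').toLowerCase();

    // Case-insensitive substring match, mirroring the [NC] flag in apache.conf
    // and the ~* operator in nginx.conf.
    const blocked = botUas.some((bot) => ua.includes(bot.toLowerCase()));

    if (blocked) {
      // Same outcome as the server configs: refuse the request outright.
      return new Response('Forbidden', { status: 403 });
    }

    // Assumed passthrough: forward non-matching requests to the origin.
    return fetch(request);
  },
};
```

Because `Applebot` is a prefix of `Applebot-Extended`, a substring or unanchored regex match on `Applebot` would already catch the extended agent; listing `Applebot-Extended` separately keeps the three rule sets and the metrics table explicit and in sync.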