mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-05 11:27:45 +00:00
chore: keep Applebot-Extended and drop Applebot, as the latter is used only for search
This commit is contained in:
parent
548a6686ab
commit
3f65a93891
6 changed files with 3 additions and 6 deletions
|
@ -19,7 +19,6 @@ const botUas = [
|
||||||
'AdsBot-Google',
|
'AdsBot-Google',
|
||||||
'Amazonbot',
|
'Amazonbot',
|
||||||
'anthropic-ai',
|
'anthropic-ai',
|
||||||
'Applebot',
|
|
||||||
'Applebot-Extended',
|
'Applebot-Extended',
|
||||||
'AwarioRssBot',
|
'AwarioRssBot',
|
||||||
'AwarioSmartBot',
|
'AwarioSmartBot',
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
User-agent: AdsBot-Google
|
User-agent: AdsBot-Google
|
||||||
User-agent: Amazonbot
|
User-agent: Amazonbot
|
||||||
User-agent: anthropic-ai
|
User-agent: anthropic-ai
|
||||||
User-agent: Applebot
|
|
||||||
User-agent: Applebot-Extended
|
User-agent: Applebot-Extended
|
||||||
User-agent: AwarioRssBot
|
User-agent: AwarioRssBot
|
||||||
User-agent: AwarioSmartBot
|
User-agent: AwarioSmartBot
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
# for apache2.conf or .htaccess; intended to block via user agent string
|
# for apache2.conf or .htaccess; intended to block via user agent string
|
||||||
RewriteEngine On
|
RewriteEngine On
|
||||||
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
|
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
|
||||||
RewriteRule .* - [F,L]
|
RewriteRule .* - [F,L]
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
(http.user_agent contains "AdsBot-Google") or (http.user_agent contains "Amazonbot") or (http.user_agent contains "anthropic-ai") or (http.user_agent contains "Applebot") or (http.user_agent contains "Bytespider") or (http.user_agent contains "CCBot") or (http.user_agent contains "ChatGPT-User") or (http.user_agent contains "ClaudeBot") or (http.user_agent contains "Claude-Web") or (http.user_agent contains "cohere-ai") or (http.user_agent contains "DataForSeoBot") or (http.user_agent contains "Diffbot") or (http.user_agent contains "FacebookBot") or (http.user_agent contains "FriendlyCrawler") or (http.user_agent contains "Google-Extended") or (http.user_agent contains "GoogleOther") or (http.user_agent contains "GPTBot") or (http.user_agent contains "img2dataset") or (http.user_agent contains "ImagesiftBot") or (http.user_agent contains "magpie-crawler") or (http.user_agent contains "Meltwater") or (http.user_agent contains "omgili") or (http.user_agent contains "omgilibot") or (http.user_agent contains "peer39_crawler") or (http.user_agent contains "PerplexityBot") or (http.user_agent contains "PiplBot") or (http.user_agent contains "scoop.it") or (http.user_agent contains "Seekr") or (http.user_agent contains "YouBot")
|
(http.user_agent contains "AdsBot-Google") or (http.user_agent contains "Amazonbot") or (http.user_agent contains "anthropic-ai") or (http.user_agent contains "Applebot-Extended") or (http.user_agent contains "Bytespider") or (http.user_agent contains "CCBot") or (http.user_agent contains "ChatGPT-User") or (http.user_agent contains "ClaudeBot") or (http.user_agent contains "Claude-Web") or (http.user_agent contains "cohere-ai") or (http.user_agent contains "DataForSeoBot") or (http.user_agent contains "Diffbot") or (http.user_agent contains "FacebookBot") or (http.user_agent contains "FriendlyCrawler") or (http.user_agent contains "Google-Extended") or (http.user_agent contains "GoogleOther") or (http.user_agent contains "GPTBot") or (http.user_agent contains "img2dataset") or (http.user_agent contains "ImagesiftBot") or (http.user_agent contains "magpie-crawler") or (http.user_agent contains "Meltwater") or (http.user_agent contains "omgili") or (http.user_agent contains "omgilibot") or (http.user_agent contains "peer39_crawler") or (http.user_agent contains "PerplexityBot") or (http.user_agent contains "PiplBot") or (http.user_agent contains "scoop.it") or (http.user_agent contains "Seekr") or (http.user_agent contains "YouBot")
|
|
@ -1,6 +1,6 @@
|
||||||
# for nginx.conf; intended to block via user agent string
|
# for nginx.conf; intended to block via user agent string
|
||||||
# note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
|
# note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
|
||||||
# and contributing if you're able: https://github.com/glyn/nginx_robot_access
|
# and contributing if you're able: https://github.com/glyn/nginx_robot_access
|
||||||
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
|
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
|
||||||
return 403;
|
return 403;
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,7 +3,6 @@
|
||||||
| AdsBot-Google | Google | Yes (Exceptions for Dynamic Search Ads) | Analyzes website content for ad relevancy, improves ad serving for Google Ads. Data anonymized according to [Google's Privacy Policy](https://policies.google.com/privacy). Unclear on data retention or use by other products. | Varies depending on campaign activity and website updates. Crawls optimized to minimize impact, specific frequency not public. | Web crawler by Google Ads to analyze websites for ad effectiveness and ensure ad relevancy to webpage content. |
|
| AdsBot-Google | Google | Yes (Exceptions for Dynamic Search Ads) | Analyzes website content for ad relevancy, improves ad serving for Google Ads. Data anonymized according to [Google's Privacy Policy](https://policies.google.com/privacy). Unclear on data retention or use by other products. | Varies depending on campaign activity and website updates. Crawls optimized to minimize impact, specific frequency not public. | Web crawler by Google Ads to analyze websites for ad effectiveness and ensure ad relevancy to webpage content. |
|
||||||
|Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
|
|Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
|
||||||
|anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
|anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||||
|Applebot | Apple | Yes | Indexes sites to provide answers and search results for Siri users. | Irregular and may be prompted by user queries. | Used to answer queries from users; may include references to the indexed site. |
|
|
||||||
|Applebot-Extended | | | | | |
|
|Applebot-Extended | | | | | |
|
||||||
|AwarioRssBot | | | | | |
|
|AwarioRssBot | | | | | |
|
||||||
|AwarioSmartBot | | | | | |
|
|AwarioSmartBot | | | | | |
|
||||||
|
|
Loading…
Reference in a new issue