mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-04 19:13:57 +00:00
chore: clean up bots and narrow scope
This commit is contained in:
parent
429336d725
commit
a90ee5e9f0
7 changed files with 7 additions and 90 deletions
|
@ -14,6 +14,12 @@ A number of these crawlers have been sourced from [Dark Visitors](https://darkvi
|
|||
|
||||
If you'd like to add information about a crawler to the list, please make a pull request with the bot name added to `robots.txt`, `ai.txt`, and any relevant details in `table-of-bot-metrics.md` to help people understand what's crawling.
|
||||
|
||||
## Additional resources
|
||||
|
||||
- [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight
|
||||
- [Blockin' bots.](https://ethanmarcotte.com/wrote/blockin-bots/) by Ethan Marcotte
|
||||
- [Blocking Bots With 11ty And Apache](https://flamedfury.com/posts/blocking-bots-with-11ty-and-apache/) by fLaMEd fury
|
||||
|
||||
---
|
||||
|
||||
Thank you to [Glyn](https://github.com/glyn) for pushing [me](https://coryd.dev) to set this up after [I posted about blocking these crawlers](https://coryd.dev/posts/2024/go-ahead-and-block-ai-web-crawlers/).
|
||||
|
|
|
@ -1,66 +0,0 @@
|
|||
/**
|
||||
* block-bots.js
|
||||
* View the original post by Jeremia Kimelman at:
|
||||
* https://www.jeremiak.com/blog/block-bots-netlify-edge-functions/
|
||||
*
|
||||
* modify `netlify.toml`
|
||||
* [[edge_functions]]
|
||||
* function = "block-bots"
|
||||
* path = "/*"
|
||||
*
|
||||
* Place at `netlify/edge-functions/block-bots.js`
|
||||
*
|
||||
* (Or adapt for your edge function-supporting platform of choice.)
|
||||
*/
|
||||
|
||||
// inspired by (and taken from) Ethan Marcotte's blog post
|
||||
// https://ethanmarcotte.com/wrote/blockin-bots/
|
||||
// User-agent substrings of the crawlers to block.
// Matching is a case-insensitive substring test against the request's
// User-Agent header, so shorter entries also cover longer variants
// (e.g. 'omgili' already matches 'omgilibot'); the longer entries are kept so
// this list stays in step with the other block lists in the repository.
const botUas = [
  'AdsBot-Google',
  'Amazonbot',
  'anthropic-ai',
  'Applebot-Extended',
  'AwarioRssBot',
  'AwarioSmartBot',
  'Bytespider',
  'CCBot',
  'ChatGPT-User',
  'ClaudeBot',
  'Claude-Web',
  'cohere-ai',
  'DataForSeoBot',
  'Diffbot',
  'FacebookBot',
  'FriendlyCrawler',
  'Google-Extended',
  'GoogleOther',
  'GPTBot',
  'img2dataset',
  'ImagesiftBot',
  'magpie-crawler',
  'Meltwater',
  'omgili',
  'omgilibot',
  'peer39_crawler',
  'peer39_crawler/1.0',
  'PerplexityBot',
  'PiplBot',
  'scoop.it',
  'Seekr',
  'YouBot',
];

// Lower-cased once at module load so each request only has to lower-case its
// own header value.
const lowerCasedBotUas = botUas.map((name) => name.toLowerCase());

/**
 * Edge-function handler that rejects requests from listed crawlers.
 *
 * @param {Request} request - Incoming request; only its `user-agent` header is read.
 * @param {{ next: () => Promise<Response> }} context - Edge-function context;
 *   `next()` is called to continue to the regular response for non-bot traffic.
 * @returns {Promise<Response>} An empty 401 response when the User-Agent
 *   contains any entry of `botUas` (case-insensitive), otherwise whatever
 *   `context.next()` resolves to.
 */
const blockBots = async (request, context) => {
  // Headers.get() returns null when the header is missing; fall back to an
  // empty string so header-less requests pass through instead of throwing.
  const ua = (request.headers.get('user-agent') || '').toLowerCase();

  // some() stops at the first match instead of scanning the whole list.
  const isBot = lowerCasedBotUas.some((name) => ua.includes(name));

  return isBot ? new Response(null, { status: 401 }) : context.next();
};

export default blockBots;
|
|
@ -10,7 +10,6 @@ User-agent: ChatGPT-User
|
|||
User-agent: ClaudeBot
|
||||
User-agent: Claude-Web
|
||||
User-agent: cohere-ai
|
||||
User-agent: DataForSeoBot
|
||||
User-agent: Diffbot
|
||||
User-agent: FacebookBot
|
||||
User-agent: FriendlyCrawler
|
||||
|
@ -18,7 +17,6 @@ User-agent: Google-Extended
|
|||
User-agent: GoogleOther
|
||||
User-agent: GPTBot
|
||||
User-agent: img2dataset
|
||||
User-agent: ImagesiftBot
|
||||
User-agent: magpie-crawler
|
||||
User-agent: Meltwater
|
||||
User-agent: omgili
|
||||
|
@ -28,6 +26,5 @@ User-agent: peer39_crawler/1.0
|
|||
User-agent: PerplexityBot
|
||||
User-agent: PiplBot
|
||||
User-agent: scoop.it
|
||||
User-agent: Seekr
|
||||
User-agent: YouBot
|
||||
Disallow: /
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
# for apache2.conf or .htaccess; intended to block via user agent string
|
||||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC]
|
||||
RewriteRule .* - [F,L]
|
|
@ -1 +0,0 @@
|
|||
(http.user_agent contains "AdsBot-Google") or (http.user_agent contains "Amazonbot") or (http.user_agent contains "anthropic-ai") or (http.user_agent contains "Applebot-Extended") or (http.user_agent contains "Bytespider") or (http.user_agent contains "CCBot") or (http.user_agent contains "ChatGPT-User") or (http.user_agent contains "ClaudeBot") or (http.user_agent contains "Claude-Web") or (http.user_agent contains "cohere-ai") or (http.user_agent contains "DataForSeoBot") or (http.user_agent contains "Diffbot") or (http.user_agent contains "FacebookBot") or (http.user_agent contains "FriendlyCrawler") or (http.user_agent contains "Google-Extended") or (http.user_agent contains "GoogleOther") or (http.user_agent contains "GPTBot") or (http.user_agent contains "img2dataset") or (http.user_agent contains "ImagesiftBot") or (http.user_agent contains "magpie-crawler") or (http.user_agent contains "Meltwater") or (http.user_agent contains "omgili") or (http.user_agent contains "omgilibot") or (http.user_agent contains "peer39_crawler") or (http.user_agent contains "PerplexityBot") or (http.user_agent contains "PiplBot") or (http.user_agent contains "scoop.it") or (http.user_agent contains "Seekr") or (http.user_agent contains "YouBot")
|
|
@ -1,6 +0,0 @@
|
|||
# for nginx.conf; intended to block via user agent string
|
||||
# note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access
|
||||
# and contributing if you're able: https://github.com/glyn/nginx_robot_access
|
||||
if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot-Extended|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) {
|
||||
return 403;
|
||||
}
|
|
@ -3,31 +3,22 @@
|
|||
| AdsBot-Google | Google | Yes (Exceptions for Dynamic Search Ads) | Analyzes website content for ad relevancy, improves ad serving for Google Ads. Data anonymized according to [Google's Privacy Policy](https://policies.google.com/privacy). Unclear on data retention or use by other products. | Varies depending on campaign activity and website updates. Crawls optimized to minimize impact, specific frequency not public. | Web crawler by Google Ads to analyze websites for ad effectiveness and ensure ad relevancy to webpage content. |
|
||||
|Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
|
||||
|anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||
|Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | | | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple’s foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
|
||||
|AwarioRssBot | | | | | |
|
||||
|AwarioSmartBot | | | | | |
|
||||
|Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | | | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
|
||||
|Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMs, including ChatGPT competitors. |
|
||||
|CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. |
|
||||
|ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
|
||||
|ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||
|Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
|
||||
|cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
|
||||
|DataForSeoBot | [DataForSEO](https://dataforseo.com/) | [Yes](https://dataforseo.com/dataforseo-bot) | Backlink checking and SEO data collection to be resold to clients. | As often as every 5 seconds. | Operated by DataForSEO to check backlinks and scrape SEO data for resale. |
|
||||
|Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
|
||||
|FacebookBot | | | | | |
|
||||
|Google-Extended| | | | | |
|
||||
|GoogleOther | | | | | |
|
||||
|GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information provided. | Data is used to train current and future models; paywalled data, PII, and data that violates the company's policies are removed. |
|
||||
| img2dataset | | | | | |
|
||||
|ImagesiftBot | | | | | |
|
||||
|magpie-crawler | | | | | |
|
||||
|Meltwater | | | | | |
|
||||
|omgili | | | | | |
|
||||
|omgilibot | | | | | |
|
||||
|peer39_crawler| | | | | |
|
||||
|peer39_crawler/1.0| | | | | |
|
||||
|PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/docs/perplexitybot) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
|
||||
|PiplBot | | | | | |
|
||||
|scoop.it | | | | | |
|
||||
|Seekr | | | | | |
|
||||
|YouBot | | | | | |
|
||||
|
|
Loading…
Reference in a new issue