Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-05-17 16:03:10 +00:00)
Compare commits: 6 commits, 120bd4962a...d0ef519283
| Author | SHA1 | Date |
|---|---|---|
| | d0ef519283 | |
| | 4a764bba18 | |
| | a891ad7213 | |
| | b65f45e408 | |
| | 8dc36aa2e2 | |
| | ae8f74c10c | |
5 changed files with 28 additions and 11 deletions
|
@ -1,3 +1,3 @@
|
|||
RewriteEngine On
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
|
||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
|
||||
RewriteRule !^/?robots\.txt$ - [F,L]
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
|
||||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
|
||||
return 403;
|
||||
}
|
30
robots.json
30
robots.json
|
@ -69,13 +69,6 @@
|
|||
"frequency": "Only when prompted by a user.",
|
||||
"description": "Used by plugins in ChatGPT to answer queries based on user input."
|
||||
},
|
||||
"Claude-Web": {
|
||||
"operator": "[Anthropic](https://www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data to train Anthropic's AI products.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
|
||||
},
|
||||
"ClaudeBot": {
|
||||
"operator": "[Anthropic](https://www.anthropic.com)",
|
||||
"respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
|
||||
|
@ -83,6 +76,20 @@
|
|||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
|
||||
},
|
||||
"Claude-User": {
|
||||
"operator": "[Anthropic](https://www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."
|
||||
},
|
||||
"Claude-SearchBot": {
|
||||
"operator": "[Anthropic](https://www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
|
||||
},
|
||||
"cohere-ai": {
|
||||
"operator": "[Cohere](https://cohere.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
|
@ -195,6 +202,13 @@
|
|||
"operator": "[img2dataset](https://github.com/rom1504/img2dataset)",
|
||||
"respect": "Unclear at this time."
|
||||
},
|
||||
"imgproxy": {
|
||||
"frequency": "No information.",
|
||||
"function": "Not documented or explained on operator's site.",
|
||||
"operator": "[imgproxy](https://imgproxy.net)",
|
||||
"respect": "Unclear at this time.",
|
||||
"description": "AI-powered image processing."
|
||||
},
|
||||
"ISSCyberRiskCrawler": {
|
||||
"description": "Used to train machine learning based models to quantify cyber risk.",
|
||||
"frequency": "No information.",
|
||||
|
@ -328,4 +342,4 @@
|
|||
"frequency": "No information.",
|
||||
"description": "Retrieves data used for You.com web search engine and LLMs."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,8 +8,9 @@ User-agent: Brightbot 1.0
|
|||
User-agent: Bytespider
|
||||
User-agent: CCBot
|
||||
User-agent: ChatGPT-User
|
||||
User-agent: Claude-Web
|
||||
User-agent: ClaudeBot
|
||||
User-agent: Claude-User
|
||||
User-agent: Claude-SearchBot
|
||||
User-agent: cohere-ai
|
||||
User-agent: cohere-training-data-crawler
|
||||
User-agent: Crawlspace
|
||||
|
@ -26,6 +27,7 @@ User-agent: iaskspider/2.0
|
|||
User-agent: ICC-Crawler
|
||||
User-agent: ImagesiftBot
|
||||
User-agent: img2dataset
|
||||
User-agent: imgproxy
|
||||
User-agent: ISSCyberRiskCrawler
|
||||
User-agent: Kangaroo Bot
|
||||
User-agent: Meta-ExternalAgent
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
|
||||
| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. |
|
||||
| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. |
|
||||
| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
|
||||
| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
|
||||
| Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
|
||||
| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue