From 4a764bba18f10167cb5f7107c8721e5dc208100f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 10 Apr 2025 19:22:34 +0000 Subject: [PATCH] Merge pull request #102 from ai-robots-txt/imgproxy-bot chore(robots.json): adds imgproxy crawler --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 27a7e11..c0e5fbb 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 0577bd9..a6bbfa2 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index c531918..de25a56 100644 --- a/robots.txt +++ b/robots.txt @@ -26,6 +26,7 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset +User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d92df34..b3e51fe 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -28,6 +28,7 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |