From dea035365f5f659b6d512903a72973d8a66de689 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sun, 5 May 2024 14:50:04 -0700 Subject: [PATCH] chore: add Diffbot and scoopit --- edge-functions/block-bots.js | 2 ++ robots.txt | 2 ++ servers/apache.conf | 2 +- servers/nginx.conf | 2 +- table-of-bot-metrics.md | 2 ++ 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/edge-functions/block-bots.js b/edge-functions/block-bots.js index 5fc3c84..158e6c1 100644 --- a/edge-functions/block-bots.js +++ b/edge-functions/block-bots.js @@ -29,6 +29,7 @@ const botUas = [ 'Claude-Web', 'cohere-ai', 'DataForSeoBot', + 'Diffbot', 'FacebookBot', 'FriendlyCrawler', 'Google-Extended', @@ -44,6 +45,7 @@ const botUas = [ 'peer39_crawler/1.0', 'PerplexityBot', 'PiplBot', + 'scoop.it', 'Seekr', 'YouBot', ] diff --git a/robots.txt b/robots.txt index 54b4151..fc122f3 100644 --- a/robots.txt +++ b/robots.txt @@ -11,6 +11,7 @@ User-agent: ClaudeBot User-agent: Claude-Web User-agent: cohere-ai User-agent: DataForSeoBot +User-agent: Diffbot User-agent: FacebookBot User-agent: FriendlyCrawler User-agent: Google-Extended @@ -26,6 +27,7 @@ User-agent: peer39_crawler User-agent: peer39_crawler/1.0 User-agent: PerplexityBot User-agent: PiplBot +User-agent: scoop.it User-agent: Seekr User-agent: YouBot Disallow: / diff --git a/servers/apache.conf b/servers/apache.conf index 4ae7673..fc4807f 100644 --- a/servers/apache.conf +++ b/servers/apache.conf @@ -1,4 +1,4 @@ # for apache2.conf or .htaccess; intended to block via user agent string RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot) [NC] RewriteRule .* - [F,L] diff --git a/servers/nginx.conf b/servers/nginx.conf index 98b4527..099285d 100644 --- a/servers/nginx.conf +++ b/servers/nginx.conf @@ -1,6 +1,6 @@ # for nginx.conf; intended to block via user agent string # note: we recommend following the approach outlined here https://underlap.org/nginx-robot-access # and contributing if you're able: https://github.com/glyn/nginx_robot_access -if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot)) { +if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|Diffbot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|img2dataset|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|scoop.it|Seekr|YouBot)) { return 403; } diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 13382a4..b3be8c3 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -13,6 +13,7 @@ |Claude-Web | | | | | | |coher-ai | | | | | | |DataForSeoBot | | | | | | +|Diffbot | | | | | | |FacebookBot | | | | | | |Google-Extended| | | | | | |GoogleOther | | | | | | @@ -27,5 +28,6 @@ |peer39_crawler/1.0| | | | | | |PerplexityBot | | | | | | |PiplBot | | | | | | +|scoop.it | | | | | | |Seekr | | | | | | |YouBot | | | | | |