From 609de2af12a540c4d8cc6806b3911398aa58705b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:26:04 -0700 Subject: [PATCH 1/8] chore: create apache.conf.txt --- apache.conf.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 apache.conf.txt diff --git a/apache.conf.txt b/apache.conf.txt new file mode 100644 index 0000000..ee8cb3b --- /dev/null +++ b/apache.conf.txt @@ -0,0 +1,4 @@ +# for apache2.conf or .htaccess; intended to block via user agent string +RewriteEngine On +RewriteCond %{HTTP_USER_AGENT (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] +RewriteRule .* - [F,L] From 74e4f64d898be63e2564d67dc3b98c0c4cf4bbbc Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:27:59 -0700 Subject: [PATCH 2/8] chore: create nginx.conf.txt --- nginx.conf.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 nginx.conf.txt diff --git a/nginx.conf.txt b/nginx.conf.txt new file mode 100644 index 0000000..9446604 --- /dev/null +++ b/nginx.conf.txt @@ -0,0 +1,4 @@ +# for nginx.conf; intended to block via user agent string +if ($http_user_agent ~ (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) ) { + return 403; +} From d6d40989f4008fc0c054c499d741c8617c8e1627 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:40:59 -0700 Subject: [PATCH 3/8] chore: add FriendlyCrawler to robots.txt --- robots.txt | 1 + 1 file changed, 1 insertion(+) diff --git 
a/robots.txt b/robots.txt index c3d0a13..e8574d2 100644 --- a/robots.txt +++ b/robots.txt @@ -12,6 +12,7 @@ User-agent: Claude-Web User-agent: cohere-ai User-agent: DataForSeoBot User-agent: FacebookBot +User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther User-agent: GPTBot From 2d4efec184771bec66e4be84bbec10bc91cfeb52 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:41:19 -0700 Subject: [PATCH 4/8] chore: add FriendlyCrawler to apache.conf.txt --- apache.conf.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apache.conf.txt b/apache.conf.txt index ee8cb3b..9057bdc 100644 --- a/apache.conf.txt +++ b/apache.conf.txt @@ -1,4 +1,4 @@ # for apache2.conf or .htaccess; intended to block via user agent string RewriteEngine On -RewriteCond %{HTTP_USER_AGENT (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] RewriteRule .* - [F,L] From e03d5f4ec2e4dc612eebe09b5d864f6624773ea1 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:41:40 -0700 Subject: [PATCH 5/8] chore: add FriendlyCrawler to nginx.conf.txt --- nginx.conf.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nginx.conf.txt b/nginx.conf.txt index 9446604..883f1f0 100644 --- a/nginx.conf.txt +++ b/nginx.conf.txt @@ -1,4 +1,4 @@ # for nginx.conf; intended to block via 
user agent string -if ($http_user_agent ~ (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) ) { +if ($http_user_agent ~ (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) ) { return 403; } From 0518b3e7776f0729ff49998f901afc2341bc1a49 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 8 Apr 2024 12:50:36 -0700 Subject: [PATCH 6/8] chore: add FriendlyCrawler to the table --- table-of-bot-metrics.md | 61 +++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index aeacccd..623253d 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -1,30 +1,31 @@ -|Name |Operator |Respects `robots.txt` |Data use |Visit regularity |Description | -|----------------|---------|-----------------------|----------|------------------|-------------| -|AdsBot-Google | | | | | | -|Amazonbot | | | | | | -|anthropic-ai | | | | | | -|Applebot | | | | | | -|AwarioRssBot | | | | | | -|AwarioSmartBot | | | | | | -|Bytespider | | | | | | -|CCBot | | | | | | -|ChatGPT-User | | | | | | -|ClaudeBot | | | | | | -|Claude-Web | | | | | | -|coher-ai | | | | | | -|DataForSeoBot | | | | | | -|FacebookBot | | | | | | -|Google-Extended| | | | | | -|GoogleOther | | | | | | -|GPTBot | | | | | | -|ImagesiftBot | | | | | | -|magpie-crawler | | | | | | -|Meltwater | | | | | | -|omgili | | | | | | -|omgilibot | | | | | | -|peer39_crawler| | | | | 
| -|peer39_crawler/1.0| | | | | | -|PerplexityBot | | | | | | -|PiplBot | | | | | | -|Seekr | | | | | | -|YouBot | | | | | | \ No newline at end of file +|Name |Operator |Respects `robots.txt` |Data use |Visit regularity |Description | +|-------------------|---------|-----------------------|----------|------------------|-------------| +|AdsBot-Google | | | | | | +|Amazonbot | | | | | | +|anthropic-ai | | | | | | +|Applebot | | | | | | +|AwarioRssBot | | | | | | +|AwarioSmartBot | | | | | | +|Bytespider | | | | | | +|CCBot | | | | | | +|ChatGPT-User | | | | | | +|ClaudeBot | | | | | | +|Claude-Web | | | | | | +|cohere-ai | | | | | | +|DataForSeoBot | | | | | | +|FacebookBot | | | | | | +|FriendlyCrawler | | | | | | +|Google-Extended | | | | | | +|GoogleOther | | | | | | +|GPTBot | | | | | | +|ImagesiftBot | | | | | | +|magpie-crawler | | | | | | +|Meltwater | | | | | | +|omgili | | | | | | +|omgilibot | | | | | | +|peer39_crawler | | | | | | +|peer39_crawler/1.0 | | | | | | +|PerplexityBot | | | | | | +|PiplBot | | | | | | +|Seekr | | | | | | +|YouBot | | | | | | From ed190d48035584020dc47da3e16e47db7858ad73 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 9 Apr 2024 16:08:13 -0700 Subject: [PATCH 7/8] fix: syntax --- apache.conf.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apache.conf.txt b/apache.conf.txt index 9057bdc..83d2cc6 100644 --- a/apache.conf.txt +++ b/apache.conf.txt @@ -1,4 +1,4 @@ # for apache2.conf or .htaccess; intended to block via user agent string RewriteEngine On -RewriteCond %{HTTP_USER_AGENT (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT}
(AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) [NC] RewriteRule .* - [F,L] From b6be895994b75f0cb925d68ea454004d7b4e663d Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 9 Apr 2024 16:08:57 -0700 Subject: [PATCH 8/8] fix: clean up syntax --- nginx.conf.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nginx.conf.txt b/nginx.conf.txt index 883f1f0..289b937 100644 --- a/nginx.conf.txt +++ b/nginx.conf.txt @@ -1,4 +1,4 @@ # for nginx.conf; intended to block via user agent string -if ($http_user_agent ~ (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot) ) { +if ($http_user_agent ~* (AdsBot-Google|Amazonbot|anthropic-ai|Applebot|AwarioRssBot|AwarioSmartBot|Bytespider|CCBot|ChatGPT-User|ClaudeBot|Claude-Web|cohere-ai|DataForSeoBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GPTBot|ImagesiftBot|magpie-crawler|Meltwater|omgili|omgilibot|peer39_crawler|peer39_crawler/1.0|PerplexityBot|PiplBot|Seekr|YouBot)) { return 403; }