From a6de89e6bdcc552a13ac7bd56b78017d251e01bc Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 26 Sep 2024 21:41:28 +0000 Subject: [PATCH] feat: make CCBot entry more accurate --- robots.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index a53cebd..12ed898 100644 --- a/robots.json +++ b/robots.json @@ -42,10 +42,10 @@ "respect": "No" }, "CCBot": { - "description": "Sources data that is made openly available and is used to train AI models.", - "frequency": "Unclear at this time.", - "function": "Provides crawl data for an open source repository that has been used to train LLMs.", - "operator": "[Common Crawl](https://commoncrawl.org)", + "description": "Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers).", + "frequency": "Monthly at present.", + "function": "Provides an open crawl dataset, used for many purposes, including machine learning/AI.", + "operator": "[Common Crawl Foundation](https://commoncrawl.org)", "respect": "[Yes](https://commoncrawl.org/ccbot)" }, "ChatGPT-User": {