mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-05-17 16:03:10 +00:00
Update robots.json
Updated robots list with five new proposed AI bots: aiHitBot, Cotoyogi, Factset_spyderbot, FirecrawlAgent, and TikTokSpider
This commit is contained in:
parent
774b1ddf52
commit
33c5ce1326
1 changed file with 36 additions and 1 deletion
37
robots.json
37
robots.json
|
@ -13,6 +13,13 @@
|
|||
"operator": "[Ai2](https://allenai.org/crawler)",
|
||||
"respect": "Yes"
|
||||
},
|
||||
"aiHitBot": {
|
||||
"operator": "[aiHit](https://www.aihitdata.com/about)",
|
||||
"respect": "Yes",
|
||||
"function": "A massive, artificial intelligence/machine learning, automated system.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data for AI systems."
|
||||
},
|
||||
"Amazonbot": {
|
||||
"operator": "Amazon",
|
||||
"respect": "Yes",
|
||||
|
@ -97,6 +104,13 @@
|
|||
"frequency": "Unclear at this time.",
|
||||
"description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler"
|
||||
},
|
||||
"Cotoyogi": {
|
||||
"operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)",
|
||||
"respect": "Yes",
|
||||
"function": "AI LLM Scraper.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data for AI training in the Japanese language."
|
||||
},
|
||||
"Crawlspace": {
|
||||
"operator": "[Crawlspace](https://crawlspace.dev)",
|
||||
"respect": "[Yes](https://news.ycombinator.com/item?id=42756654)",
|
||||
|
@ -125,6 +139,20 @@
|
|||
"frequency": "Up to 1 page per second",
|
||||
"description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
|
||||
},
|
||||
"Factset_spyderbot": {
|
||||
"operator": "[Factset](https://www.factset.com/ai)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "AI model training.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data for AI training."
|
||||
},
|
||||
"FirecrawlAgent": {
|
||||
"operator": "[Firecrawl](https://www.firecrawl.dev/)",
|
||||
"respect": "Yes",
|
||||
"function": "AI scraper and LLM training.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data for AI systems and LLM training."
|
||||
},
|
||||
"FriendlyCrawler": {
|
||||
"description": "Unclear who the operator is; but data is used for training/machine learning.",
|
||||
"frequency": "Unclear at this time.",
|
||||
|
@ -321,6 +349,13 @@
|
|||
"operator": "[Sidetrade](https://www.sidetrade.com)",
|
||||
"respect": "Unclear at this time."
|
||||
},
|
||||
"TikTokSpider": {
|
||||
"operator": "ByteDance",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "LLM training.",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Downloads data to train LLMs, as per Bytespider."
|
||||
},
|
||||
"Timpibot": {
|
||||
"operator": "[Timpi](https://timpi.io)",
|
||||
"respect": "Unclear at this time.",
|
||||
|
@ -349,4 +384,4 @@
|
|||
"frequency": "No information.",
|
||||
"description": "Retrieves data used for You.com web search engine and LLMs."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue