mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-04 11:03:59 +00:00
Adding GitHub Action
This commit is contained in:
parent
17a84f2c2d
commit
1fdc79dacb
3 changed files with 244 additions and 0 deletions
25
.github/workflows/main.yml
vendored
Normal file
25
.github/workflows/main.yml
vendored
Normal file
|
@ -0,0 +1,25 @@
|
|||
# Regenerates robots.txt and table-of-bot-metrics.md from robots.json on every
# push and commits the regenerated files back to the repository.
on: [push]

jobs:
  ai-robots-txt:
    runs-on: ubuntu-latest
    name: ai-robots-txt
    # The job pushes commits with the workflow token, so it needs write access
    # to the repository contents.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - env:
          # The commit message is attacker-controllable text. It is handed to
          # the script through the environment rather than being expanded
          # inline, so shell metacharacters in it are never executed.
          COMMIT_MESSAGE: ${{ github.event.head_commit.message }}
        run: |
          git config --global user.name "ai.robots.txt"
          git config --global user.email "ai.robots.txt@users.noreply.github.com"

          # Drop the previously generated files; tolerate them being absent.
          git rm --ignore-unmatch robots.txt table-of-bot-metrics.md
          git add -A
          # Committing with nothing staged would fail the step, so guard it.
          if ! git diff --cached --quiet; then
            git commit -m "Removing previously generated files"
            git push
          fi

          # Exits non-zero (and stops this step) when robots.json is unusable.
          php -f code/action.php

          git add -A
          if ! git diff --cached --quiet; then
            # Fall back to a fixed message when the push carried no head commit message.
            git commit -m "${COMMIT_MESSAGE:-Regenerating robots.txt and table-of-bot-metrics.md}"
            git push
          fi
        shell: bash
|
28
code/action.php
Normal file
28
code/action.php
Normal file
|
@ -0,0 +1,28 @@
|
|||
<?php

/*

This script processes updates to the https://github.com/ai-robots-txt/ai.robots.txt repository.

It reads robots.json from the current working directory and generates:

- robots.txt
- table-of-bot-metrics.md

robots.json is expected to be a JSON object that maps each user agent name to an
object with the keys "operator", "respect", "function", "frequency" and
"description".

If robots.json cannot be read or decoded, or an output file cannot be written,
the script prints a message to STDERR and exits with status 1 without writing
any further output, so that a broken robots.txt is never generated.

*/

/**
 * Report a fatal problem on STDERR and terminate with exit status 1.
 *
 * @param string $message Human-readable description of what went wrong.
 */
function abort_generation(string $message): void
{
    fwrite(STDERR, 'action.php: '.$message."\n");
    exit(1);
}

/**
 * Make a value safe to embed in a single Markdown table cell.
 *
 * Pipes are escaped and line breaks are collapsed to spaces, because either
 * one would otherwise split the table row. Non-scalar values become an empty
 * cell.
 *
 * @param mixed $value Raw value taken from robots.json.
 *
 * @return string Text that can be placed between two cell delimiters.
 */
function table_cell($value): string
{
    if (!is_scalar($value)) {
        return '';
    }

    return str_replace(['|', "\r\n", "\n", "\r"], ['\|', ' ', ' ', ' '], (string) $value);
}

$json = file_get_contents('robots.json');
if ($json === false) {
    abort_generation('unable to read robots.json');
}

$robots = json_decode($json, true);
if (!is_array($robots)) {
    abort_generation('robots.json is not a valid JSON object: '.json_last_error_msg());
}
if ($robots === []) {
    // A lone "Disallow: /" without any User-agent line is not a valid robots.txt group.
    abort_generation('robots.json does not list any robots');
}

$robots_txt = '';
$robots_table = '| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |'."\n";
$robots_table .= '|-----|----------|-----------------------|----------|------------------|-------------|'."\n";

foreach ($robots as $robot => $details) {
    if (!is_array($details)) {
        abort_generation('entry "'.$robot.'" in robots.json is not an object');
    }

    $robots_txt .= 'User-agent: '.$robot."\n";

    // Column order must match the table header above; a missing key yields an empty cell.
    $cells = [$robot];
    foreach (['operator', 'respect', 'function', 'frequency', 'description'] as $field) {
        $cells[] = $details[$field] ?? '';
    }

    $robots_table .= '| '.implode(' | ', array_map('table_cell', $cells)).' | '."\n";
}

// One shared rule applies to every user agent listed above.
$robots_txt .= 'Disallow: /';

if (file_put_contents('robots.txt', $robots_txt) === false) {
    abort_generation('unable to write robots.txt');
}

if (file_put_contents('table-of-bot-metrics.md', $robots_table) === false) {
    abort_generation('unable to write table-of-bot-metrics.md');
}
|
191
robots.json
Normal file
191
robots.json
Normal file
|
@ -0,0 +1,191 @@
|
|||
{
|
||||
"Amazonbot": {
|
||||
"operator": "Amazon",
|
||||
"respect": "Yes",
|
||||
"function": "Service improvement and enabling answers for Alexa users.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses."
|
||||
},
|
||||
"anthropic-ai": {
|
||||
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data to train Anthropic's AI products.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
|
||||
},
|
||||
"Applebot-Extended": {
|
||||
"operator": "[Apple](https:\/\/support.apple.com\/en-us\/119829#datausage)",
|
||||
"respect": "Yes",
|
||||
"function": "Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others.",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools."
|
||||
},
|
||||
"Bytespider": {
|
||||
"operator": "ByteDance",
|
||||
"respect": "No",
|
||||
"function": "LLM training.",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Downloads data to train LLMs, including ChatGPT competitors."
|
||||
},
|
||||
"CCBot": {
|
||||
"operator": "[Common Crawl](https:\/\/commoncrawl.org)",
|
||||
"respect": "[Yes](https:\/\/commoncrawl.org\/ccbot)",
|
||||
"function": "Provides crawl data for an open source repository that has been used to train LLMs.",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Sources data that is made openly available and is used to train AI models."
|
||||
},
|
||||
"ChatGPT-User": {
|
||||
"operator": "[OpenAI](https:\/\/openai.com)",
|
||||
"respect": "Yes",
|
||||
"function": "Takes action based on user prompts.",
|
||||
"frequency": "Only when prompted by a user.",
|
||||
"description": "Used by plugins in ChatGPT to answer queries based on user input."
|
||||
},
|
||||
"ClaudeBot": {
|
||||
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data to train Anthropic's AI products.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
|
||||
},
|
||||
"Claude-Web": {
|
||||
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data to train Anthropic's AI products.",
|
||||
"frequency": "No information provided.",
|
||||
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
|
||||
},
|
||||
"cohere-ai": {
|
||||
"operator": "[Cohere](https:\/\/cohere.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Retrieves data to provide responses to user-initiated prompts.",
|
||||
"frequency": "Takes action based on user prompts.",
|
||||
"description": "Retrieves data based on user prompts."
|
||||
},
|
||||
"Diffbot": {
|
||||
"operator": "[Diffbot](https:\/\/www.diffbot.com\/)",
|
||||
"respect": "At the discretion of Diffbot users.",
|
||||
"function": "Aggregates structured web data for monitoring and AI model training.",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training."
|
||||
},
|
||||
"FacebookBot": {
|
||||
"operator": "Meta\/Facebook",
|
||||
"respect": "[Yes](https:\/\/developers.facebook.com\/docs\/sharing\/bot\/)",
|
||||
"function": "Training language models",
|
||||
"frequency": "Up to 1 page per second",
|
||||
"description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
|
||||
},
|
||||
"Google-Extended": {
|
||||
"operator": "Google",
|
||||
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
|
||||
"function": "LLM training.",
|
||||
"frequency": "No information.",
|
||||
"description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search."
|
||||
},
|
||||
"GoogleOther": {
|
||||
"operator": "Google",
|
||||
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
|
||||
"function": "Scrapes data.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
|
||||
},
|
||||
"GoogleOther-Image": {
|
||||
"operator": "Google",
|
||||
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
|
||||
"function": "Scrapes data.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
|
||||
},
|
||||
"GoogleOther-Video": {
|
||||
"operator": "Google",
|
||||
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
|
||||
"function": "Scrapes data.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
|
||||
},
|
||||
"GPTBot": {
|
||||
"operator": "[OpenAI](https:\/\/openai.com)",
|
||||
"respect": "Yes",
|
||||
"function": "Scrapes data to train OpenAI's products.",
|
||||
"frequency": "No information.",
|
||||
"description": "Data is used to train current and future models; paywalled data, PII, and data that violates the company's policies are removed."
|
||||
},
|
||||
"img2dataset": {
|
||||
"operator": "[img2dataset](https:\/\/github.com\/rom1504\/img2dataset)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes images for use in LLMs.",
|
||||
"frequency": "At the discretion of img2dataset users.",
|
||||
"description": "Downloads large sets of images into datasets for LLM training or other purposes."
|
||||
},
|
||||
"Meta-ExternalAgent": {
|
||||
"operator": "[Meta](https:\/\/developers.facebook.com\/docs\/sharing\/webmasters\/web-crawlers)",
|
||||
"respect": "Yes.",
|
||||
"function": "Used to train models and improve products.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
|
||||
},
|
||||
"OAI-SearchBot": {
|
||||
"operator": "[OpenAI](https:\/\/openai.com)",
|
||||
"respect": "[Yes](https:\/\/platform.openai.com\/docs\/bots)",
|
||||
"function": "Search result generation.",
|
||||
"frequency": "No information.",
|
||||
"description": "Crawls sites to surface as results in SearchGPT."
|
||||
},
|
||||
"omgili": {
|
||||
"operator": "[Webz.io](https:\/\/webz.io\/)",
|
||||
"respect": "[Yes](https:\/\/webz.io\/blog\/web-data\/what-is-the-omgili-bot-and-why-is-it-crawling-your-website\/)",
|
||||
"function": "Data is sold.",
|
||||
"frequency": "No information.",
|
||||
"description": "Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training."
|
||||
},
|
||||
"omgilibot": {
|
||||
"operator": "[Webz.io](https:\/\/webz.io\/)",
|
||||
"respect": "[Yes](https:\/\/web.archive.org\/web\/20170704003301\/http:\/\/omgili.com\/Crawler.html)",
|
||||
"function": "Data is sold.",
|
||||
"frequency": "No information.",
|
||||
"description": "Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io."
|
||||
},
|
||||
"PerplexityBot": {
|
||||
"operator": "[Perplexity](https:\/\/www.perplexity.ai\/)",
|
||||
"respect": "[No](https:\/\/www.macstories.net\/stories\/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler\/)",
|
||||
"function": "Used to answer queries at the request of users.",
|
||||
"frequency": "Takes action based on user prompts.",
|
||||
"description": "Operated by Perplexity to obtain results in response to user queries."
|
||||
},
|
||||
"Scrapy": {
|
||||
"operator": "[Zyte](https:\/\/www.zyte.com)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data for a variety of uses including training AI.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\""
|
||||
},
|
||||
"Timpibot": {
|
||||
"operator": "[Timpi](https:\/\/timpi.io)",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "Scrapes data for use in training LLMs.",
|
||||
"frequency": "No information.",
|
||||
"description": "Makes data available for training AI models."
|
||||
},
|
||||
"VelenPublicWebCrawler": {
|
||||
"operator": "[Velen Crawler](https:\/\/velen.io)",
|
||||
"respect": "[Yes](https:\/\/velen.io)",
|
||||
"function": "Scrapes data for business data sets and machine learning models.",
|
||||
"frequency": "No information.",
|
||||
"description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\""
|
||||
},
|
||||
"YouBot": {
|
||||
"operator": "[You](https:\/\/about.you.com\/youchat\/)",
|
||||
"respect": "[Yes](https:\/\/about.you.com\/youbot\/)",
|
||||
"function": "Scrapes data for search engine and LLMs.",
|
||||
"frequency": "No information.",
|
||||
"description": "Retrieves data used for You.com web search engine and LLMs."
|
||||
},
|
||||
"TestBot2": {
|
||||
"operator": "Testing operator",
|
||||
"respect": "Testing respect",
|
||||
"function": "Testing function",
|
||||
"frequency": "Testing frequency",
|
||||
"description": "Testing description"
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue