diff --git a/.github/workflows/daily_update.yml b/.github/workflows/daily_update.yml
new file mode 100644
index 0000000..1e36f7b
--- /dev/null
+++ b/.github/workflows/daily_update.yml
@@ -0,0 +1,30 @@
+name: Daily Update from Dark Visitors
+# Runs once a day and commits any newly discovered agents to the repository.
+on:
+  schedule:
+    - cron: "0 0 * * *"
+
+jobs:
+  dark-visitors:
+    runs-on: ubuntu-latest
+    name: dark-visitors
+    # The job pushes a commit, so the workflow token needs write access.
+    permissions:
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+      - run: |
+          pip install beautifulsoup4 requests
+          git config --global user.name "dark-visitors"
+          git config --global user.email "dark-visitors@users.noreply.github.com"
+          python code/dark_visitors.py
+          git add -A
+          # `git commit` exits non-zero when nothing is staged; only commit
+          # and push when the scrape actually changed something.
+          if ! git diff --cached --quiet; then
+            git commit -m "Daily update from Dark Visitors"
+            git push
+          fi
+        shell: bash
\ No newline at end of file
diff --git a/code/dark_visitors.py b/code/dark_visitors.py
new file mode 100644
index 0000000..01965b9
--- /dev/null
+++ b/code/dark_visitors.py
@@ -0,0 +1,59 @@
+"""Add agents listed on Dark Visitors to robots.json.
+
+Fetches the Dark Visitors agent list and adds a placeholder entry to
+./robots.json for every agent name that is not already a key in that file.
+Entries that already exist are left untouched.
+"""
+import json
+from pathlib import Path
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+AGENTS_URL = "https://darkvisitors.com/agents"
+ROBOTS_JSON = Path("./robots.json")
+
+session = requests.Session()
+# Bound the request so a hung connection cannot stall the scheduled job, and
+# fail on an HTTP error status instead of parsing an error page.
+response = session.get(AGENTS_URL, timeout=30)
+response.raise_for_status()
+soup = BeautifulSoup(response.text, "html.parser")
+
+existing_content = json.loads(ROBOTS_JSON.read_text(encoding="utf-8"))
+
+for section in soup.find_all("div", {"class": "agent-links-section"}):
+    for agent in section.find_all("a", href=True):
+        name_tag = agent.find("div", {"class": "agent-name"})
+        desc_tag = agent.find("p")
+        if name_tag is None or desc_tag is None:
+            # Not an agent card (or the markup changed): skip the link
+            # instead of crashing on a missing element.
+            continue
+        name = name_tag.get_text().strip()
+        desc = desc_tag.get_text().strip()
+
+        if name in existing_content:
+            print(f"{name} already exists in robots.json")
+            continue
+        # Template:
+        # "Claude-Web": {
+        #     "operator": "[Anthropic](https:\/\/www.anthropic.com)",
+        #     "respect": "Unclear at this time.",
+        #     "function": "Scrapes data to train Anthropic's AI products.",
+        #     "frequency": "No information. provided.",
+        #     "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
+        # }
+        # Resolve the href against the page it was found on, as a browser
+        # would, rather than appending it to the page URL.
+        info_url = urljoin(AGENTS_URL, agent["href"])
+        existing_content[name] = {
+            "operator": "Unclear at this time.",
+            "respect": "Unclear at this time.",
+            "function": "Unclear at this time.",
+            "frequency": "Unclear at this time.",
+            "description": f"{desc} More info can be found at {info_url}",
+        }
+
+ROBOTS_JSON.write_text(json.dumps(existing_content, indent=4), encoding="utf-8")
\ No newline at end of file