add dark visitor workflow

This commit is contained in:
Chenghao Mou 2024-08-06 17:01:56 +01:00
parent e12ddc0f42
commit 192bf67631
2 changed files with 60 additions and 0 deletions

22
.github/workflows/daily_update.yml vendored Normal file
View file

@ -0,0 +1,22 @@
# Runs once a day to pull newly listed agents from Dark Visitors into
# robots.json and commit the result back to the repository.
name: Daily Update from Dark Visitors
on:
  schedule:
    - cron: "0 0 * * *"  # once a day at midnight
# The job pushes a commit, so the workflow token needs write access to the
# repository contents (the default token may be read-only).
permissions:
  contents: write
jobs:
  dark-visitors:
    runs-on: ubuntu-latest
    name: dark-visitors
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - run: |
          pip install beautifulsoup4 requests
          git config --global user.name "dark-visitors"
          git config --global user.email "dark-visitors@users.noreply.github.com"
          python code/dark_visitors.py
          git add -A
          # `git commit` exits non-zero when nothing is staged, which would
          # fail the run on every day without new agents. Only commit and
          # push when the script actually changed something.
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "Daily update from Dark Visitors"
            git push
          fi
        shell: bash

38
code/dark_visitors.py Normal file
View file

@ -0,0 +1,38 @@
import json
from pathlib import Path
import requests
from bs4 import BeautifulSoup
AGENTS_URL = "https://darkvisitors.com/agents"
ROBOTS_JSON = Path("./robots.json")
# Placeholder for the fields the agent listing does not supply.
UNKNOWN = "Unclear at this time."
# Seconds to wait for the listing page before giving up instead of hanging.
REQUEST_TIMEOUT = 30


def placeholder_entry(desc, href):
    """Build the robots.json record for a newly discovered agent.

    Args:
        desc: Short description text scraped from the agent's card.
        href: The ``href`` attribute of the agent's link, appended to the
            listing URL to form the "more info" link.

    Returns:
        A dict with the same keys as the existing robots.json entries; every
        field except ``description`` is set to the "unclear" placeholder.
    """
    # Shape of a fully filled-in entry, for reference:
    # "Claude-Web": {
    #     "operator": "[Anthropic](https:\/\/www.anthropic.com)",
    #     "respect": "Unclear at this time.",
    #     "function": "Scrapes data to train Anthropic's AI products.",
    #     "frequency": "No information. provided.",
    #     "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
    # }
    # NOTE(review): if the scraped href already starts with "/agents", this
    # produces ".../agents/agents/..." - confirm against the live page.
    return {
        "operator": UNKNOWN,
        "respect": UNKNOWN,
        "function": UNKNOWN,
        "frequency": UNKNOWN,
        "description": f"{desc} More info can be found at {AGENTS_URL}{href}",
    }


def fetch_agents_page():
    """Download the agent listing and return it as a parsed document.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never mistaken for an (empty) agent listing.
    """
    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        response = session.get(AGENTS_URL, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")


def merge_new_agents(soup, existing_content):
    """Add a placeholder record for every listed agent not already known.

    Args:
        soup: Parsed listing page.
        existing_content: Mapping of agent name to record; updated in place.

    Returns:
        The number of agents that were added.
    """
    added = 0
    for section in soup.find_all("div", {"class": "agent-links-section"}):
        for agent in section.find_all("a", href=True):
            name_tag = agent.find("div", {"class": "agent-name"})
            desc_tag = agent.find("p")
            if name_tag is None or desc_tag is None:
                # A link inside the section that is not an agent card; skip it
                # rather than crash on the missing element.
                continue
            name = name_tag.get_text().strip()
            if not name:
                continue
            if name in existing_content:
                print(f"{name} already exists in robots.json")
                continue
            existing_content[name] = placeholder_entry(
                desc_tag.get_text().strip(), agent["href"]
            )
            added += 1
    return added


def main():
    """Refresh robots.json with any agents newly listed upstream."""
    # Explicit UTF-8 so the result does not depend on the runner's locale.
    existing_content = json.loads(ROBOTS_JSON.read_text(encoding="utf-8"))
    merge_new_agents(fetch_agents_page(), existing_content)
    ROBOTS_JSON.write_text(
        json.dumps(existing_content, indent=4), encoding="utf-8"
    )


if __name__ == "__main__":
    main()