diff --git a/.github/workflows/daily_update.yml b/.github/workflows/daily_update.yml index 1e36f7b..28f777f 100644 --- a/.github/workflows/daily_update.yml +++ b/.github/workflows/daily_update.yml @@ -1,7 +1,7 @@ name: Daily Update from Dark Visitors on: schedule: - - cron: "0 0 * * *" + - cron: "*/10 * * * *" jobs: dark-visitors: diff --git a/code/dark_visitors.py b/code/dark_visitors.py index c7d11dc..2a84d58 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -9,7 +9,6 @@ response = session.get("https://darkvisitors.com/agents") soup = BeautifulSoup(response.text, "html.parser") existing_content = json.loads(Path("./robots.json").read_text()) -added = 0 to_include = [ "AI Assistants", "AI Data Scrapers", @@ -30,25 +29,44 @@ for section in soup.find_all("div", {"class": "agent-links-section"}): for agent in section.find_all("a", href=True): name = agent.find("div", {"class": "agent-name"}).get_text().strip() desc = agent.find("p").get_text().strip() - - if name in existing_content: - continue - # Template: - # "Claude-Web": { - # "operator": "[Anthropic](https:\/\/www.anthropic.com)", - # "respect": "Unclear at this time.", - # "function": "Scrapes data to train Anthropic's AI products.", - # "frequency": "No information. provided.", - # "description": "Scrapes data to train LLMs and AI products offered by Anthropic." - # } - existing_content[name] = { - "operator": "Unclear at this time.", - "respect": "Unclear at this time.", - "function": f"{category}", - "frequency": "Unclear at this time.", - "description": f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}" - } - added += 1 -print(f"Added {added} new agents, total is now {len(existing_content)}") + # TODO: there seems to be a typo? + default_values = { + "Unclear at this time.", + "No information. provided.", + "No information.", + "No explicit frequency provided." + } + default_value = "Unclear at this time." + + operator = default_value + if "operated by " in desc: + try: + operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip() + except Exception as e: + print(f"Error: {e}") + + + def consolidate(field: str, value: str) -> str: + # New entry + if name not in existing_content: + return value + # New field + if field not in existing_content[name]: + return value + # Unclear value + if existing_content[name][field] in default_values: + return value + # Existing value + return existing_content[name][field] + + existing_content[name] = { + "operator": consolidate("operator", operator), + "respect": consolidate("respect", default_value), + "function": consolidate("function", f"{category}"), + "frequency": consolidate("frequency", default_value), + "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}") + } + +print(f"Total: {len(existing_content)}") Path("./robots.json").write_text(json.dumps(existing_content, indent=4)) \ No newline at end of file