mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-04-12 05:57:45 +00:00)
update existing ones

commit 55e92f4324 (parent 52d54cf127)
2 changed files with 40 additions and 22 deletions
.github/workflows/daily_update.yml (vendored, 2 changes)
@@ -1,7 +1,7 @@
 name: Daily Update from Dark Visitors
 on:
   schedule:
-    - cron: "0 0 * * *"
+    - cron: "*/10 * * * *"
 
 jobs:
   dark-visitors:
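The only change to the workflow file itself is the schedule: the job moves from a single run at midnight UTC ("0 0 * * *") to a run every ten minutes ("*/10 * * * *"). Not part of the commit, but as a quick sanity check of a cron expression before committing it, a small sketch using the third-party croniter library (assumed installed, e.g. via pip install croniter) prints the next few firing times:

    # Sketch only: show the next three run times for the old and new schedules.
    # The base timestamp is arbitrary.
    from datetime import datetime
    from croniter import croniter

    base = datetime(2025, 4, 12, 0, 0)
    for expr in ("0 0 * * *", "*/10 * * * *"):
        it = croniter(expr, base)
        runs = [it.get_next(datetime).isoformat(sep=" ") for _ in range(3)]
        print(f"{expr!r}: {runs}")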
@@ -9,7 +9,6 @@ response = session.get("https://darkvisitors.com/agents")
 soup = BeautifulSoup(response.text, "html.parser")
 
 existing_content = json.loads(Path("./robots.json").read_text())
-added = 0
 to_include = [
     "AI Assistants",
     "AI Data Scrapers",
@@ -30,25 +29,44 @@ for section in soup.find_all("div", {"class": "agent-links-section"}):
     for agent in section.find_all("a", href=True):
         name = agent.find("div", {"class": "agent-name"}).get_text().strip()
         desc = agent.find("p").get_text().strip()
 
-        if name in existing_content:
-            continue
-        # Template:
-        # "Claude-Web": {
-        #     "operator": "[Anthropic](https:\/\/www.anthropic.com)",
-        #     "respect": "Unclear at this time.",
-        #     "function": "Scrapes data to train Anthropic's AI products.",
-        #     "frequency": "No information. provided.",
-        #     "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
-        # }
-        existing_content[name] = {
-            "operator": "Unclear at this time.",
-            "respect": "Unclear at this time.",
-            "function": f"{category}",
-            "frequency": "Unclear at this time.",
-            "description": f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}"
-        }
-        added += 1
-
-print(f"Added {added} new agents, total is now {len(existing_content)}")
+        # TODO: there seems to be a typo?
+        default_values = {
+            "Unclear at this time.",
+            "No information. provided.",
+            "No information.",
+            "No explicit frequency provided."
+        }
+        default_value = "Unclear at this time."
+
+        operator = default_value
+        if "operated by " in desc:
+            try:
+                operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
+            except Exception as e:
+                print(f"Error: {e}")
+
+
+        def consolidate(field: str, value: str) -> str:
+            # New entry
+            if name not in existing_content:
+                return value
+            # New field
+            if field not in existing_content[name]:
+                return value
+            # Unclear value
+            if existing_content[name][field] in default_values:
+                return value
+            # Existing value
+            return existing_content[name][field]
+
+        existing_content[name] = {
+            "operator": consolidate("operator", operator),
+            "respect": consolidate("respect", default_value),
+            "function": consolidate("function", f"{category}"),
+            "frequency": consolidate("frequency", default_value),
+            "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}")
+        }
+
+print(f"Total: {len(existing_content)}")
 Path("./robots.json").write_text(json.dumps(existing_content, indent=4))
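The heart of the commit is the new consolidate() helper. The old script skipped any agent already present in robots.json; the rewritten loop rebuilds every entry, keeping an existing field unless it is missing or still one of the placeholder strings in default_values, and it now tries to extract an operator name from descriptions of the form "... operated by X.". A self-contained sketch of that merge rule on toy data (ExampleBot, its URL, and its description are made up for illustration; consolidate lives inside the per-agent loop in the real script):

    # Toy demonstration of the new merge rule; nothing here touches the real robots.json.
    existing_content = {
        "ExampleBot": {
            "operator": "[Example Corp](https://example.com)",  # hand-curated value
            "respect": "Unclear at this time.",                 # placeholder
            "function": "AI Data Scrapers",
        }
    }
    default_values = {
        "Unclear at this time.",
        "No information. provided.",
        "No information.",
        "No explicit frequency provided."
    }
    default_value = "Unclear at this time."

    # Values as they might come out of the Dark Visitors page for this agent.
    name = "ExampleBot"
    desc = "Example crawler operated by Example Corp. It collects pages for AI training."

    operator = default_value
    if "operated by " in desc:
        # Same parsing as the commit: text between "operated by " and the next period.
        operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()

    def consolidate(field: str, value: str) -> str:
        if name not in existing_content:                     # brand-new agent
            return value
        if field not in existing_content[name]:              # field not recorded yet
            return value
        if existing_content[name][field] in default_values:  # only a placeholder so far
            return value
        return existing_content[name][field]                 # keep the curated value

    print(operator)                                  # "Example Corp", parsed from desc
    print(consolidate("operator", operator))         # keeps "[Example Corp](https://example.com)"
    print(consolidate("frequency", default_value))   # "Unclear at this time." (new field)

The net effect is that hand-edited fields in robots.json survive repeated runs, while placeholder values and newly discovered agents are refreshed from Dark Visitors each time the workflow fires.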