mirror of https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-04-12 05:57:45 +00:00

restrict scope

commit 2a3685385c (parent 8c6482fb45)
1 changed file with 11 additions and 12 deletions
@@ -13,14 +13,14 @@ to_include = [
     "AI Assistants",
     "AI Data Scrapers",
     "AI Search Crawlers",
-    "Archivers",
-    "Developer Helpers",
-    "Fetchers",
-    "Intelligence Gatherers",
-    "Scrapers",
-    "Search Engine Crawlers",
-    "SEO Crawlers",
-    "Uncategorized",
+    # "Archivers",
+    # "Developer Helpers",
+    # "Fetchers",
+    # "Intelligence Gatherers",
+    # "Scrapers",
+    # "Search Engine Crawlers",
+    # "SEO Crawlers",
+    # "Uncategorized",
     "Undocumented AI Agents"
 ]
 
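For orientation, here is a minimal sketch of how the trimmed to_include list is presumably consumed: the scraper walks the agent-links sections and skips any category that is not listed. The h2 category lookup and the continue are assumptions inferred from the hunk context below, not part of this commit.

from bs4 import BeautifulSoup

to_include = [
    "AI Assistants",
    "AI Data Scrapers",
    "AI Search Crawlers",
    "Undocumented AI Agents",
]

# Hypothetical sample markup standing in for the scraped page.
html = '<div class="agent-links-section"><h2>Scrapers</h2></div>'
soup = BeautifulSoup(html, "html.parser")

for section in soup.find_all("div", {"class": "agent-links-section"}):
    # Assumed gate: "Scrapers" is commented out of to_include by this
    # commit, so its whole section would now be skipped.
    category = section.find("h2").get_text().strip()
    if category not in to_include:
        continue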
@@ -29,8 +29,7 @@ for section in soup.find_all("div", {"class": "agent-links-section"}):
     for agent in section.find_all("a", href=True):
         name = agent.find("div", {"class": "agent-name"}).get_text().strip()
         desc = agent.find("p").get_text().strip()
 
-        # TODO: there seems to be a typo?
         default_values = {
             "Unclear at this time.",
             "No information. provided.",
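The deleted TODO presumably referred to the stray period in "No information. provided." just above. The comment goes away here while the string itself stays as-is, likely because these defaults must match the text the scrape actually returns.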
@@ -39,6 +38,7 @@ for section in soup.find_all("div", {"class": "agent-links-section"}):
         }
         default_value = "Unclear at this time."
 
+        # Parse the operator information from the description if possible
         operator = default_value
         if "operated by " in desc:
             try:
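The body of the try block falls between these two hunks and is not shown, so the exact parsing is not visible here. A plausible sketch consistent with the new comment and the "operated by " guard (the split logic is an assumption, not the committed code):

desc = "Example agent, operated by ExampleCorp. It crawls pages on demand."  # invented sample
default_value = "Unclear at this time."

operator = default_value
if "operated by " in desc:
    try:
        # Assumed parse: text after "operated by ", up to the next period.
        operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
    except Exception as e:
        print(f"Error: {e}")

print(operator)  # -> ExampleCorp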
@@ -46,7 +46,6 @@ for section in soup.find_all("div", {"class": "agent-links-section"}):
             except Exception as e:
                 print(f"Error: {e}")
 
-
         def consolidate(field: str, value: str) -> str:
             # New entry
             if name not in existing_content:
@@ -55,7 +54,7 @@ for section in soup.find_all("div", {"class": "agent-links-section"}):
             if field not in existing_content[name]:
                 return value
             # Unclear value
-            if existing_content[name][field] in default_values:
+            if existing_content[name][field] in default_values and value not in default_values:
                 return value
             # Existing value
             return existing_content[name][field]
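The one behavioral change in consolidate: previously a stored placeholder was overwritten by whatever the scrape returned, even if that was itself just another placeholder; with the added "and value not in default_values" guard, a placeholder is only replaced by real information. A self-contained illustration (the sample data is invented; the function mirrors the shape shown in the diff):

default_values = {"Unclear at this time.", "No information. provided."}
existing_content = {"ExampleBot": {"operator": "Unclear at this time."}}
name = "ExampleBot"

def consolidate(field: str, value: str) -> str:
    # New entry
    if name not in existing_content:
        return value
    # New field
    if field not in existing_content[name]:
        return value
    # Unclear value: only replace a placeholder with a non-placeholder
    if existing_content[name][field] in default_values and value not in default_values:
        return value
    # Existing value
    return existing_content[name][field]

print(consolidate("operator", "No information. provided."))  # placeholder kept: Unclear at this time.
print(consolidate("operator", "ExampleCorp"))                # real value wins: ExampleCorp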