pool-publication-page/project.py
Michał Szczepanik 339c3e1d70 Figure out project association status
We want the website to split contributors into current and former. We
implement the logic in Python and only add simple information to the
Hugo template.
2026-04-17 13:43:38 +02:00

288 lines
9.4 KiB
Python

"""Process project records into project pages
Needs the associated_with property on a project to be inlined to
correctly assign contributors and sites. Needs the (inverse of)
influenced_by field from the umbrella project to be joined to the
record (as "x_had_influence") to correctly assign roles.
"""
import json
from pathlib import Path
import re
import textwrap
import click
import pendulum
import yaml
# Maps the ROR PID of an organization to the site label that is written
# into the "sites" list of a page's front matter. Organizations whose
# PID is not listed here never show up as a site.
SITE_DICT = {
    "ror:04xfq0f34": "aachen",  # RWTH Aachen
    "ror:04cvxnb49": "frankfurt",  # Goethe University Frankfurt
    "ror:038t36y30": "heidelberg",  # Heidelberg University
    "ror:02nv7yv05": "juelich",  # Forschungszentrum Jülich
    "ror:023b0x485": "mainz",  # Johannes Gutenberg University Mainz
    "ror:01hynnt93": "mannheim",  # CIMH Mannheim
    "ror:00fbnyb24": "wuerzburg",  # University of Würzburg
}
def format_title(d: dict) -> str | None:
"""Return formatted title
Combines short name and title if possible. Makes do with what's
available otherwise.
"""
if "short_name" in d and "title" in d:
res = f"{d['short_name']}: {d['title']}"
elif "title" in d:
res = d["title"]
elif "short_name" in d:
res = d["short_name"]
else:
res = None
return res
def get_contributors_and_sites(d: dict) -> tuple[list[str], list[str]]:
    """Collect contributor and site labels from "associated_with"

    A person counts as a contributor (whatever the roles) if its PID
    lies in the trr379root:contributors/ namespace. An organization
    counts as a site if it is associated in the marcrel:sht role and
    its PID is a key of SITE_DICT. Associations whose object is not an
    inlined record are ignored.

    Returns (contributors, sites), each sorted and free of duplicates.
    """
    person_pid = re.compile(r"trr379root:contributors/([\w\-]+)")
    contributor_labels = set()
    site_labels = set()

    for assoc in d.get("associated_with", []):
        target = assoc.get("object")
        # roles may be given as plain PIDs or as inlined objects
        role_pids = {
            role if isinstance(role, str) else role.get("pid")
            for role in assoc.get("roles", [])
        }
        if not isinstance(target, dict):
            # only an inlined object carries the schema type we dispatch on
            continue

        schema_type = target.get("schema_type")
        if schema_type == "trr379ri:TRR379Organization" and "marcrel:sht" in role_pids:
            site = SITE_DICT.get(target["pid"])
            if site is not None:
                site_labels.add(site)
        elif schema_type == "trr379ri:TRR379Person":
            found = person_pid.match(target["pid"])
            if found is not None:
                contributor_labels.add(found.group(1))

    return sorted(contributor_labels), sorted(site_labels)
def get_detailed_contributors(d: dict) -> tuple[list[str], list[str]]:
    """Split associated persons into current and past contributors

    Associations from "associated_with" are grouped by the person they
    point to. Only inlined objects of type trr379ri:TRR379Person with a
    PID in the trr379root:contributors/ namespace are considered;
    associations with a missing or non-inlined object are skipped, the
    same way get_contributors_and_sites() skips them. A person is
    current if is_active_contributor() says so for their associations.

    Returns (current, past). Each is a sorted list of labels of the
    form "contributors/<name>". The input record is not modified.
    """
    ctb_pat = re.compile(r"trr379root:(contributors/[\w\-]+)")
    contributors = {}

    # group associations by their objects
    for association in d.get("associated_with", []):
        # work on a copy, so that popping leaves the record intact
        ass = association.copy()
        obj = ass.pop("object", None)
        if not isinstance(obj, dict):
            # silently demand inlined input so we can check schema type
            continue
        if obj.get("schema_type") != "trr379ri:TRR379Person":
            continue
        if (m := ctb_pat.match(obj["pid"])) is not None:
            contributors.setdefault(m.group(1), []).append(ass)

    # divide contributors into current and past
    current_contributors = []
    past_contributors = []
    for label in sorted(contributors):
        if is_active_contributor(contributors[label]):
            current_contributors.append(label)
        else:
            past_contributors.append(label)
    return current_contributors, past_contributors
def is_active_contributor(associations):
    """Tell whether at least one of the associations is still ongoing

    An association without an "ended" entry is ongoing. One with an
    "ended" entry is ongoing only if its "at_time" lies in the future;
    a missing "at_time" is treated as 1970-01-01, i.e. as ended (maybe
    someone only recorded location). All associations are inspected,
    and an empty collection gives False.
    """
    ongoing_seen = False
    for ass in associations:
        if "ended" not in ass:
            ongoing_seen = True
            continue
        end_time = pendulum.parse(ass["ended"].get("at_time", "1970-01-01"))
        if not end_time.is_past():
            ongoing_seen = True
    return ongoing_seen
def get_topics(d: dict) -> list[str]:
    """Collect topic labels from the "about" property

    Entries may be plain PIDs or inlined objects with a "pid" key.
    Only PIDs in the trr379root:topics/ namespace contribute a label.
    The order of the record is kept and duplicates are not removed.
    """
    topic_pid = re.compile(r"trr379root:topics/([\w\-]+)")
    pids = (
        entry if isinstance(entry, str) else entry.get("pid")
        for entry in d.get("about", [])
    )
    return [
        found.group(1)
        for pid in pids
        if (found := topic_pid.match(pid)) is not None
    ]
def get_roles(d: dict) -> list[str]:
    """Get roles of the project

    The roles are read from the "x_had_influence" property, which is
    expected to hold the associations from the umbrella project's
    "influenced_by" that concern this project. The object of an
    influence is not checked. A role may be a plain PID or an inlined
    object with a "pid" key; only PIDs in the trr379root:roles/
    namespace contribute a label.

    Returns the sorted, de-duplicated role labels.
    """
    role_pat = re.compile(r"trr379root:roles/([\w\-]+)")
    role_pids = (
        role if isinstance(role, str) else role.get("pid")
        for influence in d.get("x_had_influence", [])
        for role in influence.get("roles", [])
    )
    labels = {
        found.group(1)
        for pid in role_pids
        if (found := role_pat.match(pid)) is not None
    }
    return sorted(labels)
def get_weight(d: dict) -> int | None:
"""Return sorting weight, following existing convention"""
pat = re.compile(r"trr379root:projects/([a-cq])(\d+)")
major_weights = {"a": 1000, "b": 2000, "c": 3000, "q": 4000}
if (m := re.match(pat, d["pid"])) is not None:
return major_weights[m.group(1)] + (int(m.group(2)) - 1) * 10
def pid_of(x: str | dict, label_only: bool = False) -> str:
pid = x if isinstance(x, str) else x["pid"]
if label_only:
return pid.split(":", maxsplit=1)[-1]
return pid
def read_markdown_content(p: Path) -> tuple[dict | None, str | None]:
"""Read content from a markdown page with a yaml front matter"""
with p.open() as fp:
lines = fp.readlines()
fence_loc = [i for i, line in enumerate(lines) if line.rstrip() == "---"]
if len(fence_loc) >= 2:
header_str = "".join(lines[fence_loc[0]+1:fence_loc[1]])
content = "".join(lines[fence_loc[1]+1:])
try:
header = yaml.safe_load(header_str)
except yaml.YAMLError:
header = None
else:
header = None
content = None
return header, content
def write_page(p: Path, fm: dict, content: str | None, sep=True):
"""Write a markdown page with yaml front matter"""
with p.open("wt") as fp:
fp.write("---\n")
yaml.dump(fm, stream=fp, allow_unicode=True, sort_keys=False)
fp.write("---\n")
if content is not None:
if sep:
fp.write("\n")
fp.write(content)
# Command line entry point: reads project records as JSON lines from
# INPUT and writes, for each project, <OUTDIR>/<label>/_index.md and
# <OUTDIR>/<label>/_index.de.md.
@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    project_pid = re.compile(r"(https://trr379\.de/|trr379root:)projects/([\w\-]+)")

    for record in input:
        project = json.loads(record)
        found = project_pid.match(project["pid"])
        if found is None:
            # stick to trr379root:projects/ namespace (excludes root project)
            continue
        label = found.group(2)

        # assemble the front matter; only non-empty properties are
        # included, and insertion order is the order in the page
        header = {}
        if (title := format_title(project)) is not None:
            header["title"] = title
        contributors, sites = get_contributors_and_sites(project)
        if contributors:
            header["contributors"] = contributors
        if sites:
            header["sites"] = sites
        if topics := get_topics(project):
            header["topics"] = topics
        if roles := get_roles(project):
            header["roles"] = roles
        if (weight := get_weight(project)) is not None:
            header["weight"] = weight

        current, past = get_detailed_contributors(project)
        params = {}
        if current:
            params["current_contributors"] = current
        if past:
            params["past_contributors"] = past
        if params:
            header["params"] = params

        # prepare for writing markdown page(s)
        page_dir = outdir / label
        if not page_dir.is_dir():
            page_dir.mkdir()
        page_en = page_dir / "_index.md"

        # write default (English) page
        description = project.get("description")
        if description is None:
            body = None
        else:
            body = textwrap.fill(description, width=80, break_long_words=False)
        write_page(page_en, header, body)

        # for German, keep description and title from the old file, update header
        # (until internationalization is solved in the Pool)
        page_de = page_en.with_suffix(".de.md")
        old_header = old_body = None
        if page_de.exists():
            old_header, old_body = read_markdown_content(page_de)
        header_de = header.copy()
        if old_header is not None and "title" in old_header:
            header_de["title"] = old_header["title"]
        write_page(page_de, header_de, old_body, sep=False)


if __name__ == "__main__":
    main()