pool-publication-page/person.py
Michał Szczepanik 339c3e1d70 Figure out project association status
We want the website to split contributors into current and former. We
implement the logic in Python and only add simple information to the
Hugo template.
2026-04-17 13:43:38 +02:00

237 lines
7.5 KiB
Python

import json
from pathlib import Path
import re
import textwrap
import click
import pendulum
import yaml
def get_formatted_name(person: dict) -> str:
    """Return the display name of a person record.

    An explicit ``formatted_name`` is returned verbatim whenever it is
    present and not ``None``. Otherwise the given name, any additional
    names and the family name are joined with single spaces, leaving out
    the parts that are missing (``None``).
    """
    preset = person.get("formatted_name")
    if preset is not None:
        return preset

    candidates = [person.get("given_name")]
    candidates.extend(person.get("additional_names", []))
    candidates.append(person.get("family_name"))
    return " ".join(name for name in candidates if name is not None)
def get_sortable_name(person: dict) -> str:
    """Return a ``lastname;firstname;...`` key suitable for sorting.

    The component order is inspired by the vCard N property (RFC 6350,
    6.2.2): family name, given name, comma-joined additional names,
    honorific prefix, honorific suffix. Since the value is only used for
    sorting, leading and trailing semicolons are stripped and runs of
    semicolons are collapsed into a single one.
    """
    fields = [person.get(key, "") for key in ("family_name", "given_name")]
    fields.append(",".join(person.get("additional_names", [])))
    fields += [
        person.get(key, "")
        for key in ("honorific_name_prefix", "honorific_name_suffix")
    ]

    joined = ";".join(fields)
    trimmed = joined.strip(";")
    return re.sub(r";{2,}", ";", trimmed)
def process_affiliation(person: dict) -> str:
    """Build an affiliation string from the person's delegations.

    Our data model has no dedicated notion of affiliation. Instead, the
    names of the organizations found under ``delegated_by`` are joined
    with "; " into the "affiliation" string that goes into the website.

    In practice, this currently restricts us to organizations present
    in ROR (institutions rather than labs), but this seems like a good
    start.
    """
    names = []
    for delegation in person.get("delegated_by", []):
        target = delegation.get("object")
        if not isinstance(target, dict):
            # the delegating entity did not get inlined - nothing to read
            continue
        if target.get("schema_type") != "trr379ri:TRR379Organization":
            # only organizations are accepted (not persons); ideally we
            # would check for an Employer role, but this is not
            # available for TRR
            continue
        names.append(target["name"])
    return "; ".join(names)
def process_orcid(person: dict) -> str | None:
"""Return an ORCID from identifiers"""
# TODO: use inlined form
for identifier in person.get("identifiers", []):
if (
identifier.get("schema_type") == "trr379ri:ORCID"
or identifier.get("creator") == "ror:04fa4r544"
):
return identifier.get("notation")
def process_projects(person: dict) -> list[str]:
    """Return a sorted list of projects associated with the person.

    Uses an inverse association, ``x_associated_projects``, which needs
    to be added beforehand (in the data model, projects are associated
    with people, not the other way round).

    Short names are lower-cased and de-duplicated; the umbrella entry
    "TRR379" is left out. Associations whose ``object`` is missing or
    was not inlined (i.e. is not a dict) are skipped, mirroring how
    delegations are handled when building the affiliation string.
    """
    projects = set()
    for assoc in person.get("x_associated_projects", []):
        project = assoc.get("object")
        if not isinstance(project, dict):
            # a bare PID (not inlined) carries no short name to report
            continue
        short_name = project.get("short_name")
        if short_name is not None and short_name != "TRR379":
            projects.add(short_name.lower())
    return sorted(projects)
def process_projects_active(person: dict) -> bool | None:
active = []
for assoc in person.get("x_associated_projects", []):
res = True # in the absence of information, treat association as active
if (
"started" in assoc
and pendulum.parse(
assoc["started"].get("at_time", "1970-01-01")
).is_future()
):
# start is defined, and date is in the future
# (no date: "always has been")
res = False
if (
"ended" in assoc
and pendulum.parse(assoc["ended"].get("at_time", "1970-01-01")).is_past()
):
# ended is defined, and date is in the past (no date: is past)
res = False
active.append(res)
return any(active) if len(active) > 0 else None
def process_roles(person: dict) -> list[str]:
    """Return a sorted list of roles the person has in associated projects.

    Uses an inverse association, ``x_associated_projects``, which needs
    to be added beforehand (in the data model, projects are associated
    with people).

    Works on PIDs because they map better to the website than labels.
    Each role name is reported once, even if several PIDs (both TRR
    namespace spellings, or an external alias) resolve to the same name.
    """
    pids = set()
    for assoc in person.get("x_associated_projects", []):
        pids.update(assoc.get("roles", []))

    pat = re.compile(r"(https://trr379\.de/|trr379root:)roles/([\w\-]+)")
    # we also recognize some external roles, labels don't match website
    ext_defs = {
        "obo:NCIT_C19924": "pi",
        "http://purl.obolibrary.org/obo/NCIT_C19924": "pi",
    }

    # collect into a set: distinct PIDs may map to one and the same role
    roles = set()
    for pid in pids:
        # everything in the TRR namespace should / could correspond to
        # the website
        if (m := pat.match(pid)) is not None:
            roles.add(m.group(2))
        if pid in ext_defs:
            roles.add(ext_defs[pid])
    return sorted(roles)
def process_sites(person: dict) -> list[str]:
    """Return the sorted list of sites the person is related to.

    Reads a custom property, ``x_site``, which needs to be added
    beforehand based on the Person's ``delegated_by``, taking into
    account child / related institutions in ROR. A missing property
    yields an empty list.
    """
    sites = list(person.get("x_site", []))
    sites.sort()
    return sites
def _build_front_matter(person: dict) -> dict:
    """Assemble the Hugo front matter dictionary for one contributor."""
    front_matter = {"title": get_formatted_name(person)}

    # taxonomies are only emitted when they have at least one term
    if short_project_names := process_projects(person):
        front_matter["projects"] = short_project_names
    if sites := process_sites(person):
        front_matter["sites"] = sites
    if roles := process_roles(person):
        front_matter["roles"] = roles

    params = {}
    if (orcid := process_orcid(person)) is not None:
        params["orcid"] = orcid
    if "honorific_name_prefix" in person:
        params["name-title"] = person["honorific_name_prefix"]
    if "honorific_name_suffix" in person:
        # unused in hugo
        # TODO: maybe that should be part of formatted name
        params["name-suffix"] = person["honorific_name_suffix"]
    if sn := get_sortable_name(person):
        params["sortkey"] = sn
    if affiliation := process_affiliation(person):
        params["affiliation"] = affiliation
    if (is_active := process_projects_active(person)) is not None:
        # None means "no associations": leave the flag out entirely
        params["active"] = is_active
    if params:
        front_matter["params"] = params

    front_matter["layout"] = "contributor"
    return front_matter


@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Generate Hugo contributor pages from person records.

    INPUT is a JSON Lines file with one person record per line. For each
    record whose PID is in the TRR379 contributors namespace, a page
    bundle OUTDIR/<label>/_index.md is written, where <label> is the
    last part of the PID. OUTDIR must be an existing directory.
    """
    # use PID to determine target path & decide whether output is needed
    pat = re.compile(r"(https://trr379\.de/|trr379root:)contributors/([\w\-]+)")

    for line in input:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline) in the input
            continue
        person = json.loads(line)

        m = pat.match(person["pid"])
        if m is None:
            # only build pages for trr379root:/contributors prefix
            # other records could include e.g. co-authors with no
            # affiliation with the TRR
            continue
        label = m.group(2)  # last part of pid becomes hugo page bundle name

        # additional filtering could be done here, if needed

        # assemble a markdown page
        front_matter = yaml.dump(
            _build_front_matter(person), allow_unicode=True, sort_keys=False
        )
        page = f"---\n{front_matter}---\n"
        if description := person.get("description", False):
            page += "\n"
            page += textwrap.fill(description, width=80, break_long_words=False)

        # write the markdown file
        out_file = outdir / label / "_index.md"
        out_file.parent.mkdir(exist_ok=True)
        # front matter is dumped with allow_unicode=True, so the encoding
        # must not depend on the locale of the machine running this
        out_file.write_text(page, encoding="utf-8")
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()