We want the website to split contributors into current and former. We implement the logic in Python and only add simple information to the Hugo template.
237 lines
7.5 KiB
Python
237 lines
7.5 KiB
Python
import json
|
|
from pathlib import Path
|
|
import re
|
|
import textwrap
|
|
|
|
import click
|
|
import pendulum
|
|
import yaml
|
|
|
|
|
|
def get_formatted_name(person: dict) -> str:
    """Return the display name of a person

    An explicit ``formatted_name`` takes precedence. Otherwise the name
    is put together from the given name, any additional names and the
    family name (in that order), leaving out parts that are missing.

    """
    preformatted = person.get("formatted_name")
    if preformatted is not None:
        return preformatted

    candidates = [person.get("given_name")]
    candidates.extend(person.get("additional_names", []))
    candidates.append(person.get("family_name"))
    return " ".join(part for part in candidates if part is not None)
|
|
|
|
|
|
def get_sortable_name(person: dict) -> str:
    """Return a semicolon-separated sort key, family name first

    The field order (family name, given name, additional names,
    honorific prefix, honorific suffix) is inspired by vCard (RFC 6350,
    6.2.2). Since the value is only used for sorting, separators left
    over from empty fields are dropped: leading and trailing semicolons
    are stripped and runs of semicolons are collapsed into one.

    """
    middle_names = ",".join(person.get("additional_names", []))
    fields = (
        person.get("family_name", ""),
        person.get("given_name", ""),
        middle_names,
        person.get("honorific_name_prefix", ""),
        person.get("honorific_name_suffix", ""),
    )
    trimmed = ";".join(fields).strip(";")
    return re.sub(r";{2,}", ";", trimmed)
|
|
|
|
|
|
def process_affiliation(person: dict) -> str:
    """Return an affiliation string derived from delegations

    The data model has no notion of affiliation. Instead, the names of
    all organizations found in the person's ``delegated_by`` property
    are concatenated (separated by "; ") into an "affiliation" string
    for the website.

    In practice, this currently restricts us to organizations present
    in ROR (institutions rather than labs) but this seems like a good
    start.

    """
    org_type = "trr379ri:TRR379Organization"
    delegators = (d.get("object") for d in person.get("delegated_by", []))
    # Skip objects which did not get inlined (not a dict) and only
    # accept organizations (not persons). Ideally, we would check for
    # an Employer role, but this is not available for TRR.
    return "; ".join(
        obj["name"]
        for obj in delegators
        if isinstance(obj, dict) and obj.get("schema_type") == org_type
    )
|
|
|
|
|
|
def process_orcid(person: dict) -> str | None:
|
|
"""Return an ORCID from identifiers"""
|
|
|
|
# TODO: use inlined form
|
|
for identifier in person.get("identifiers", []):
|
|
if (
|
|
identifier.get("schema_type") == "trr379ri:ORCID"
|
|
or identifier.get("creator") == "ror:04fa4r544"
|
|
):
|
|
return identifier.get("notation")
|
|
|
|
|
|
def process_projects(person: dict) -> list[str]:
    """Return a list of projects associated with the person

    Uses an inverse association which needs to be added (in the data
    model, projects are associated with people).

    The result contains the lower-cased short names of the associated
    projects, de-duplicated and sorted. The umbrella project "TRR379"
    is left out, and so are associations whose project has no short
    name or did not get inlined.

    """
    projects = set()
    for assoc in person.get("x_associated_projects", []):
        project = assoc.get("object")
        if not isinstance(project, dict):
            # missing, or not inlined (e.g. a bare PID string): there is
            # no short name to read, so skip instead of crashing
            continue
        short_name = project.get("short_name")
        if short_name is not None and short_name != "TRR379":
            projects.add(short_name.lower())
    return sorted(projects)
|
|
|
|
|
|
def process_projects_active(person: dict) -> bool | None:
|
|
|
|
active = []
|
|
for assoc in person.get("x_associated_projects", []):
|
|
res = True # in the absence of information, treat association as active
|
|
if (
|
|
"started" in assoc
|
|
and pendulum.parse(
|
|
assoc["started"].get("at_time", "1970-01-01")
|
|
).is_future()
|
|
):
|
|
# start is defined, and date is in the future
|
|
# (no date: "always has been")
|
|
res = False
|
|
if (
|
|
"ended" in assoc
|
|
and pendulum.parse(assoc["ended"].get("at_time", "1970-01-01")).is_past()
|
|
):
|
|
# ended is defined, and date is in the past (no date: is past)
|
|
res = False
|
|
active.append(res)
|
|
|
|
return any(active) if len(active) > 0 else None
|
|
|
|
|
|
def process_roles(person: dict) -> list[str]:
    """Return a list of roles the person has in associated projects

    Uses an inverse association which needs to be added (in the data
    model, projects are associated with people).

    Works on PIDs because they map better to the website than labels.

    The result is sorted and free of duplicates, even when several PIDs
    (e.g. a TRR role and an external definition) map to the same
    website role.

    """
    pids = set()
    for assoc in person.get("x_associated_projects", []):
        pids.update(assoc.get("roles", []))

    pat = re.compile(r"(https://trr379\.de/|trr379root:)roles/([\w\-]+)")
    # everything in the TRR namespace should / could correspond to the website
    roles = {m.group(2) for pid in pids if (m := pat.match(pid)) is not None}
    # we also recognize some external roles, labels don't match website
    ext_defs = {
        "obo:NCIT_C19924": "pi",
        "http://purl.obolibrary.org/obo/NCIT_C19924": "pi",
    }
    roles.update(ext_defs[pid] for pid in pids if pid in ext_defs)
    return sorted(roles)
|
|
|
|
|
|
def process_sites(person: dict) -> list[str]:
    """Return the sorted list of sites the person is related to

    Reads the custom ``x_site`` property, which needs to be added
    beforehand based on the Person's delegated_by, taking into account
    child / related institutions in ROR. The input is not modified.

    """
    sites = list(person.get("x_site", []))
    sites.sort()
    return sites
|
|
|
|
|
|
def _build_front_matter(person: dict) -> dict:
    """Assemble the Hugo front matter for one contributor record"""
    front_matter = {"title": get_formatted_name(person)}

    # optional top-level entries are only written when non-empty
    if short_project_names := process_projects(person):
        front_matter["projects"] = short_project_names
    if sites := process_sites(person):
        front_matter["sites"] = sites
    if roles := process_roles(person):
        front_matter["roles"] = roles

    params = {}
    if (orcid := process_orcid(person)) is not None:
        params["orcid"] = orcid
    if "honorific_name_prefix" in person:
        params["name-title"] = person["honorific_name_prefix"]
    if "honorific_name_suffix" in person:
        # unused in hugo
        # TODO: maybe that should be part of formatted name
        params["name-suffix"] = person["honorific_name_suffix"]
    if sortkey := get_sortable_name(person):
        params["sortkey"] = sortkey
    if affiliation := process_affiliation(person):
        params["affiliation"] = affiliation
    if (is_active := process_projects_active(person)) is not None:
        params["active"] = is_active
    if params:
        front_matter["params"] = params

    front_matter["layout"] = "contributor"
    return front_matter


@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Write a Hugo page bundle for each contributor found in INPUT

    INPUT contains one JSON person record per line. For every record
    whose PID is in the contributors namespace, OUTDIR/<label>/_index.md
    is written, consisting of YAML front matter and, if the record has
    one, the description as page content.

    """
    # NOTE: the parameter name `input` (shadowing the builtin) is fixed
    # by the click argument name above

    # use PID to determine target path & decide whether output is needed
    pat = re.compile(r"(https://trr379\.de/|trr379root:)contributors/([\w\-]+)")

    for line in input:
        if not line.strip():
            # tolerate blank lines instead of failing in the JSON parser
            continue
        person = json.loads(line)

        m = pat.match(person["pid"])
        if m is None:
            # only build pages for trr379root:/contributors prefix
            # other records could include e.g. co-authors with no affiliation with the TRR
            continue
        label = m.group(2)  # last part of pid becomes hugo page bundle name

        # additional filtering could be done here, if needed

        # assemble a markdown page
        front_matter = yaml.dump(
            _build_front_matter(person), allow_unicode=True, sort_keys=False
        )
        page = f"---\n{front_matter}---\n"
        if description := person.get("description", False):
            page += "\n"
            page += textwrap.fill(description, width=80, break_long_words=False)

        # write the markdown file; the front matter may contain non-ASCII
        # characters (allow_unicode=True), so do not rely on the locale encoding
        bundle_dir = outdir / label
        bundle_dir.mkdir(exist_ok=True)
        (bundle_dir / "_index.md").write_text(page, encoding="utf-8")
|
|
|
|
|
|
# run the command line interface only when executed as a script
if __name__ == "__main__":
    main()
|