# pool-publication-page/person.py

import json
from pathlib import Path
import re
import textwrap

import click
import pendulum
import yaml


def get_formatted_name(person: dict) -> str:
    """Return the display name of a person

    An explicit ``formatted_name`` wins whenever it is present (and
    not None). Otherwise the name is assembled from given name,
    additional names and family name, in that order; parts that are
    missing are skipped and the rest is joined with single spaces.

    """
    preformatted = person.get("formatted_name")
    if preformatted is not None:
        return preformatted

    candidates = [person.get("given_name")]
    candidates.extend(person.get("additional_names", []))
    candidates.append(person.get("family_name"))
    return " ".join(name for name in candidates if name is not None)


def get_sortable_name(person: dict) -> str:
    """Build a sort key of the form lastname;firstname;...

    The layout follows the vCard N property (RFC 6350, 6.2.2): family
    name, given name, additional names (comma-separated), honorific
    prefix, honorific suffix. Since the result is only used for
    sorting, empty components are not kept: leading / trailing
    semicolons are stripped and repeated semicolons are collapsed
    into one.

    """
    additional = ",".join(person.get("additional_names", []))
    components = (
        person.get("family_name", ""),
        person.get("given_name", ""),
        additional,
        person.get("honorific_name_prefix", ""),
        person.get("honorific_name_suffix", ""),
    )
    joined = ";".join(components)
    trimmed = joined.strip(";")
    # missing components in the middle leave runs of semicolons behind
    return re.sub(r";{2,}", ";", trimmed)


def process_affiliation(person: dict) -> str:
    """Derive an affiliation string from the person's delegations

    The data model has no affiliation property. Instead, the names of
    all organizations found in ``delegated_by`` are concatenated
    (separated by "; ") into an "affiliation" string for the website.

    In practice, this currently restricts us to organizations present
    in ROR (institutions rather than labs) but this seems like a good
    start.

    """

    def is_inlined_organization(target) -> bool:
        # Objects that did not get inlined are not dicts and are
        # ignored; only organizations (not persons) are accepted.
        # Ideally, we would check for Employer role, but this is not
        # available for TRR.
        return (
            isinstance(target, dict)
            and target.get("schema_type") == "trr379ri:TRR379Organization"
        )

    names = [
        entry["object"]["name"]
        for entry in person.get("delegated_by", [])
        if is_inlined_organization(entry.get("object"))
    ]
    return "; ".join(names)


def process_orcid(person: dict) -> str | None:
    """Return an ORCID from identifiers"""

    # TODO: use inlined form
    for identifier in person.get("identifiers", []):
        if (
            identifier.get("schema_type") == "trr379ri:ORCID"
            or identifier.get("creator") == "ror:04fa4r544"
        ):
            return identifier.get("notation")


def process_projects(person: dict) -> list[str]:
    """Return a list of projects associated with the person

    Uses an inverse association which needs to be added (in the data
    model, projects are associated with people).

    The result contains the lower-cased ``short_name`` of each
    associated project, de-duplicated and sorted. The umbrella
    project ("TRR379") is left out, and so are associations whose
    project object did not get inlined (i.e. is not a dict) or has no
    short name.

    """
    projects = set()
    for assoc in person.get("x_associated_projects", []):
        project = assoc.get("object")
        if not isinstance(project, dict):
            # missing, null, or a bare PID that did not get inlined:
            # there is no short name to read, so skip instead of failing
            continue
        short_name = project.get("short_name")
        if short_name is not None and short_name != "TRR379":
            projects.add(short_name.lower())
    return sorted(projects)


def process_projects_active(person: dict) -> bool | None:

    active = []
    for assoc in person.get("x_associated_projects", []):
        res = True  # in the absence of information, treat association as active
        if (
            "started" in assoc
            and pendulum.parse(
                assoc["started"].get("at_time", "1970-01-01")
            ).is_future()
        ):
            # start is defined, and date is in the future
            # (no date: "always has been")
            res = False
        if (
            "ended" in assoc
            and pendulum.parse(assoc["ended"].get("at_time", "1970-01-01")).is_past()
        ):
            # ended is defined, and date is in the past (no date: is past)
            res = False
        active.append(res)

    return any(active) if len(active) > 0 else None


def process_roles(person: dict) -> list[str]:
    """Return a list of roles the person has in associated projects

    Uses an inverse association which needs to be added (in the data
    model, projects are associated with people).

    Works on PIDs because they map better to the website than labels.

    The returned role names are sorted and unique: different PIDs
    that resolve to the same role (e.g. both spellings of the TRR
    namespace, or an external PID mapped to "pi" next to the TRR "pi"
    role) contribute a single entry.

    """
    pids = {
        role_pid
        for assoc in person.get("x_associated_projects", [])
        for role_pid in assoc.get("roles", [])
    }

    # everything in the TRR namespace should / could correspond to the website
    pat = re.compile(r"(https://trr379\.de/|trr379root:)roles/([\w\-]+)")
    # we also recognize some external roles, labels don't match website
    ext_defs = {
        "obo:NCIT_C19924": "pi",
        "http://purl.obolibrary.org/obo/NCIT_C19924": "pi",
    }

    # collect into a set so that several PIDs for the same role do not
    # produce duplicate entries
    roles = set()
    for pid in pids:
        if (m := pat.match(pid)) is not None:
            roles.add(m.group(2))
        elif pid in ext_defs:
            roles.add(ext_defs[pid])
    return sorted(roles)


def process_sites(person: dict) -> list[str]:
    """Return the person's sites in sorted order

    Reads the custom ``x_site`` property, which needs to be added
    beforehand based on the Person's delegated_by, taking into
    account child / related institutions in ROR. A person without
    that property has no sites.

    """
    sites = list(person.get("x_site", []))
    sites.sort()
    return sites


@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Generate hugo contributor pages from person records.

    INPUT is read line by line; each non-blank line must be a JSON
    document describing one person. For every person whose PID lies
    under the contributors prefix, a page bundle OUTDIR/<label>/_index.md
    is written, containing YAML front matter and (if available) the
    person's description as page content.
    """
    # NOTE: the parameter is named ``input`` (shadowing the builtin)
    # because click passes arguments by their declared name.

    # use PID to determine target path & decide whether output is needed
    pat = re.compile(r"(https://trr379\.de/|trr379root:)contributors/([\w\-]+)")

    for line in input:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of input)
            continue
        person = json.loads(line)

        if (m := pat.match(person["pid"])) is None:
            # only build pages for trr379root:/contributors prefix
            # other records could include e.g. co-authors with no affiliation with the TRR
            continue
        label = m.group(2)  # last part of pid becomes hugo page bundle name

        # additional filtering could be done here, if needed

        # key order matters: it is preserved in the output (sort_keys=False)
        front_matter_dict = {"title": get_formatted_name(person)}

        if short_project_names := process_projects(person):
            front_matter_dict["projects"] = short_project_names

        if sites := process_sites(person):
            front_matter_dict["sites"] = sites

        if roles := process_roles(person):
            front_matter_dict["roles"] = roles

        params = {}
        if (orcid := process_orcid(person)) is not None:
            params["orcid"] = orcid

        if "honorific_name_prefix" in person:
            params["name-title"] = person["honorific_name_prefix"]

        if "honorific_name_suffix" in person:
            # unused in hugo
            # TODO: maybe that should be part of formatted name
            params["name-suffix"] = person["honorific_name_suffix"]

        if sn := get_sortable_name(person):
            params["sortkey"] = sn

        if affiliation := process_affiliation(person):
            params["affiliation"] = affiliation

        if (is_active := process_projects_active(person)) is not None:
            params["active"] = is_active

        if params:
            front_matter_dict["params"] = params

        front_matter_dict["layout"] = "contributor"

        # assemble a markdown page
        page = f"---\n{yaml.dump(front_matter_dict, allow_unicode=True, sort_keys=False)}---\n"
        if description := person.get("description", False):
            page += "\n"
            page += textwrap.fill(description, width=80, break_long_words=False)

        # write the markdown file
        out_file = outdir / label / "_index.md"
        out_file.parent.mkdir(exist_ok=True)
        # The front matter is dumped with allow_unicode=True, so it may
        # contain non-ASCII characters; write UTF-8 explicitly rather
        # than relying on the locale's default encoding.
        out_file.write_text(page, encoding="utf-8")


# run the command line interface only when executed as a script
if __name__ == "__main__":
    main()