# pool-publication-page/project.py

"""Process project records into project pages

Needs the associated_with property on a project to be inlined to
correctly assign contributors and sites. Needs the (inverse of)
influenced_by field from the umbrella project to be joined to the
record (as "x_had_influence") to correctly assign roles.

"""

import json
from pathlib import Path
import re
import textwrap

import click
import pendulum
import yaml

# Maps the ROR identifier (pid) of an organization to the site label
# used in page front matter. Organizations that are not listed here
# do not produce a site entry.
SITE_DICT = {
    "ror:04xfq0f34": "aachen",  # RWTH Aachen
    "ror:04cvxnb49": "frankfurt",  # Goethe University Frankfurt
    "ror:038t36y30": "heidelberg",  # Heidelberg University
    "ror:02nv7yv05": "juelich",  # Forschungszentrum Jülich
    "ror:023b0x485": "mainz",  # Johannes Gutenberg University Mainz
    "ror:01hynnt93": "mannheim",  # CIMH Mannheim
    "ror:00fbnyb24": "wuerzburg",  # University of Würzburg
}


def format_title(d: dict) -> str | None:
    """Return formatted title

    Combines short name and title if possible. Makes do with what's
    available otherwise.

    """
    if "short_name" in d and "title" in d:
        res = f"{d['short_name']}: {d['title']}"
    elif "title" in d:
        res = d["title"]
    elif "short_name" in d:
        res = d["short_name"]
    else:
        res = None
    return res


def get_contributors_and_sites(d: dict) -> tuple[list[str], list[str]]:
    """Derive contributor and site labels from a record's associations

    Walks the "associated_with" entries. A person whose PID lies in the
    trr379root:contributors/ namespace yields a contributor label; an
    organization associated in the "marcrel:sht" role and listed in
    SITE_DICT yields a site label. Both lists are returned sorted and
    without duplicates.

    """
    person_pat = re.compile(r"trr379root:contributors/([\w\-]+)")
    people, places = set(), set()

    for entry in d.get("associated_with", []):
        target = entry.get("object")
        role_pids = {
            role if isinstance(role, str) else role.get("pid")
            for role in entry.get("roles", [])
        }
        if not isinstance(target, dict):
            # only inlined objects expose the schema type we dispatch on
            continue

        kind = target.get("schema_type")
        if kind == "trr379ri:TRR379Organization" and "marcrel:sht" in role_pids:
            # organization acting as supporting host -> site
            place = SITE_DICT.get(target["pid"])
            if place is not None:
                places.add(place)
        elif kind == "trr379ri:TRR379Person":
            # person in any role -> contributor
            hit = person_pat.match(target["pid"])
            if hit is not None:
                people.add(hit.group(1))

    return sorted(people), sorted(places)


def get_detailed_contributors(d: dict) -> tuple[list[str], list[str]]:
    """Return labels of current and past contributors

    Associations to persons are grouped by person. A person is
    identified by a "contributors/<name>" label taken from a PID in
    the trr379root: namespace; persons with other PIDs are ignored.
    Each person is then classified with is_active_contributor() based
    on all of their associations.

    Associations whose object is missing or not inlined (i.e. not a
    dict) are skipped, because the schema type cannot be checked for
    them.

    Returns a tuple (current, past) of label lists, each sorted
    alphabetically. The input record is not modified.

    """
    ctb_pat = re.compile(r"trr379root:(contributors/[\w\-]+)")
    contributors = {}

    # group associations by their objects
    for association in d.get("associated_with", []):
        # shallow copy, so that dropping "object" leaves the input intact
        ass = association.copy()
        obj = ass.pop("object", None)
        if not isinstance(obj, dict):
            # silently demand inlined input so we can check schema type
            continue
        if obj.get("schema_type") != "trr379ri:TRR379Person":
            continue
        if (m := ctb_pat.match(obj["pid"])) is not None:
            contributors.setdefault(m.group(1), []).append(ass)

    # divide contributors into current and past
    current_contributors = []
    past_contributors = []
    for label, associations in sorted(contributors.items()):
        if is_active_contributor(associations):
            current_contributors.append(label)
        else:
            past_contributors.append(label)

    return current_contributors, past_contributors


def is_active_contributor(associations):
    """Tell whether at least one of the associations is still ongoing

    An association counts as ended when it has an "ended" entry whose
    "at_time" lies in the past. An "ended" entry without "at_time" is
    treated as ended long ago (maybe someone only recorded location).
    An association without "ended", or with an end time that is not in
    the past, counts as ongoing. An empty list yields False.

    """
    has_ended = []
    for entry in associations:
        if "ended" not in entry:
            has_ended.append(False)
            continue
        end_time = entry["ended"].get("at_time", "1970-01-01")
        has_ended.append(pendulum.parse(end_time).is_past())

    # active as soon as a single association has not ended
    return any(not flag for flag in has_ended)

def get_topics(d: dict) -> list[str]:
    """Collect topic labels from the record's "about" entries

    Entries may be plain PID strings or objects carrying a "pid" key.
    Only PIDs in the trr379root:topics/ namespace are kept. Input
    order and duplicates are preserved.

    """
    topic_pat = re.compile(r"trr379root:topics/([\w\-]+)")
    # lazy pipeline: entry -> pid -> match, evaluated item by item
    pids = (
        entry if isinstance(entry, str) else entry.get("pid")
        for entry in d.get("about", [])
    )
    hits = (topic_pat.match(pid) for pid in pids)
    return [hit.group(1) for hit in hits if hit is not None]


def get_roles(d: dict) -> list[str]:
    """Return the role labels of a project

    Roles describe the project's influence on the TRR umbrella
    project. They are recorded in the umbrella project's
    "influenced_by" associations, which are expected to have been
    joined onto the project record as "x_had_influence" beforehand.

    Role entries may be PID strings or objects with a "pid" key. Only
    PIDs in the trr379root:roles/ namespace are kept. The result is
    sorted and free of duplicates.

    """
    role_pat = re.compile(r"trr379root:roles/([\w\-]+)")
    # the object of an influence is not inspected; any is accepted
    pids = (
        role if isinstance(role, str) else role.get("pid")
        for influence in d.get("x_had_influence", [])
        for role in influence.get("roles", [])
    )
    hits = (role_pat.match(pid) for pid in pids)
    return sorted({hit.group(1) for hit in hits if hit is not None})


def get_weight(d: dict) -> int | None:
    """Return sorting weight, following existing convention"""
    pat = re.compile(r"trr379root:projects/([a-cq])(\d+)")
    major_weights = {"a": 1000, "b": 2000, "c": 3000, "q": 4000}
    if (m := re.match(pat, d["pid"])) is not None:
        return major_weights[m.group(1)] + (int(m.group(2)) - 1) * 10


def pid_of(x: str | dict, label_only: bool = False) -> str:
    pid = x if isinstance(x, str) else x["pid"]
    if label_only:
        return pid.split(":", maxsplit=1)[-1]
    return pid


def read_markdown_content(p: Path) -> tuple[dict | None, str | None]:
    """Read content from a markdown page with a yaml front matter"""
    with p.open() as fp:
        lines = fp.readlines()
    fence_loc = [i for i, line in enumerate(lines) if line.rstrip() == "---"]

    if len(fence_loc) >= 2:
        header_str = "".join(lines[fence_loc[0]+1:fence_loc[1]])
        content = "".join(lines[fence_loc[1]+1:])
        try:
            header = yaml.safe_load(header_str)
        except yaml.YAMLError:
            header = None
    else:
        header = None
        content = None

    return header, content


def write_page(p: Path, fm: dict, content: str | None, sep=True):
    """Write a markdown page: yaml front matter, then optional body

    The front matter fm is dumped between two "---" fence lines,
    keeping key order and unicode characters as they are. If content
    is given it follows the closing fence, preceded by an empty line
    when sep is true.

    """
    with p.open("wt") as out:
        out.write("---\n")
        yaml.dump(fm, stream=out, allow_unicode=True, sort_keys=False)
        out.write("---\n")
        if content is not None:
            out.write(("\n" + content) if sep else content)


@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Write English and German project pages from JSON-lines records.

    INPUT holds one JSON project record per line; blank lines are
    ignored. For every record whose PID is in the projects namespace,
    OUTDIR/<label>/_index.md and OUTDIR/<label>/_index.de.md are
    (re)written. The German page keeps the title and body of an
    existing German page and only gets a refreshed header.
    """

    pat = re.compile(r"(https://trr379\.de/|trr379root:)projects/([\w\-]+)")

    for line in input:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline) in the input
            continue
        project = json.loads(line)

        if (m := pat.match(project["pid"])) is None:
            # stick to the projects/ namespace (excludes root project)
            continue
        label = m.group(2)

        # front matter keys are written in insertion order
        front_matter_dict = {}

        title = format_title(project)
        if title is not None:
            front_matter_dict["title"] = title

        contributors, sites = get_contributors_and_sites(project)
        if contributors:
            front_matter_dict["contributors"] = contributors
        if sites:
            front_matter_dict["sites"] = sites

        topics = get_topics(project)
        if topics:
            front_matter_dict["topics"] = topics

        roles = get_roles(project)
        if roles:
            front_matter_dict["roles"] = roles

        weight = get_weight(project)
        if weight is not None:
            front_matter_dict["weight"] = weight

        # "params" is only emitted when there is something to put in it
        curr_contrib, past_contrib = get_detailed_contributors(project)
        params = {}
        if curr_contrib:
            params["current_contributors"] = curr_contrib
        if past_contrib:
            params["past_contributors"] = past_contrib
        if params:
            front_matter_dict["params"] = params

        description = project.get("description")

        # prepare for writing markdown page(s)
        out_file = outdir / label / "_index.md"
        out_file.parent.mkdir(exist_ok=True)

        # write default (English) page; the description is re-flowed
        # to 80 columns without splitting long words (such as URLs)
        write_page(
            out_file,
            front_matter_dict,
            (
                textwrap.fill(description, width=80, break_long_words=False)
                if description is not None
                else None
            ),
        )

        # for German, keep description and title from the old file, update header
        # (until internationalization is solved in the Pool)
        out_file_de = out_file.with_suffix(".de.md")
        if out_file_de.exists():
            old_header_de, old_content_de = read_markdown_content(out_file_de)
        else:
            old_header_de = old_content_de = None

        new_header_de = front_matter_dict.copy()
        if old_header_de is not None and "title" in old_header_de:
            new_header_de["title"] = old_header_de["title"]
        # the old body already starts with its own separator, hence sep=False
        write_page(out_file_de, new_header_de, old_content_de, sep=False)


if __name__ == "__main__":
    main()