# pool-publication-page/filters/infer-site.py

"""Add information about site to the pool metadata

This is a TRR-specific filter which infers a TRR "site" corresponding
to the organization that a Person is delegated_by. A list of TRR sites
is hardcoded. However, not just direct matches but also short parent
and related chains in the ROR database are used to select the site.

Because we need to look at both parent and related organizations,
access to the ROR records (rather than TRR pool records) is
needed. This is done by using the data dump, converted to parquet for
smaller size and faster load time. ROR API would be an alternative,
but using the data dump is somewhat simpler, especially since rate
limits are about to get lowered.

"""

from collections.abc import Iterator
from typing import Any, TypedDict
from enum import Enum
import json

import click
import duckdb


class Relation(Enum):
    """How a matched TRR site relates to the organization that was looked up.

    Members are listed in the order in which ``match_site`` tries them.
    """

    # the organization itself is a known site
    SAME = 0
    # a direct parent of the organization is a known site
    PARENT = 1
    # an organization directly related to it is a known site
    RELATED = 2
    # a parent of one of its parents is a known site
    PARENT_PARENT = 3
    # an organization related to one of its parents is a known site
    PARENT_RELATED = 4

class NamesAndRels(TypedDict):
    """The part of a ROR record that ``load_map`` keeps for each id.

    Both values are stored as they come out of the parquet dump, which
    is why they are typed loosely.
    """

    # value of the ``names`` column of the record
    names: Any
    # value of the ``relationships`` column; ``lookup`` iterates it as a
    # sequence of mappings with ``type`` and ``id`` keys
    relationships: Any

def load_map(dump_path: str) -> dict[str, NamesAndRels]:
    """Build an id-indexed lookup table from the ROR parquet dump.

    Only the ``id``, ``names`` and ``relationships`` columns are read;
    every row becomes one entry keyed by its ``id``.

    """
    # NOTE: the SQL text below refers to this local variable by name, so
    # it has to keep the name ``all_ror_tbl``.
    all_ror_tbl = duckdb.read_parquet(dump_path)
    rows = duckdb.sql("SELECT id, names, relationships FROM all_ror_tbl").fetchall()

    all_ror_dict = {}
    for ror_id, names, relationships in rows:
        all_ror_dict[ror_id] = {"names": names, "relationships": relationships}
    return all_ror_dict  # pyright: ignore


def lookup(d: dict[str, dict[str, Any]], uri: str, rel_type: str) -> Iterator[str]:
    """Yield ROR IDs of orgs connected to ``uri`` by the given relation.

    ``d`` maps ROR IDs to records whose ``relationships`` entry is a
    sequence of mappings with ``type`` and ``id`` keys; ``rel_type`` is
    compared against each entry's ``type``.

    Nothing is yielded when ``uri`` has no record in ``d``, or when the
    record's relationships are missing, ``None`` or empty.
    """
    record = d.get(uri)
    if record is None:
        # unknown organization (not present in the mapping): no relations
        return

    relationships = record.get("relationships")
    if relationships is None:
        # key absent, or an explicit null stored for this record
        return

    for rel in relationships:
        if rel["type"] == rel_type:
            yield rel["id"]


def match_site(
    all_ror: dict, known_sites: set, uriorcurie: str
) -> tuple[str, Relation] | None:
    """Find the known site that an organization belongs to, if any.

    ``uriorcurie`` may be a ``ror:`` CURIE or a ``https://ror.org/`` URI.
    Candidates are checked in this order, and the first hit wins: the
    organization itself, its parents, its related organizations, the
    parents of its parents, and the organizations related to its parents.

    Returns a ``(site_id, relation)`` tuple, or ``None`` when none of
    the candidates is in ``known_sites``.
    """
    this_id = uriorcurie.replace("ror:", "https://ror.org/")

    if this_id in known_sites:
        return (this_id, Relation.SAME)

    # Each entry pairs a relation with a lazy stream of candidate ids.
    # ``lookup`` is a generator, so no record is touched before the
    # corresponding stream is actually iterated below.
    searches = (
        (Relation.PARENT, lookup(all_ror, this_id, "parent")),
        (Relation.RELATED, lookup(all_ror, this_id, "related")),
        (
            Relation.PARENT_PARENT,
            (
                grandparent_id
                for parent_id in lookup(all_ror, this_id, "parent")
                for grandparent_id in lookup(all_ror, parent_id, "parent")
            ),
        ),
        (
            Relation.PARENT_RELATED,
            (
                pr_id
                for parent_id in lookup(all_ror, this_id, "parent")
                for pr_id in lookup(all_ror, parent_id, "related")
            ),
        ),
    )

    for relation, candidates in searches:
        for candidate_id in candidates:
            if candidate_id in known_sites:
                return (candidate_id, relation)

    return None

@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument("dump", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.File("wt"))
def main(input, dump, output):
    """Add an x_site property to records delegated by a TRR site.

    INPUT is a JSON-lines file of pool records, DUMP is the ROR data
    dump converted to parquet, and OUTPUT receives the records as JSON
    lines. Every record is written out; x_site is only added or extended
    for records with a delegation that matches one of the known sites.
    """
    # ROR id of each TRR site -> label written to x_site
    site_dict = {
        "https://ror.org/04xfq0f34": "aachen",  # RWTH Aachen
        "https://ror.org/04cvxnb49": "frankfurt",  # Goethe University Frankfurt
        "https://ror.org/038t36y30": "heidelberg",  # Heidelberg University
        "https://ror.org/02nv7yv05": "juelich",  # Forschungszentrum Jülich
        "https://ror.org/023b0x485": "mainz",  # Johannes Gutenberg University Mainz
        "https://ror.org/01hynnt93": "mannheim",  # CIMH Mannheim
        "https://ror.org/00fbnyb24": "wuerzburg",  # University of Würzburg
    }

    site_set = set(site_dict)

    rel_map = load_map(dump)

    for line in input:
        # tolerate blank lines (e.g. a trailing newline) in the input
        if not line.strip():
            continue
        obj = json.loads(line)

        # ``or []`` also covers an explicit null stored under the key
        for delegation in obj.get("delegated_by") or []:

            # work with either inlined delegation or just pid
            target = delegation.get("object")
            dpid = target.get("pid") if isinstance(target, dict) else target

            # it only makes sense to look at ror pids; a delegation whose
            # object or pid is missing (or not a string) is skipped too
            if not isinstance(dpid, str):
                continue
            if not dpid.startswith(("ror:", "https://ror.org/")):
                continue

            # if the delegation matches site, add or extend x_site property
            m = match_site(rel_map, site_set, dpid)
            if m is None:
                continue
            site_label = site_dict[m[0]]
            sites = obj.setdefault("x_site", [])
            if site_label not in sites:
                sites.append(site_label)

        click.echo(json.dumps(obj), output)


# Run the click command only when executed as a script, not on import.
if __name__ == "__main__":
    main()