pool-publication-page/filters/infer-site.py
Michał Szczepanik bf70a92f7c Add site information
This adds an "infer-site" filter, which combines the pool information
with information from the ror database (data dump). The external ror
information is used to provide information about related organizations,
in addition to parent organizations. A duckdb dependency is used to
improve load speed and reduce data dump size by using parquet format.

Only the seven main TRR sites are hardcoded; an organization (present as
delegated_by) counts as belonging to a site if it is that site, has the
site as a parent or related organization, or has the site as its parent
organization's parent or related organization.

For example, this filter recognizes ror:03f6n9m15 University Hospital
Frankfurt as "frankfurt" site because it is related to ror:04cvxnb49
Goethe University Frankfurt (which is the "frankfurt" site).

After implementing this, I realized that it could be simplified by
inverting the lookup: start not from any given organization in the
Person record, but from the list of sites. A mapping of related, child,
child-child, and child-related organizations could be created upfront,
producing a much smaller org-to-site mapping. This can, however, be
done in the future, without changing the interface much.
2026-02-06 18:53:10 +01:00

136 lines
4.5 KiB
Python

"""Add information about site to the pool metadata
This is a TRR-specific filter which infers a TRR "site" corresponding
to the organization that a Person is delegated_by. A list of TRR sites
is hardcoded. However, not just direct matches but also short parent
and related chains in the ror database are used to select the site.
Because we need to look at both parent and related organizations,
access to the ROR records (rather than TRR pool records) is
needed. This is done by using the data dump, converted to parquet for
smaller size and faster load time. The ROR API would be an alternative,
but using the data dump is somewhat simpler, especially since rate
limits are about to get lowered.
"""
from collections.abc import Iterator
from typing import Any, TypedDict
from enum import Enum
import json
import click
import duckdb
class Relation(Enum):
    """Kind of connection between an organization and the site it matched.

    Members are numbered in the order in which ``match_site`` tries them.
    """

    # the organization itself is a known site
    SAME = 0
    # a direct parent of the organization is a known site
    PARENT = 1
    # an organization directly related to it is a known site
    RELATED = 2
    # a parent of one of its parents is a known site
    PARENT_PARENT = 3
    # an organization related to one of its parents is a known site
    PARENT_RELATED = 4
class NamesAndRels(TypedDict):
    """Subset of a ROR record kept in memory for each organization."""

    # value of the "names" column of the dump, stored as returned by duckdb
    names: Any
    # value of the "relationships" column of the dump; the rest of this
    # module reads it as a sequence of mappings with "type" and "id" keys
    relationships: Any
def load_map(dump_path: str) -> dict[str, NamesAndRels]:
    """Read the ROR parquet dump into a dict keyed by ROR id.

    Only the ``id``, ``names`` and ``relationships`` columns are read.
    Each row becomes one entry that maps its id to a dict holding the
    other two values.
    """
    # NOTE: the SQL text below refers to this local variable by name;
    # duckdb resolves it via its Python replacement scan, so the variable
    # name and the table name in the query have to stay in sync.
    all_ror_tbl = duckdb.read_parquet(dump_path)
    rows = duckdb.sql("SELECT id, names, relationships FROM all_ror_tbl").fetchall()

    by_id = {}
    for ror_id, names, relationships in rows:
        by_id[ror_id] = {"names": names, "relationships": relationships}
    return by_id  # pyright: ignore
def lookup(d: dict[str, dict[str, Any]], uri: str, rel_type: str) -> Iterator[str]:
    """Yield ROR IDs of orgs connected to ``uri`` by the given relation.

    Parameters:
        d: mapping of ROR id to a record with a "relationships" entry,
            each relationship being a mapping with "type" and "id" keys.
        uri: id of the organization whose relationships are inspected.
        rel_type: relationship type to select (e.g. "parent", "related").

    Yields:
        The "id" of every relationship of ``uri`` whose "type" equals
        ``rel_type``, in stored order. Nothing is yielded when ``uri`` is
        not present in ``d`` or when it has no relationships.
    """
    record = d.get(uri)
    if record is None:
        # id not in the dump (e.g. dump older than the record that refers
        # to it): treat as an organization without relationships
        return
    # "relationships" may be absent or explicitly null; both mean "none"
    for rel in record.get("relationships") or ():
        if rel["type"] == rel_type:
            yield rel["id"]
def match_site(
    all_ror: dict, known_sites: set, uriorcurie: str
) -> tuple[str, Relation] | None:
    """Find the known site an organization belongs to, if any.

    ``uriorcurie`` is the organization's ROR identifier; a ``ror:`` prefix
    is expanded to ``https://ror.org/`` before any comparison, so
    ``known_sites`` and the keys of ``all_ror`` are expected in URI form.

    Candidates are tried in this order and the first hit wins: the
    organization itself, its parents, its related organizations, the
    parents of its parents, and the organizations related to its parents.

    Returns a ``(site_id, Relation)`` tuple for the first match, or
    ``None`` when no candidate is a known site.
    """
    org_id = uriorcurie.replace("ror:", "https://ror.org/")
    if org_id in known_sites:
        return (org_id, Relation.SAME)

    # one hop away from the organization
    direct = (("parent", Relation.PARENT), ("related", Relation.RELATED))
    for rel_type, relation in direct:
        for candidate in lookup(all_ror, org_id, rel_type):
            if candidate in known_sites:
                return (candidate, relation)

    # two hops away, always going through a parent first
    via_parent = (
        ("parent", Relation.PARENT_PARENT),
        ("related", Relation.PARENT_RELATED),
    )
    for rel_type, relation in via_parent:
        for parent_id in lookup(all_ror, org_id, "parent"):
            for candidate in lookup(all_ror, parent_id, rel_type):
                if candidate in known_sites:
                    return (candidate, relation)

    return None
@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument("dump", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.File("wt"))
def main(input, dump, output):
    """Add an x_site property to records delegated by a TRR site.

    INPUT is read as one JSON document per line, DUMP is the ROR data dump
    in parquet format, and every record is written to OUTPUT as one JSON
    document per line, with x_site added or extended where a site matched.
    """
    site_dict = {
        "https://ror.org/04xfq0f34": "aachen",  # RWTH Aachen
        "https://ror.org/04cvxnb49": "frankfurt",  # Goethe University Frankfurt
        "https://ror.org/038t36y30": "heidelberg",  # Heidelberg University
        "https://ror.org/02nv7yv05": "juelich",  # Forschungszentrum Jülich
        "https://ror.org/023b0x485": "mainz",  # Johannes Gutenberg University Mainz
        "https://ror.org/01hynnt93": "mannheim",  # CIMH Mannheim
        "https://ror.org/00fbnyb24": "wuerzburg",  # University of Würzburg
    }
    site_set = set(site_dict)
    rel_map = load_map(dump)

    for line in input:
        obj = json.loads(line)
        # a missing or null delegated_by simply means nothing to match
        for delegation in obj.get("delegated_by") or []:
            # work with either inlined delegation or just pid
            target = delegation.get("object")
            dpid = target.get("pid") if isinstance(target, dict) else target
            # a delegation without an object/pid cannot be matched, and
            # it only makes sense to look at ror pids
            if not isinstance(dpid, str):
                continue
            if not dpid.startswith(("ror:", "https://ror.org/")):
                continue
            # if the delegation matches site, add or extend x_site property
            m = match_site(rel_map, site_set, dpid)
            if m is None:
                continue
            site_label = site_dict[m[0]]
            sites = obj.setdefault("x_site", [])
            if site_label not in sites:
                sites.append(site_label)
        # every record is passed through, whether or not a site was found
        click.echo(json.dumps(obj), output)


if __name__ == "__main__":
    main()