This adds an "infer-site" filter, which combines the pool information with information from the ROR database (data dump). The external ROR information is used to provide information about related organizations, in addition to parent organizations. A duckdb dependency is used to improve load speed and reduce data dump size by using the parquet format. Only the seven main TRR sites are hardcoded; an organization (present as delegated_by) counts as a site if it is that site, has the site as a parent or related organization, or has the site as its parent organization's parent or related organization. For example, this filter recognizes ror:03f6n9m15 University Hospital Frankfurt as the "frankfurt" site because it is related to ror:04cvxnb49 Goethe University Frankfurt (which is the "frankfurt" site). After implementing this, I realized that this could be simplified by inverting the lookup: start not from any given organization in the Person record, but from the list of sites. A mapping of related, child, child-child, and child-related organizations could be created upfront, producing a much smaller org-to-site relationship. This can, however, be done in the future, without changing the interface much.
136 lines
4.5 KiB
Python
136 lines
4.5 KiB
Python
"""Add information about site to the pool metadata

This is a TRR-specific filter which infers a TRR "site" corresponding
to the organization that a Person is delegated_by. A list of TRR sites
is hardcoded. However, not just direct matches but also short parent
and related chains in the ROR database are used to select the site.

Because we need to look at both parent and related organizations,
access to the ROR records (rather than TRR pool records) is
needed. This is done by using the data dump, converted to parquet for
smaller size and faster load time. The ROR API would be an alternative,
but using the data dump is somewhat simpler, especially since rate
limits are about to get lowered.

"""
|
|
|
|
from collections.abc import Iterator
|
|
from typing import Any, TypedDict
|
|
from enum import Enum
|
|
import json
|
|
|
|
import click
|
|
import duckdb
|
|
|
|
|
|
class Relation(Enum):
    """How a matched site is connected to the organization being examined.

    The numeric values follow the order in which ``match_site`` tries the
    different kinds of connection.
    """

    SAME = 0  # the organization is itself a site
    PARENT = 1  # a parent of the organization is a site
    RELATED = 2  # an organization related to it is a site
    PARENT_PARENT = 3  # a parent of one of its parents is a site
    PARENT_RELATED = 4  # an organization related to one of its parents is a site
|
|
|
|
|
|
class NamesAndRels(TypedDict):
    """The part of a ROR record that ``load_map`` keeps for each organization."""

    # Both values are stored as returned by the parquet query, hence ``Any``.
    names: Any
    relationships: Any
|
|
|
|
|
|
def load_map(dump_path: str) -> dict[str, NamesAndRels]:
    """Build an in-memory index of the ROR parquet dump, keyed by ROR id.

    Only the ``id``, ``names`` and ``relationships`` columns are read.

    Args:
        dump_path: Path to the ROR data dump in parquet format.

    Returns:
        A dict mapping every ``id`` value to a dict holding the ``names``
        and ``relationships`` values of the same row.
    """
    # NOTE: the SQL text below refers to this local variable by name (DuckDB
    # resolves Python variables that are used as table names), so it has to
    # stay in this scope under exactly this name even though it looks unused.
    all_ror_tbl = duckdb.read_parquet(dump_path)
    rows = duckdb.sql("SELECT id, names, relationships FROM all_ror_tbl").fetchall()

    index: dict[str, NamesAndRels] = {}
    for ror_id, names, relationships in rows:
        index[ror_id] = NamesAndRels(names=names, relationships=relationships)
    return index
|
|
|
|
|
|
def lookup(d: dict[str, dict[str, Any]], uri: str, rel_type: str) -> Iterator[str]:
    """Yield ROR IDs of orgs connected to ``uri`` by the given relation.

    Args:
        d: Mapping of ROR id to a record dict. Relations are read from the
            record's ``"relationships"`` entry, a list of dicts with
            ``"type"`` and ``"id"`` keys.
        uri: ROR id of the organization whose relations are inspected.
        rel_type: Relation type to select, e.g. ``"parent"`` or ``"related"``.

    Yields:
        The ``"id"`` of every relationship whose ``"type"`` equals
        ``rel_type``, in stored order. Nothing is yielded when ``uri`` is not
        present in ``d`` or its record carries no relationships.
    """
    record = d.get(uri)
    if record is None:
        # An id that is missing from the dump has no known relations; this
        # must not abort processing of the remaining input.
        return

    # "relationships" may be absent or None; treat both as an empty list.
    for rel in record.get("relationships") or []:
        if rel["type"] == rel_type:
            yield rel["id"]
|
|
|
|
|
|
def match_site(
    all_ror: dict, known_sites: set, uriorcurie: str
) -> tuple[str, Relation] | None:
    """Find the known site an organization belongs to, if any.

    Connections are tried from closest to most distant: the organization
    itself, its parents, its related organizations, the parents of its
    parents, and finally the organizations related to its parents. The first
    hit wins.

    Args:
        all_ror: ROR records indexed by id, as consumed by ``lookup``.
        known_sites: ROR ids (``https://ror.org/...`` form) of the sites.
        uriorcurie: ROR identifier of the organization, either as a
            ``ror:`` CURIE or as a ``https://ror.org/`` URL.

    Returns:
        A ``(site_id, relation)`` tuple for the first match, or ``None`` when
        no connection to a known site is found.
    """
    org_id = uriorcurie.replace("ror:", "https://ror.org/")

    if org_id in known_sites:
        return (org_id, Relation.SAME)

    # One hop away from the organization.
    direct_steps = (
        ("parent", Relation.PARENT),
        ("related", Relation.RELATED),
    )
    for rel_type, relation in direct_steps:
        for candidate in lookup(all_ror, org_id, rel_type):
            if candidate in known_sites:
                return (candidate, relation)

    # Two hops away, always going through a parent first.
    via_parent_steps = (
        ("parent", Relation.PARENT_PARENT),
        ("related", Relation.PARENT_RELATED),
    )
    for rel_type, relation in via_parent_steps:
        for parent_id in lookup(all_ror, org_id, "parent"):
            for candidate in lookup(all_ror, parent_id, rel_type):
                if candidate in known_sites:
                    return (candidate, relation)

    return None
|
|
|
|
|
|
def _delegation_pid(delegation):
    """Return the pid of a delegation's object (inlined or bare), else None."""
    target = delegation.get("object")
    if isinstance(target, dict):
        # inlined object: the identifier lives in its "pid" property
        target = target.get("pid")
    return target if isinstance(target, str) else None


@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument("dump", type=click.Path(exists=True, dir_okay=False))
@click.argument("output", type=click.File("wt"))
def main(input, dump, output):
    """Add an "x_site" property to the records read from INPUT.

    INPUT holds one JSON object per line, DUMP is the ROR data dump in
    parquet format, and every record is written to OUTPUT as one JSON line,
    extended with the labels of all sites matched by its "delegated_by"
    organizations.
    """
    site_dict = {
        "https://ror.org/04xfq0f34": "aachen",  # RWTH Aachen
        "https://ror.org/04cvxnb49": "frankfurt",  # Goethe University Frankfurt
        "https://ror.org/038t36y30": "heidelberg",  # Heidelberg University
        "https://ror.org/02nv7yv05": "juelich",  # Forschungszentrum Jülich
        "https://ror.org/023b0x485": "mainz",  # Johannes Gutenberg University Mainz
        "https://ror.org/01hynnt93": "mannheim",  # CIMH Mannheim
        "https://ror.org/00fbnyb24": "wuerzburg",  # University of Würzburg
    }
    site_set = set(site_dict)

    rel_map = load_map(dump)

    for line in input:
        if not line.strip():
            # tolerate blank lines instead of failing in json.loads
            continue
        obj = json.loads(line)

        # "delegated_by" may be missing or null; both mean no delegations
        for delegation in obj.get("delegated_by") or []:
            # work with either inlined delegation or just pid
            dpid = _delegation_pid(delegation)

            # it only makes sense to look at ror pids; a delegation without
            # a usable pid is skipped rather than crashing on None
            if dpid is None or not dpid.startswith(("ror:", "https://ror.org/")):
                continue

            # if the delegation matches site, add or extend x_site property
            m = match_site(rel_map, site_set, dpid)
            if m is None:
                continue
            site_label = site_dict[m[0]]
            sites = obj.setdefault("x_site", [])
            if site_label not in sites:
                sites.append(site_label)

        click.echo(json.dumps(obj), output)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|