pool-publication-page/publication.py
Michał Szczepanik 35c21b9123 Improve author attribution processing, add SCoRO
When processing publication records, we have been checking whether each
attribution qualifies as an authorship (and not e.g. funding support).
In anticipation of addition of SCoRO terms, this commit makes two
changes:

- if the attribution roles are inlined, broad mappings will be used
  (to check if they include marcrel:aut)
- four SCoRO terms (consortium-, corresponding-, principal-, and senior-
  author) are added to a hardcoded list of terms always interpreted
  as authorship
2026-03-13 12:18:47 +01:00

275 lines
8.1 KiB
Python

import hashlib
import json
from pathlib import Path
import re
from urllib.parse import urljoin
import click
import yaml
def is_authorship(attribution: dict) -> bool:
    """Check if the attribution describes authorship

    This takes an attribution and uses a predefined set of roles to
    distinguish authors from e.g. funders and reviewers.

    A role counts as authorship when either

    - it is inlined (a dict) and lists ``marcrel:aut`` among its
      ``broad_mappings``, or
    - its PID (the role itself when given as a string, otherwise its
      ``pid`` property) is in the hardcoded set below.

    A missing or null ``roles`` / ``broad_mappings`` is treated as empty,
    and role entries that are neither strings nor dicts are ignored.

    Returns True as soon as one role qualifies, False if none does.
    """
    author_narrow = {
        "marcrel:aut",  # author
        "obo:MS_1002034",  # first author
        "obo:MS_1002035",  # senior author
        "obo:MS_1002036",  # coauthor
        "marcrel:ctb",  # contributor
        "marcrel:cre",  # creator
        "marcrel:dis",  # dissertant
        "marcrel:edt",  # editor
        "marcrel:edc",  # editor of compilation
        "SCoRO:consortium-author",
        "SCoRO:corresponding-author",
        "SCoRO:principal-author",
        "SCoRO:senior-author",
    }
    # `or []` rather than a .get() default: the key may be present but null
    for role in attribution.get("roles") or []:
        if isinstance(role, dict):
            if "marcrel:aut" in (role.get("broad_mappings") or []):
                # explicitly declared to be a narrow match for author
                return True
            rpid = role.get("pid")
        elif isinstance(role, str):
            rpid = role
        else:
            # neither a PID nor an inlined role - nothing to check
            continue
        if rpid in author_narrow:
            # matches our hardcoded list
            return True
    return False
def curie_lastpart(curie: str) -> str:
    """Return the trailing segment of a curie

    Everything up to and including the last colon is dropped; of what
    remains, only the text after the last slash (if any) is kept.
    """
    after_colon = curie.rpartition(":")[2]
    return after_colon.rpartition("/")[2]
def _orcid_id(value) -> str:
    """Reduce an ORCID (bare iD or http/https URL form) to the bare iD"""
    return str(value).strip().rstrip("/").rsplit("/", 1)[-1].upper()


def discover_authors(pool_people_metadata, external_metadata):
    """Cross-check ORCIDs in external publication record vs pool

    Takes all Person records from the pool and a JSON record from
    doi.org, then cross-references ORCIDs. Returns a list of matching
    Person PIDs from the pool, in the order in which the authors appear
    in the external record.

    ORCIDs are compared on the bare iD, so it does not matter whether
    either side stores the iD alone or as an ``http://`` / ``https://``
    orcid.org URL. Pool records without an ``orcid`` value and external
    authors without an ``ORCID`` are skipped; an external record with no
    ``author`` list yields an empty result.
    """
    orcid_map = {
        _orcid_id(record["orcid"]): record
        for record in pool_people_metadata
        if record.get("orcid")
    }
    discovered = []
    # `or []`: the "author" key may be absent (or null) in the external record
    for author in external_metadata.get("author") or []:
        orcid = author.get("ORCID")
        if not orcid:
            continue
        person = orcid_map.get(_orcid_id(orcid))
        if person is not None:
            discovered.append(person["pid"])
    return discovered
def short_name_from_pool(d):
    """Generate file name based on data in the pool

    If the PID is in the trr379.de/publications/ namespace (default in
    the web UI), use the last part. Otherwise, to avoid dealing with
    special characters or lengthy PIDs, use the md5 hash of the PID.
    In both cases the ``.md`` extension is appended.

    todo: the pool can have author, title, and date; we should use
      that to create a human-readable name before falling back to
      hashing.
    """
    pat = r"(https://trr379\.de/|trr379root:)publications/([\w\-]+)"
    pid = d["pid"]
    if (m := re.match(pat, pid)) is not None:
        return m.group(2) + ".md"
    # The initial data is passed positionally: the keyword name (data= vs
    # string=) is not accepted uniformly across hashlib implementations.
    # The digest only serves as a file name, hence usedforsecurity=False.
    digest = hashlib.md5(pid.encode(), usedforsecurity=False).hexdigest()
    return digest + ".md"
def lastname_firstname(person: dict) -> str:
    """Build a "family name first" string for a person, usable as sort key

    The family name, given name, additional names (comma-joined),
    honorific prefix and honorific suffix are joined with semicolons;
    absent parts leave no empty slots behind. If the person has none of
    these, a best-effort name is derived from the last part of the PID
    by reversing its dash-separated pieces.
    """
    middle_names = ",".join(person.get("additional_names", []))
    name_fields = (
        person.get("family_name", ""),
        person.get("given_name", ""),
        middle_names,
        person.get("honorific_name_prefix", ""),
        person.get("honorific_name_suffix", ""),
    )
    joined = ";".join(name_fields).strip(";")
    # collapse the runs of separators left by empty fields in the middle
    joined = re.sub(r";{2,}", ";", joined)
    if joined:
        return joined
    # best effort from the PID
    pid_pieces = curie_lastpart(person["pid"]).split("-")
    return ";".join(reversed(pid_pieces))
def fill_date(date: str) -> str:
    """Pad a partial date with -01 so that it reads yyyy-mm-dd

    Hugo does not accept incomplete iso dates and parses integers as
    timestamps, so a bare year gets "-01-01" appended and a year-month
    gets "-01". Anything with more dash-separated parts is returned
    unchanged.
    """
    padding_by_part_count = {1: "-01-01", 2: "-01"}
    part_count = len(date.split("-"))
    return date + padding_by_part_count.get(part_count, "")
def pid_of(x: str | dict) -> str:
"""Return a PID of an object, inlined or not
A shortcut - makes a pid string or an inlined dict (where pid is a
property) equivalent. Does not do further validation, but it could
be added here.
"""
return x.get("pid", "") if isinstance(x, dict) else x
def process_publication_date(paper: dict) -> str | None:
for generation in paper.get("generated_by", []):
if pid_of(generation.get("object")) == "obo:IAO_0000444": # Publishing process
return generation.get("at_time")
def process_doi(paper: dict) -> str | None:
"""Return a DOI from identifiers"""
for identifier in paper.get("identifiers", []):
if (
pid_of(identifier.get("creator")) == "ror:01fyxcz70"
or identifier.get("schema_type") == "dlthings:DOI"
):
return identifier.get("notation")
def process_single(
    paper: dict,
    outdir: Path | None,
    filename: str | None = None,
) -> None:
    """Render one publication record as a Hugo markdown page

    The page is a YAML front matter (title, date, contributors,
    projects, topics, external_url, layout, and licenses under params)
    followed by the abstract and - if the record carries one - the
    citation.

    paper: the publication record (a dict parsed from JSON). Referenced
      objects may be given as PID strings or inlined dicts; a missing
      or null reference is skipped.
    outdir: directory to write the page into; if None, the page is
      printed to stdout instead.
    filename: name of the output file within outdir. If None, the
      record's ``x_suggested_name`` is used, falling back to a name
      derived from the PID.
    """
    # some information can be lifted straight from the json record
    # (or minimally preprocessed)
    title = paper.get("title")
    abstract = paper.get("description")
    # citation is not part of the schema, but can come from enrichment
    citation = paper.get("x_citation")

    # fill date to yyyy-mm-dd for hugo-compatible formatting
    date = process_publication_date(paper)
    if date is not None:
        date = fill_date(date)

    # Resolve references to plain PIDs up front; `or ""` turns a missing
    # or null reference into an empty PID that matches no prefix below.
    generated_pids = [
        pid_of(generation.get("object")) or ""
        for generation in paper.get("generated_by", [])
    ]
    about_pids = [pid_of(topic) or "" for topic in paper.get("about", [])]
    rule_pids = [pid_of(rule) or "" for rule in paper.get("rules", [])]

    # since most curies are website-related we can use them verbatim for Hugo
    # (projects and topics are sorted for reproducible output)
    projects = sorted(
        curie_lastpart(pid)
        for pid in generated_pids
        if pid.startswith("trr379root:projects")
    )
    topics = sorted(
        curie_lastpart(pid)
        for pid in about_pids
        if pid.startswith("trr379root:topics")
    )
    licenses = [
        curie_lastpart(pid)
        for pid in rule_pids
        if pid.startswith(("spdx", "https://spdx"))
    ]

    doi = process_doi(paper)  # hugo template wants external url instead
    external_url = urljoin("https://doi.org", doi) if doi is not None else None

    # Contributors: only authorship attributions whose object is inlined
    # (names are needed for sorting) and lives in the trr namespace.
    # The restriction to the trr namespace is there because we want Hugo
    # to create links, not additional pages (this assumes correspondence
    # of pool pids with website urls); a more complete check would be a
    # query for project membership.
    authors = []
    for attribution in paper.get("attributed_to", []):
        if not is_authorship(attribution):
            continue
        person = attribution.get("object")
        if not isinstance(person, dict):
            continue
        if not (pid_of(person) or "").startswith("trr379root:contributors/"):
            continue
        authors.append(person)
    # sort contributors for reproducibility
    authors.sort(key=lastname_firstname)
    contributors = [curie_lastpart(person["pid"]) for person in authors]

    # assemble the front matter
    front_matter_dict = {
        "title": title,
        "date": date,
        "contributors": contributors,
        "projects": projects,
        "topics": topics,
        "external_url": external_url,
        "layout": "publication",
        "params": {"licenses": licenses},
    }

    # assemble a markdown page
    front_matter = f"---\n{yaml.dump(front_matter_dict)}\n---\n"
    body = ""
    if abstract is not None:
        body += "\n" + abstract
        body += "" if abstract.endswith("\n") else "\n"
    if citation is not None:
        body += "\n" + citation + "\n"
    page = front_matter + body

    # decide on output file and save
    if outdir is None:
        print(page)
        return
    if filename is None:
        filename = paper.get("x_suggested_name")  # can be added via enrichment
    if filename is None:
        filename = short_name_from_pool(paper)
    # explicit encoding: abstracts may contain non-ASCII text and the
    # locale default is not UTF-8 everywhere
    (outdir / filename).write_text(page, encoding="utf-8")
@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Create a markdown page for every publication record in INPUT.

    INPUT is read line by line, with one JSON record per line; blank
    lines are skipped. Pages are written into the existing directory
    OUTDIR.
    """
    # the parameter has to be called `input` to match the click argument
    for line in input:
        if not line.strip():
            # e.g. a trailing empty line - nothing to parse
            continue
        paper = json.loads(line)
        process_single(paper, outdir=outdir)


if __name__ == "__main__":
    main()