When processing publication records, we check whether each attribution qualifies as authorship (and not, e.g., funding support). In anticipation of the addition of SCoRO terms, this commit makes two changes: - if the attribution roles are inlined, their broad mappings are used (to check whether they include marcrel:aut) - four SCoRO terms (consortium-, corresponding-, principal-, and senior-author) are added to the hardcoded list of terms that are always interpreted as authorship
275 lines
8.1 KiB
Python
275 lines
8.1 KiB
Python
import hashlib
|
|
import json
|
|
from pathlib import Path
|
|
import re
|
|
from urllib.parse import urljoin
|
|
|
|
import click
|
|
import yaml
|
|
|
|
|
|
def is_authorship(attribution: dict) -> bool:
    """Check if the attribution describes authorship

    This takes an attribution and uses a predefined set of roles to
    distinguish authors from e.g. funders and reviewers.

    Each entry of ``attribution["roles"]`` may be either a role PID
    (string) or an inlined role (dict with a ``pid`` and, optionally,
    ``broad_mappings``). An attribution counts as authorship as soon as
    one of its roles either

    - is inlined and lists ``marcrel:aut`` among its broad mappings, or
    - has a PID contained in the hardcoded list below.

    Missing or null ``roles`` / ``broad_mappings`` are treated as empty,
    and role entries that are neither strings nor dicts are ignored.

    Parameters
    ----------
    attribution:
        A single attribution record.

    Returns
    -------
    bool
        True if any role qualifies as authorship, False otherwise
        (including when there are no roles at all).

    """

    author_narrow = {
        "marcrel:aut",  # author
        "obo:MS_1002034",  # first author
        "obo:MS_1002035",  # senior author
        "obo:MS_1002036",  # coauthor
        "marcrel:ctb",  # contributor
        "marcrel:cre",  # creator
        "marcrel:dis",  # dissertant
        "marcrel:edt",  # editor
        "marcrel:edc",  # editor of compilation
        "SCoRO:consortium-author",
        "SCoRO:corresponding-author",
        "SCoRO:principal-author",
        "SCoRO:senior-author",
    }

    # `or []` also covers an explicit null, which .get's default does not
    for role in attribution.get("roles") or []:
        if isinstance(role, str):
            rpid, broad_mappings = role, ()
        elif isinstance(role, dict):
            rpid = role.get("pid")
            broad_mappings = role.get("broad_mappings") or ()
        else:
            # neither a PID nor an inlined role - nothing to inspect
            continue

        if "marcrel:aut" in broad_mappings:
            # explicitly declared to be a narrow match for author
            return True
        if rpid in author_narrow:
            # matches our hardcoded list
            return True

    return False
|
|
|
|
|
|
def curie_lastpart(curie: str) -> str:
    """Return the trailing segment of a curie

    The segment is whatever follows the last ``/`` within the text
    that follows the last ``:``. If a separator is absent, the
    corresponding step keeps the text unchanged.

    """
    _, _, after_colon = curie.rpartition(":")
    _, _, tail = after_colon.rpartition("/")
    return tail
|
|
|
|
|
|
def discover_authors(pool_people_metadata, external_metadata):
|
|
"""Cross-check ORCIDs in external publication record vs pool
|
|
|
|
Takes all Person records from the pool and a JSON record from
|
|
doi.org, then cross-refernces ORCIDs. Returns a list of matching
|
|
Person PIDs from the pool.
|
|
|
|
"""
|
|
orcid_map = {
|
|
f"https://orcid.org/{record.get('orcid')}": record
|
|
for record in pool_people_metadata
|
|
if "orcid" in record
|
|
}
|
|
|
|
discovered = []
|
|
for author in external_metadata.get("author"):
|
|
if (orcid := author.get("ORCID")) is not None:
|
|
if (person := orcid_map.get(orcid)) is not None:
|
|
discovered.append(person["pid"])
|
|
|
|
return discovered
|
|
|
|
|
|
def short_name_from_pool(d):
    """Generate file name based on data in the pool

    If the PID is in the trr379.de/publications/ namespace (default in
    the web UI), use the last part. Otherwise, to avoid dealing with
    special characters or lengthy PIDs, use the md5 hash of the PID.

    todo: the pool can have author, title, and date; we should use
    that to create a human-readable name before falling back to
    hashing.

    Parameters
    ----------
    d:
        Record (dict) with a string ``pid``.

    Returns
    -------
    str
        File name ending in ``.md``.

    """
    pat = r"(https://trr379\.de/|trr379root:)publications/([\w\-]+)"
    m = re.match(pat, d["pid"])
    if m is not None:
        return m.group(2) + ".md"

    # Pass the bytes positionally: the keyword name for the initial data
    # is not consistent across Python versions, so `data=` can raise
    # TypeError. The digest only serves as a file name, hence
    # usedforsecurity=False (keeps this working on restricted builds).
    digest = hashlib.md5(d["pid"].encode(), usedforsecurity=False).hexdigest()
    return digest + ".md"
|
|
|
|
|
|
def lastname_firstname(person: dict) -> str:
    """Build a ';'-separated name string, family name first

    The pieces are, in order: family name, given name, additional
    names (comma-joined), honorific prefix, honorific suffix. Absent
    pieces leave no empty slots behind. If nothing is available at
    all, the last part of the PID is split on '-' and its pieces are
    joined in reverse order instead.

    """
    extra_names = ",".join(person.get("additional_names", []))
    pieces = (
        person.get("family_name", ""),
        person.get("given_name", ""),
        extra_names,
        person.get("honorific_name_prefix", ""),
        person.get("honorific_name_suffix", ""),
    )

    # drop separators at the edges, then squeeze runs left by empty pieces
    joined = ";".join(pieces).strip(";")
    name = re.sub(r";{2,}", ";", joined)
    if name:
        return name

    # best effort from the PID
    tokens = curie_lastpart(person["pid"]).split("-")
    return ";".join(reversed(tokens))
|
|
|
|
|
|
def fill_date(date: str) -> str:
    """Pad a partial date with -01 until it reads yyyy-mm-dd

    Hugo rejects incomplete iso dates and parses bare integers as
    timestamps, so a year or year-month is completed with the first
    month / first day. Anything with two or more hyphens is returned
    unchanged.

    """
    # number of hyphens present -> suffix that completes the date
    padding = {0: "-01-01", 1: "-01"}
    return date + padding.get(date.count("-"), "")
|
|
|
|
|
|
def pid_of(x: str | dict) -> str:
|
|
"""Return a PID of an object, inlined or not
|
|
|
|
A shortcut - makes a pid string or an inlined dict (where pid is a
|
|
property) equivalent. Does not do further validation, but it could
|
|
be added here.
|
|
|
|
"""
|
|
return x.get("pid", "") if isinstance(x, dict) else x
|
|
|
|
|
|
def process_publication_date(paper: dict) -> str | None:
    """Return the time of the first publishing process, if any

    Scans the "generated_by" entries of the record and takes "at_time"
    from the first one whose object is obo:IAO_0000444. Yields None
    when no such entry exists (or when that entry has no "at_time").

    """
    publishing_process = "obo:IAO_0000444"  # Publishing process
    at_times = (
        entry.get("at_time")
        for entry in paper.get("generated_by", [])
        if pid_of(entry.get("object")) == publishing_process
    )
    return next(at_times, None)
|
|
|
|
|
|
def process_doi(paper: dict) -> str | None:
    """Pick the DOI out of the record's identifiers

    The first identifier that is either created by ror:01fyxcz70 or
    typed as dlthings:DOI wins, and its "notation" is returned. None
    is returned when no identifier qualifies.

    """
    for entry in paper.get("identifiers", []):
        # NOTE(review): ror:01fyxcz70 is presumably the DOI issuing body - confirm
        creator_matches = pid_of(entry.get("creator")) == "ror:01fyxcz70"
        type_matches = entry.get("schema_type") == "dlthings:DOI"
        if creator_matches or type_matches:
            return entry.get("notation")
    return None
|
|
|
|
|
|
def _lastparts_in_namespace(items, prefixes: tuple[str, ...]) -> list[str]:
    """Return the last curie part of each item whose PID starts with a prefix

    Items may be PID strings or inlined dicts; entries without a usable
    PID (None, dict without "pid") are skipped. A null collection is
    treated as empty. Input order is preserved.
    """
    pids = (pid_of(item) or "" for item in items or [])
    return [curie_lastpart(pid) for pid in pids if pid.startswith(prefixes)]


def process_single(
    paper: dict,
    outdir: Path | None,
    filename: str | None = None,
) -> None:
    """Render one publication record as a Hugo markdown page

    The page consists of a YAML front matter (title, date, contributors,
    projects, topics, external url, layout, licenses) followed by the
    abstract and, if present, the citation.

    Parameters
    ----------
    paper:
        Publication record. Besides schema properties, the enrichment
        keys ``x_citation`` and ``x_suggested_name`` are honoured.
    outdir:
        Directory to write the page into. If None, the page is printed
        to standard output instead.
    filename:
        File name to use inside ``outdir``. If None, the record's
        ``x_suggested_name`` is used, falling back to a name derived
        from the PID.

    """

    # some information can be lifted straight from the json record
    # (or minimally preprocessed)
    # since most curies are website-related we can use them verbatim for Hugo
    title = paper.get("title")
    abstract = paper.get("description")
    # citation is not part of the schema, but can come from enrichment
    citation = paper.get("x_citation")

    # fill date to yyyy-mm-dd for hugo-compatible formatting
    date = process_publication_date(paper)
    if date is not None:
        date = fill_date(date)

    # projects and topics are sorted for reproducible output;
    # licenses keep the order of the record
    generation_objects = [
        generation.get("object") for generation in paper.get("generated_by") or []
    ]
    projects = sorted(
        _lastparts_in_namespace(generation_objects, ("trr379root:projects",))
    )
    topics = sorted(
        _lastparts_in_namespace(paper.get("about"), ("trr379root:topics",))
    )
    licenses = _lastparts_in_namespace(
        paper.get("rules"), ("spdx", "https://spdx")
    )

    doi = process_doi(paper)  # hugo template wants external url instead
    external_url = urljoin("https://doi.org", doi) if doi is not None else None

    # Contributors are the authors that are
    # - inlined (a bare PID carries no name to sort by), and
    # - in the trr namespace, because we want Hugo to create links, not
    #   additional pages (this assumes correspondence of pool pids with
    #   website urls; a more complete check would be a query for project
    #   membership)
    authors = []
    for attribution in paper.get("attributed_to") or []:
        person = attribution.get("object")
        if not isinstance(person, dict) or not is_authorship(attribution):
            continue
        if (pid_of(person) or "").startswith("trr379root:contributors/"):
            authors.append(person)

    # sort contributors for reproducibility
    authors.sort(key=lastname_firstname)
    contributors = [curie_lastpart(person["pid"]) for person in authors]

    # assemble the front matter
    front_matter_dict = {
        "title": title,
        "date": date,
        "contributors": contributors,
        "projects": projects,
        "topics": topics,
        "external_url": external_url,
        "layout": "publication",
        "params": {"licenses": licenses},
    }

    # assemble a markdown page
    front_matter = f"---\n{yaml.dump(front_matter_dict)}\n---\n"

    body = ""
    if abstract is not None:
        body += "\n"
        body += abstract
        body += "" if abstract.endswith("\n") else "\n"
    if citation is not None:
        body += "\n"
        body += citation
        body += "\n"

    page = front_matter + body

    # decide on output file and save
    if outdir is None:
        print(page)
        return

    if filename is None:
        filename = paper.get("x_suggested_name")  # can be added via enrichment
    if filename is None:
        filename = short_name_from_pool(paper)

    # explicit encoding: do not depend on the platform's locale default
    (outdir / filename).write_text(page, encoding="utf-8")
|
|
|
|
|
|
@click.command()
@click.argument("input", type=click.File("rb"))
@click.argument(
    "outdir",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
)
def main(input, outdir):
    """Convert publication records from INPUT into Hugo pages in OUTDIR.

    INPUT is read line by line, each non-blank line being one JSON
    record. One markdown file per record is written into OUTDIR, which
    must be an existing directory.
    """

    for line in input:
        if not line.strip():
            # blank lines carry no record; json.loads would raise on them
            continue
        paper = json.loads(line)
        process_single(paper, outdir=outdir)
|
|
|
|
|
|
# invoke the click command only when this file is executed as a script
if __name__ == "__main__":
    main()
|