Data Preparation OpenAlex-ID
Getting Started
Functions
normalize_university_names()
Normalize and standardize university labels to English and native-name variants.
Arguments
df
Tibble or data frame containing at least
`uid`, `naam`, `clean_name`, and a `university` column (as a list-column of one or more raw affiliation strings per row).
Method
The function first defines a lookup list that maps patterns in the raw university field (e.g., “VU”, “UvA”, “Leiden”, “RUG”, “Gothenburg”) to a pair of names: an English label and a native-language label. Some entries only specify the native name, and some (e.g., “Politie”, “UvH”) are explicitly mapped to NA.
It then standardizes the structure of the input by collapsing the university list-column to a single character string per row (taking the first element when multiple are present), normalizing occurrences of “Uni”/“uni” to “University”, and setting two working columns: university_name (default English name, initialized as the original university) and university_name2 (default native name, initialized to NA).
Next, it iterates over each regex pattern in the lookup. For rows where the pattern matches university, it overwrites university_name with the English label (if provided and still equal to the original value) and fills university_name2 with the native label when it is missing.
Finally, the two name columns are stacked with pivot_longer(), yielding a long format with one row per (uid, naam, clean_name, university, university_name) combination (where university_name can be either the English or native variant, or NA for explicitly excluded cases). The function returns this tidy table, which can be used downstream to choose a preferred representation or to handle multilingual labeling of university affiliations.
normalize_university_names = function(df){
  # Normalize raw affiliation strings to a standardized English label plus a
  # native-language label, then stack both variants into long format.
  #
  # Args:
  #   df: tibble/data frame with at least uid, naam, clean_name, and a
  #       `university` list-column of raw affiliation strings per row.
  # Returns:
  #   Long-format tibble with one row per (uid, naam, clean_name, university,
  #   university_name) combination; university_name is the English or native
  #   variant, or NA for explicitly excluded affiliations.

  # 1. Pattern → c(english, native) (use ^...$ to force exact matches)
  lookup = list(
    "VU" = c("Free University Amsterdam",
             "Vrije Universiteit Amsterdam"),
    "(UvA|Uva)" = c("University of Amsterdam",
                    "Universiteit van Amsterdam"),
    "Leiden" = c("Leiden University",
                 "Leiden Universiteit"),
    # this pattern matches with “RU”, “(RU)”, “RU,”, etc., and excludes “RUG”.
    "\\bRU\\b" = c("Radboud University Nijmegen",
                   "Radboud Universiteit Nijmegen"),
    "(UU|UCU)" = c("Utrecht University",
                   "Universiteit Utrecht"),
    "^EUR$" = c("Erasmus University Rotterdam",
                "Erasmus Universiteit Rotterdam"),
    "(RUG|UvG)" = c("University of Groningen",
                    "Rijksuniversiteit Groningen"),
    "(UvT|Uvt|Tilburg)" = c("University of Tilburg",
                            "Universiteit van Tilburg"),
    "WUR" = c("Wageningen University & Research",
              "Wageningen University & Research"),
    "^TU Delft$" = c("Technical University Delft",
                     "Technische Universiteit Delft"),
    "Milano" = c("University of Milan",
                 "Università degli Studi di Milano Statale"),
    # NOTE(review): "Berlijn" (Dutch for Berlin) maps to Hamburg — looks like
    # a dataset-specific correction; confirm this is intentional.
    "Berlijn" = c("University of Hamburg",
                  "Universität Hamburg"),
    "Trento" = c("University of Trento",
                 "Università degli Studi di Trento"),
    "Stockholm" = c("Stockholm University",
                    "Stockholms Universitet"),
    "Gent" = c("Ghent University",
               "Universiteit Gent"),
    # Only native names (leave EN as-is)
    "Gothenburg" = c(NA, "Göteborgs universitet"),
    "Cologne" = c(NA, "Universität zu Köln"),
    "Koc" = c(NA, "Koç Üniversitesi"),
    "Turku" = c(NA, "Turun Yliopisto"),
    "Lausane" = c(NA, "Université de Lausanne"),
    "Leipzig" = c(NA, "Universität Leipzig"),
    "Linköping" = c(NA, "Linköpings universitet"),
    # Explicit NAs
    "^Politie$" = c(NA, NA),
    "^UvH$" = c(NA, NA)
  )
  # 2. set the default values for university_names
  df = df |>
    mutate(
      # collapse the list-column: keep the first affiliation string per row
      university = map_chr(university, ~ {
        if (is.null(.x) || length(.x) == 0) return(NA_character_)
        as.character(.x[[1]])
      }),
      # NOTE(review): the "uni" alternative (no trailing space) also matches
      # inside words, e.g. a lowercase "university" would become
      # "University versity" — confirm the input never contains it.
      university = str_replace(university, "(Uni |uni)", "University "),
      university_name = university,    # default EN → original
      university_name2 = NA_character_ # default native → NA
    )
  # 3. replace the university name when the pattern is detected in university
  for (pat in names(lookup)) {
    vals = lookup[[pat]]
    idx = str_detect(df$university, pat)
    # which() drops the NAs produced by str_detect(NA, ...) and by NA
    # comparisons; a logical subscript containing NA errors on assignment.
    if (!is.na(vals[1])) {
      rows_en = which(idx & df$university_name == df$university)
      df$university_name[rows_en] = vals[1]
    }
    if (!is.na(vals[2])) {
      rows_native = which(idx & is.na(df$university_name2))
      df$university_name2[rows_native] = vals[2]
    }
  }
  # 4. stack the values of the two university_name variables
  df = df |>
    pivot_longer(
      cols = c(university_name, university_name2),
      names_to = "var",
      values_to = "university_name"
    ) |>
    select(uid, naam, clean_name, university, university_name)
  return(df)
}
oa_fetch_institutions()
Fetch and disambiguate OpenAlex institution records for a vector of institution names.
Arguments
institutions
Character vector of institution names to look up in OpenAlex (e.g., “Utrecht University”, “Radboud University Nijmegen”).
pause
Numeric number of seconds to sleep between successive API calls (default 1). Use this to throttle requests when scraping many institutions.
Method
The function starts by reporting the current time with cli_alert() and initializing a progress bar via cli_progress_bar(). It then loops over all elements of institutions, optionally sleeping pause seconds before each request. For every institution name, it calls oa_fetch(entity = "institutions", search = institution) inside tryCatch(), converting any errors or warnings to NULL so that failures do not interrupt the loop.
If the result is a non-null tibble, it first renames the OpenAlex columns id and display_name to institution_id and institution_name. For single-row results, that row is stored directly in hold[[institution]]. For multi-row results, it constructs a regular expression ^<institution>$ and checks for exact matches on institution_name. When there are no exact matches, the full result is kept (for later manual inspection); when there is exactly one match, only that row is kept; when there are multiple exact matches, the first matching row is retained as the most relevant hit. The progress bar is updated on each iteration.
After processing all names, the function binds all stored tibbles with bind_rows(), removes duplicate institutions using distinct(institution_id, .keep_all = TRUE), and keeps only institution_name and institution_id. It then adds an institution_url column equal to the original OpenAlex URL and converts institution_id to a compact identifier by stripping the "https://openalex.org/" prefix. The function returns this tidy tibble of unique, disambiguated OpenAlex institution records.
oa_fetch_institutions = function(institutions, pause = 1, api_key = APIKEY){
  # Fetch OpenAlex institution records for a vector of institution names and
  # disambiguate multi-hit results by exact display-name match.
  #
  # Args:
  #   institutions: character vector of institution names to search for.
  #   pause: seconds to sleep between successive API calls (default 1).
  #   api_key: OpenAlex API key (defaults to the global APIKEY, preserving
  #            the previous behavior).
  # Returns:
  #   Tibble of unique institutions with institution_name, institution_id
  #   (compact id, prefix stripped) and institution_url (full OpenAlex URL).
  k = length(institutions)
  hold = list()
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping institutions", total = k, clear = FALSE)
  # 1. iterate over all institutions
  for (institution in institutions){
    # sleep if a pause time is set (throttles API requests)
    if (pause > 0) Sys.sleep(pause)
    # 2. fetch result for this institution; errors/warnings become NULL so a
    #    single failed lookup does not abort the loop
    res = tryCatch(
      oa_fetch(
        entity = "institutions", search = institution, api_key = api_key
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    cli_progress_update()
    # 3. for cases with a valid result rename columns
    if (is.null(res) || !is_tibble(res)) next
    res = res |> rename(
      "institution_id" = "id",
      "institution_name" = "display_name"
    )
    # 4.1. single result hits are stored directly
    if (nrow(res) == 1L) {
      hold[[institution]] = res
      next
    }
    # 4.2. multiple hits: keep rows whose display name matches the query
    #      exactly. Plain string equality replaces the previous regex
    #      (paste0("^", institution, "$")), which broke on names containing
    #      regex metacharacters such as "&", "(" or ")".
    idx = !is.na(res$institution_name) & res$institution_name == institution
    if (sum(idx) == 0L) {
      # no exact match: keep the full result for later manual inspection
      hold[[institution]] = res
    } else {
      # one or more exact matches: keep the first (most relevant) one
      hold[[institution]] = res[idx, ][1, ]
    }
  }
  # combine the fetched data and harmonize this data
  out = bind_rows(hold) |>
    distinct(institution_id, .keep_all = TRUE) |>
    select(institution_name, institution_id) |>
    mutate(
      institution_url = institution_id,
      institution_id = str_remove(institution_id, 'https://openalex.org/')
    )
  return(out)
}
add_institution_id()
Attach OpenAlex institution IDs to normalized university records.
Arguments
df
Tibble or data frame containing at least uid, university, and university_name, where university_name holds cleaned institution labels and university stores the original or raw affiliation.
Method
The function first extracts all unique, non-missing values of university_name and passes them to oa_fetch_institutions() to obtain a disambiguated lookup table of OpenAlex institutions, including institution_name, institution_id, and institution_url. It then renames university_name to institution_name and university to institution in df so that the columns align with the lookup, and performs a left_join() on institution_name. After the join, it filters out any rows where institution_id is missing (i.e., cases where no OpenAlex match was found) and removes duplicate (uid, institution_id) combinations to ensure each person–institution pair is unique. The resulting tibble retains the original information along with harmonized institution identifiers suitable for linkage to OpenAlex metadata.
add_institution_id = function(df){
  # Attach OpenAlex institution IDs to normalized university records.
  #
  # Args:
  #   df: tibble with at least uid, university and university_name.
  # Returns:
  #   Tibble with harmonized institution columns; rows without an OpenAlex
  #   match are dropped and (uid, institution_id) pairs are unique.

  # create a vector of all the unique, non-missing university names
  institutions = df$university_name |> unique() |> na.omit()
  institutions = oa_fetch_institutions(institutions)
  # merge the institutions data with df and harmonize the data
  out = df |>
    rename(
      "institution_name" = "university_name",
      "institution" = "university"
    ) |>
    # explicit key: the bare left_join() guessed the key from common column
    # names, which happened to be institution_name — now stated directly
    left_join(institutions, by = join_by(institution_name)) |>
    filter(!is.na(institution_id)) |>
    distinct(uid, institution_id, .keep_all=TRUE)
  return(out)
}
normalize_name()
Normalize personal or institutional names for robust matching.
Arguments
x
Character vector (or coercible to character) with raw names or labels.
Method
The function first coerces x to character. It then detects which elements contain Cyrillic characters using stri_detect_charclass(). For those elements only, it transliterates Cyrillic to Latin script via stri_trans_general(x, "Cyrillic-Latin"), preserving the others as-is. Next, it removes accents/diacritics from all entries using "Latin-ASCII", converts the result to lowercase, and collapses multiple spaces into a single space with str_squish(). Finally, leading and trailing whitespace is removed using trimws(). The output is a character vector of normalized, ASCII-only, lowercase names that are suitable for fuzzy matching and string distance calculations across languages and scripts.
normalize_name = function(x) {
  # Normalize names to lowercase ASCII for fuzzy matching: transliterate
  # Cyrillic to Latin, strip diacritics, lowercase, and squish whitespace.
  #
  # Args:
  #   x: character vector (or coercible to character) of raw names.
  # Returns:
  #   Character vector of normalized names (NA inputs stay NA).
  x = as.character(x)
  # mark Cyrillic entries; coerce NA detections (from NA inputs) to FALSE,
  # since a logical subscript containing NA errors on assignment
  has_cyr = stri_detect_charclass(x, "\\p{Script=Cyrillic}")
  has_cyr = !is.na(has_cyr) & has_cyr
  # transliterate only Cyrillic -> Latin
  x[has_cyr] = stri_trans_general(x[has_cyr], "Cyrillic-Latin")
  # strip accents for all, lowercase, collapse internal whitespace
  x = stri_trans_general(x, "Latin-ASCII")
  x = tolower(x)
  # str_squish() already removes leading/trailing whitespace as well, so the
  # previous trailing trimws() call was redundant and has been dropped
  str_squish(x)
}
add_query_similarity()
Compute a Jaro–Winkler similarity score between OpenAlex display names and query names.
Arguments
data
Tibble or data frame containing at least
`display_name` (e.g., the OpenAlex entity label) and `query_name` (the original search string).
Method
The function creates two temporary normalized versions of the names: dn as the normalized display_name and qn as the normalized query_name, both produced by normalize_name(). It then calculates the Jaro–Winkler string distance between dn and qn using stringdist(..., method = "jw") and converts this distance to a similarity measure via 1 - distance. The resulting value is stored in a new column query_similarity, which ranges from 0 (completely dissimilar) to 1 (identical after normalization). Finally, the temporary helper columns dn and qn are dropped, and the augmented data frame is returned with an interpretable similarity metric that can be used for ranking or filtering matches.
unnest_institutions()
Unnest and clean OpenAlex institution data per author in a nested list of tibbles
Arguments
data
Named list of tibbles, where each element corresponds to a
`uid` and contains at least `uid`, `author_id`, and `last_known_institutions` (a list-column of data frames with OpenAlex institution info).
Method
The function iterates over each element of the input list data, treating the list names as author uids. For every element, it first checks whether the object is a non-empty tibble; invalid entries are skipped. For valid tibbles, it filters rows to the current uid, removes duplicate (uid, author_id) combinations, and then processes the last_known_institutions list-column. Within each element of last_known_institutions, it keeps unique rows, derives a compact institution_id by stripping the "https://openalex.org/" prefix from id, and creates a readable institution_name from display_name, retaining id, institution_id, institution_name, and country_code. In addition, it constructs an institution_ids column that stores, per row, the unique set of associated institution IDs as a character vector. The modified tibble is written back into the corresponding position in the hold list. Finally, the function returns the updated list, preserving its original structure but with institution information consistently unpacked and enriched.
clean_last_known_institutions <- function(tab){
  # Ensure every row of `tab` carries a well-formed last_known_institutions
  # list-column: a missing column, or NULL/empty/NA entries, are replaced by
  # an empty tibble with the expected OpenAlex institution schema.
  #
  # Args:
  #   tab: tibble of author records (possibly without the column).
  # Returns:
  #   The same tibble with a normalized last_known_institutions list-column.
  empty_inst <- tibble(
    id = character(),
    display_name = character(),
    country_code = character()
  )
  # 1. If the column is not present, create it (one empty tibble per row)
  if (!"last_known_institutions" %in% names(tab)) {
    tab <- tab |>
      mutate(last_known_institutions = replicate(
        n(), empty_inst, simplify = FALSE))
    # keep the column in its conventional position when `topics` exists;
    # without this guard, relocate() errored on tables lacking that column
    if ("topics" %in% names(tab)) {
      tab <- tab |> relocate(last_known_institutions, .before = topics)
    }
  }
  # 2. If entries are missing/NULL/NA/list(NA), replace them with empty_inst
  tab <- tab |>
    mutate(
      last_known_institutions = map(last_known_institutions, \(x) {
        if (is.null(x) || length(x) == 0 || (length(x) == 1 && all(is.na(x)))) {
          empty_inst
        } else {
          x
        }
      })
    )
  return(tab)
}
unnest_institutions = function(data){
  # Unpack OpenAlex institution info per author in a named list of tibbles.
  # List names are treated as author uids; each valid element has its
  # last_known_institutions list-column cleaned and enriched, and gains an
  # institution_ids list-column with the unique compact ids per row.
  # Invalid elements (NULL, non-tibble, zero rows) are left untouched.
  hold = data
  for (id in names(data)) {
    # extract information for each author and skip if invalid
    tab = data[[id]]
    if (is.null(tab) || !is_tibble(tab) || !(nrow(tab) > 0)) next
    # clean authors data and unnests institution information
    hold[[id]] = tab |>
      filter(uid == id) |>
      distinct(uid, author_id, .keep_all=TRUE) |>
      clean_last_known_institutions() |>
      mutate(
        # unpacks the dataframes found in last_known_institutions
        last_known_institutions = map(
          last_known_institutions,
          ~ .x |>
            distinct(.keep_all = TRUE) |>
            # NOTE(review): drop_na() removes rows with NA in ANY column,
            # so e.g. a missing country_code drops the whole institution —
            # confirm this is intended rather than drop_na(id)
            drop_na() |>
            mutate(
              # `id` below is the column of .x (dplyr data masking), not
              # the loop variable `id` from the enclosing for-loop
              institution_id = str_remove(id, "^https://openalex.org/"),
              institution_name = display_name
            ) |>
            select(id, institution_id, institution_name, country_code)
        ),
        # unnests the institution_ids
        institution_ids = map(
          last_known_institutions,
          ~ .x |> pull(institution_id) |> unique()
        )
      )
  }
  return(hold)
}match_institutions()
Filter OpenAlex author candidates by institutional match and name similarity
Arguments
data
Named list of tibbles with OpenAlex author candidates per
`uid`, typically already enriched with `institution_ids`, `query_name`, `display_name`, `query_similarity`, and `orcid`.
df
Tibble or data frame containing at least
`uid` and `institution_id`, representing the verified institution mappings from your own data.
Method
For each element in data, the function first checks that the entry is a non-empty tibble and skips invalid ones. It then looks up all unique institution_id values in df for the current uid, skipping authors without any known institutions. For valid entries, it augments the candidate table by computing institution_match, which is TRUE when at least one of the candidate’s institution_ids overlaps with the known institution IDs for that uid, and institution_dutch, which is TRUE when any of the candidate’s last known institutions has country code "NL". It then defines keep using a staged rule: candidates with query_similarity == 1.0 are always kept; candidates without a Dutch institution are dropped; otherwise, candidates are kept if they have high similarity without an institution match (query_similarity ≥ 0.60) or moderate similarity with an institution match (query_similarity ≥ 0.45). In a second pass over keep, the function applies a series of hard-coded exceptions that force specific problematic display_name and query_name combinations to be dropped, even if they otherwise pass the thresholds. The orcid column is coerced to character for consistency, and finally only rows with keep == TRUE are retained. The filtered tibble is stored back in the corresponding list element, and the updated list of disambiguated author matches is returned.
match_institutions = function(data, df){
  # Filter OpenAlex author candidates per uid by institutional overlap and
  # name similarity.
  #
  # Args:
  #   data: named list of candidate tibbles per uid, with institution_ids,
  #         last_known_institutions, query_name, display_name,
  #         query_similarity and orcid columns.
  #   df:   tibble with verified uid -> institution_id mappings.
  # Returns:
  #   The input list with each valid element filtered to kept candidates.
  hold = data
  for(id in names(data)){
    # extract information for each author and skip if invalid
    tab = data[[id]]
    if (is.null(tab) || !is_tibble(tab) || !(nrow(tab) > 0)) next
    # create a list of institution_ids per author and skip if invalid
    institutions = df |>
      filter(uid == id) |>
      pull(institution_id) |>
      unique()
    if ((length(institutions) < 1)) next
    # flag institution/country matches, apply the keep rules, then filter
    hold[[id]] = tab |>
      mutate(
        # indicates whether there is a match in institution_id between OpenAlex
        # and the original dataframe
        institution_match = map_lgl(
          institution_ids,
          ~ {x = .x; if (is.null(x)) x = character(0)
          any(as.character(x) %in% institutions)}
        ),
        # TRUE when any last known institution is located in the Netherlands
        institution_dutch = map_lgl(
          last_known_institutions,
          ~ {
            x = .x; if (is.null(x)) x = character(0)
            any(x$country_code == 'NL')
          }
        ),
        # staged keep rule: perfect name matches always pass; otherwise a
        # Dutch institution is required, with a lower similarity threshold
        # when the institution also matches
        keep = case_when(
          query_similarity == 1.0 ~ TRUE,
          institution_dutch == FALSE ~ FALSE,
          !institution_match & query_similarity >= 0.60 ~ TRUE,
          institution_match & query_similarity >= 0.45 ~ TRUE,
          .default = FALSE
        ),
        # hard-coded exceptions: known-bad matches are always dropped, even
        # when they pass the thresholds above
        keep = case_when(
          display_name == 'Thijs W. de Vos' ~ FALSE,
          display_name == 'Kees van Kersbergen' ~ FALSE,
          display_name == 'René W. van der Hulst' ~ FALSE,
          display_name == 'Younes Zoughlami' ~ FALSE,
          display_name == 'Younes Zeboudj' ~ FALSE,
          display_name == 'Younes Saramifar' ~ FALSE,
          display_name == 'van Dijk' ~ FALSE,
          display_name == 'Stephanie Maas' ~ FALSE,
          display_name == 'A.J.J. Nijhuis' ~ FALSE,
          (query_name == 'Meindert Fennema')
          & (display_name == 'Wouter van der Brug') ~ FALSE,
          .default = keep
        ),
        orcid = as.character(orcid)
      ) |>
      filter(keep)
  }
  return(hold)
}
oa_fetch_works = function(data, pause = 0){
  # Fetch OpenAlex works for each author's candidate ids, with an on-disk
  # cache so already-processed uids are skipped across runs.
  #
  # Args:
  #   data:  named list of candidate tibbles per uid.
  #   pause: seconds to sleep between API calls (default 0).
  # Returns:
  #   Named list (the cache) of works tibbles per uid; a NULL entry marks a
  #   uid that had no valid author ids.
  # configure iterator and other looping variables
  k = length(data)
  cache_path = file.path('data', '_cache', 'oaworks.Rds')
  cache = load_cache(cache_path)
  # get new APIKEY
  dotenv::load_dot_env()
  APIKEY <- Sys.getenv("OPENALEX_API_KEY2")
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping works", total = k, clear = FALSE)
  for (id in names(data)){
    cli_progress_update()
    if (id %in% names(cache)) next
    # throttle requests when a pause is set (the pause argument was
    # previously accepted but never used)
    if (pause > 0) Sys.sleep(pause)
    # extract information for each author and skip if invalid
    tab = data[[id]]
    # extract a list of author ids;
    # if more than 10 ids are present, limit to the 10 most relevant.
    # BUG FIX: arrange(relevance_score) sorted ascending, so head(10) kept
    # the LEAST relevant ids — sort descending instead.
    oa_ids = tab |>
      mutate(author_id = str_remove(author_id, 'https://openalex.org/')) |>
      arrange(desc(relevance_score)) |>
      distinct(author_id) |> drop_na() |>
      head(10) |> pull(author_id) |> unique()
    # if the scholar has no valid oa_ids then skip this scholar.
    # `cache[id] = list(NULL)` stores a named NULL entry; the previous
    # `cache[[id]] = NULL` DELETED the entry, so this deterministic skip was
    # never cached and the uid was re-processed on every run.
    if (any(is.null(oa_ids), length(oa_ids) == 0)) {
      cache[id] = list(NULL)
      next
    }
    # fetch works where the author was part of the authorship
    res = tryCatch(
      oa_fetch(
        entity = 'works',
        authorships.author.id = oa_ids,
        api_key = APIKEY
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    # if there is a valid result then extract author_id from authorships
    if (all(!is.null(res), is_tibble(res), nrow(res) > 0)){
      res = res |>
        rename(work_id = id) |>
        mutate(authors = authorships) |>
        # expand the data table with a new row per author
        unnest(authors, keep_empty = TRUE, names_sep = "__") |>
        mutate(uid = id, author_id = authors__id, .before = work_id) |>
        mutate(across(
          c(work_id, author_id),
          ~ str_remove(as.character(.x), "^https://openalex.org/")
        )) |>
        # select the rows where the author_id is in the selection
        filter(author_id %in% oa_ids) |>
        select(-starts_with('authors__')) |>
        # drop duplicates (very rare)
        distinct(uid, work_id, .keep_all = TRUE)
    }
    # when res is NULL (failed fetch) this assignment is a no-op, so the
    # transient failure is NOT cached and will be retried on the next run
    cache[[id]] = res
  }
  saveRDS(cache, cache_path)
  return(cache)
}
Application
load_cache <- function(path) {
  # Read a cached named list from an .Rds file; any failure (missing file,
  # read error/warning, unexpected structure) yields a fresh empty list so
  # callers can always treat the result as a valid cache.
  #
  # Args:
  #   path: file path of the .Rds cache.
  # Returns:
  #   A (possibly empty) named list.
  if (!file.exists(path)) return(list())
  cache <- tryCatch(
    readRDS(path),
    error = function(e) list(),
    warning = function(w) list()
  )
  # Validate expected structure (named list)
  if (!is.list(cache)) return(list())
  if (is.null(names(cache))) names(cache) <- character(0)
  cache
}
oa_fetch_institutions = function(institutions, pause = 0){
  # Cached variant: fetch one OpenAlex record per institution name, keep the
  # first (most relevant) hit, and persist results across runs.
  #
  # Args:
  #   institutions: character vector of institution names.
  #   pause: seconds to sleep between API calls (default 0).
  # Returns:
  #   Tibble with institution_name (the query) and institution_id (the full
  #   OpenAlex URL id).
  k = length(institutions)
  cache_path = file.path('data', '_cache', 'oainstitutions.Rds')
  cache <- load_cache(cache_path)
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping institutions", total = k, clear = FALSE)
  # 1) iterate over all institutions
  for (institution in institutions){
    # skip institution if cached
    if(institution %in% names(cache)) next
    # throttle requests when a pause is set (the pause argument was
    # previously accepted but never used)
    if (pause > 0) Sys.sleep(pause)
    # 2) fetch result for this institution; errors/warnings become NULL
    res = tryCatch(
      oa_fetch(
        entity = "institutions", search = institution, api_key = APIKEY
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    cli_progress_update()
    if(is_tibble(res)) {
      res = res |>
        mutate(institution_name = institution) |>
        relocate(institution_name, .before = id) |>
        # keep only the first (most relevant) row; slice(1) states this
        # intent explicitly, where first() relied on vctrs row-slicing of
        # data frames
        slice(1)
    }
    # when res is NULL this assignment is a no-op, so failed fetches are
    # not cached and will be retried on the next run
    cache[[institution]] = res
  }
  # write cache
  saveRDS(cache, cache_path)
  out = cache |>
    bind_rows() |>
    select(institution_name, id) |>
    rename(institution_id = id)
  return(out)
}
add_institution_id = function(df){
  # Cached variant: attach OpenAlex institution ids to df via the cached
  # institution lookup, keeping one row per (clean_name, institution_id).
  #
  # Args:
  #   df: tibble with at least clean_name and university_name.
  # Returns:
  #   Tibble of author rows with an institution_id column, restricted to
  #   universities that were successfully resolved in OpenAlex.

  # fetch institution ids for each university
  institutions = df$university_name |> unique() |> na.omit()
  results = oa_fetch_institutions(institutions)
  # create author table with institution_ids
  authors = df |>
    left_join(results, by = join_by(university_name == institution_name)) |>
    distinct(clean_name, institution_id, .keep_all = TRUE) |>
    filter(university_name %in% results$institution_name)
  # explicit return: the bare assignment above returned the value only
  # invisibly, which hid the result in interactive use
  return(authors)
}