Data Preparation OpenAlex-ID
Getting Started
Functions
normalize_university_names()
Normalize and standardize university labels to English and native-name variants.
Arguments
df
Tibble or data frame containing at least
`uid`, `naam`, `clean_name`, and a `university` column (as a list-column of one or more raw affiliation strings per row).
Method
The function first defines a lookup list that maps patterns in the raw university field (e.g., “VU”, “UvA”, “Leiden”, “RUG”, “Gothenburg”) to a pair of names: an English label and a native-language label. Some entries only specify the native name, and some (e.g., “Politie”, “UvH”) are explicitly mapped to NA.
It then standardizes the structure of the input by collapsing the university list-column to a single character string per row (taking the first element when multiple are present), normalizing occurrences of “Uni”/“uni” to “University”, and setting two working columns: university_name (default English name, initialized as the original university) and university_name2 (default native name, initialized to NA).
Next, it iterates over each regex pattern in the lookup. For rows where the pattern matches university, it overwrites university_name with the English label (if provided and still equal to the original value) and fills university_name2 with the native label when it is missing.
Finally, the two name columns are stacked with pivot_longer(), yielding a long format with one row per (uid, naam, clean_name, university, university_name) combination (where university_name can be either the English or native variant, or NA for explicitly excluded cases). The function returns this tidy table, which can be used downstream to choose a preferred representation or to handle multilingual labeling of university affiliations.
normalize_university_names = function(df){
  # Normalize raw affiliation strings to a standardized English label plus a
  # native-language label, then stack both variants into long format.
  #
  # Args:
  #   df: tibble/data frame with at least uid, naam, clean_name, and a
  #       `university` list-column of raw affiliation strings per row.
  # Returns:
  #   Long-format tibble with one row per (uid, naam, clean_name, university,
  #   university_name) combination; university_name is the English or native
  #   variant, or NA for explicitly excluded affiliations.

  # 1. Pattern → c(english, native) (use ^...$ to force exact matches)
  lookup = list(
    "VU" = c("Free University Amsterdam",
             "Vrije Universiteit Amsterdam"),
    "(UvA|Uva)" = c("University of Amsterdam",
                    "Universiteit van Amsterdam"),
    "Leiden" = c("Leiden University",
                 "Leiden Universiteit"),
    # this pattern matches with “RU”, “(RU)”, “RU,”, etc., and excludes “RUG”.
    "\\bRU\\b" = c("Radboud University Nijmegen",
                   "Radboud Universiteit Nijmegen"),
    "(UU|UCU)" = c("Utrecht University",
                   "Universiteit Utrecht"),
    "^EUR$" = c("Erasmus University Rotterdam",
                "Erasmus Universiteit Rotterdam"),
    "(RUG|UvG)" = c("University of Groningen",
                    "Rijksuniversiteit Groningen"),
    "(UvT|Uvt|Tilburg)" = c("University of Tilburg",
                            "Universiteit van Tilburg"),
    "WUR" = c("Wageningen University & Research",
              "Wageningen University & Research"),
    "^TU Delft$" = c("Technical University Delft",
                     "Technische Universiteit Delft"),
    "Milano" = c("University of Milan",
                 "Università degli Studi di Milano Statale"),
    # NOTE(review): "Berlijn" (Dutch for Berlin) maps to Hamburg — looks like
    # a dataset-specific correction; confirm this is intentional.
    "Berlijn" = c("University of Hamburg",
                  "Universität Hamburg"),
    "Trento" = c("University of Trento",
                 "Università degli Studi di Trento"),
    "Stockholm" = c("Stockholm University",
                    "Stockholms Universitet"),
    "Gent" = c("Ghent University",
               "Universiteit Gent"),
    # Only native names (leave EN as-is)
    "Gothenburg" = c(NA, "Göteborgs universitet"),
    "Cologne" = c(NA, "Universität zu Köln"),
    "Koc" = c(NA, "Koç Üniversitesi"),
    "Turku" = c(NA, "Turun Yliopisto"),
    "Lausane" = c(NA, "Université de Lausanne"),
    "Leipzig" = c(NA, "Universität Leipzig"),
    "Linköping" = c(NA, "Linköpings universitet"),
    # Explicit NAs
    "^Politie$" = c(NA, NA),
    "^UvH$" = c(NA, NA)
  )
  # 2. set the default values for university_names
  df = df |>
    mutate(
      # collapse the list-column: keep the first affiliation string per row
      university = map_chr(university, ~ {
        if (is.null(.x) || length(.x) == 0) return(NA_character_)
        as.character(.x[[1]])
      }),
      # NOTE(review): the "uni" alternative (no trailing space) also matches
      # inside words, e.g. a lowercase "university" would become
      # "University versity" — confirm the input never contains it.
      university = str_replace(university, "(Uni |uni)", "University "),
      university_name = university,    # default EN → original
      university_name2 = NA_character_ # default native → NA
    )
  # 3. replace the university name when the pattern is detected in university
  for (pat in names(lookup)) {
    vals = lookup[[pat]]
    idx = str_detect(df$university, pat)
    # which() drops the NAs produced by str_detect(NA, ...) and by NA
    # comparisons; a logical subscript containing NA errors on assignment.
    if (!is.na(vals[1])) {
      rows_en = which(idx & df$university_name == df$university)
      df$university_name[rows_en] = vals[1]
    }
    if (!is.na(vals[2])) {
      rows_native = which(idx & is.na(df$university_name2))
      df$university_name2[rows_native] = vals[2]
    }
  }
  # 4. stack the values of the two university_name variables
  df = df |>
    pivot_longer(
      cols = c(university_name, university_name2),
      names_to = "var",
      values_to = "university_name"
    ) |>
    select(uid, naam, clean_name, university, university_name)
  return(df)
}
oa_fetch_institutions()
Fetch and disambiguate OpenAlex institution records for a vector of institution names.
Arguments
institutions
Character vector of institution names to look up in OpenAlex (e.g., “Utrecht University”, “Radboud University Nijmegen”).
pause
Numeric number of seconds to sleep between successive API calls (default 1). Use this to throttle requests when scraping many institutions.
Method
The function starts by reporting the current time with cli_alert() and initializing a progress bar via cli_progress_bar(). It then loops over all elements of institutions, optionally sleeping pause seconds before each request. For every institution name, it calls oa_fetch(entity = "institutions", search = institution) inside tryCatch(), converting any errors or warnings to NULL so that failures do not interrupt the loop.
If the result is a non-null tibble, it first renames the OpenAlex columns id and display_name to institution_id and institution_name. For single-row results, that row is stored directly in hold[[institution]]. For multi-row results, it constructs a regular expression ^<institution>$ and checks for exact matches on institution_name. When there are no exact matches, the full result is kept (for later manual inspection); when there is exactly one match, only that row is kept; when there are multiple exact matches, the first matching row is retained as the most relevant hit. The progress bar is updated on each iteration.
After processing all names, the function binds all stored tibbles with bind_rows(), removes duplicate institutions using distinct(institution_id, .keep_all = TRUE), and keeps only institution_name and institution_id. It then adds an institution_url column equal to the original OpenAlex URL and converts institution_id to a compact identifier by stripping the "https://openalex.org/" prefix. The function returns this tidy tibble of unique, disambiguated OpenAlex institution records.
oa_fetch_institutions = function(institutions, pause = 1, api_key = APIKEY){
  # Fetch OpenAlex institution records for a vector of institution names and
  # disambiguate multi-hit results by exact display-name match.
  #
  # Args:
  #   institutions: character vector of institution names to search for.
  #   pause: seconds to sleep between successive API calls (default 1).
  #   api_key: OpenAlex API key (defaults to the global APIKEY, preserving
  #            the previous behavior).
  # Returns:
  #   Tibble of unique institutions with institution_name, institution_id
  #   (compact id, prefix stripped) and institution_url (full OpenAlex URL).
  k = length(institutions)
  hold = list()
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping institutions", total = k, clear = FALSE)
  # 1. iterate over all institutions
  for (institution in institutions){
    # sleep if a pause time is set (throttles API requests)
    if (pause > 0) Sys.sleep(pause)
    # 2. fetch result for this institution; errors/warnings become NULL so a
    #    single failed lookup does not abort the loop
    res = tryCatch(
      oa_fetch(
        entity = "institutions", search = institution, api_key = api_key
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    cli_progress_update()
    # 3. for cases with a valid result rename columns
    if (is.null(res) || !is_tibble(res)) next
    res = res |> rename(
      "institution_id" = "id",
      "institution_name" = "display_name"
    )
    # 4.1. single result hits are stored directly
    if (nrow(res) == 1L) {
      hold[[institution]] = res
      next
    }
    # 4.2. multiple hits: keep rows whose display name matches the query
    #      exactly. Plain string equality replaces the previous regex
    #      (paste0("^", institution, "$")), which broke on names containing
    #      regex metacharacters such as "&", "(" or ")".
    idx = !is.na(res$institution_name) & res$institution_name == institution
    if (sum(idx) == 0L) {
      # no exact match: keep the full result for later manual inspection
      hold[[institution]] = res
    } else {
      # one or more exact matches: keep the first (most relevant) one
      hold[[institution]] = res[idx, ][1, ]
    }
  }
  # combine the fetched data and harmonize this data
  out = bind_rows(hold) |>
    distinct(institution_id, .keep_all = TRUE) |>
    select(institution_name, institution_id) |>
    mutate(
      institution_url = institution_id,
      institution_id = str_remove(institution_id, 'https://openalex.org/')
    )
  return(out)
}
add_institution_id()
Attach OpenAlex institution IDs to normalized university records.
Arguments
df
Tibble or data frame containing at least uid, university, and university_name, where university_name holds cleaned institution labels and university stores the original or raw affiliation.
Method
The function first extracts all unique, non-missing values of university_name and passes them to oa_fetch_institutions() to obtain a disambiguated lookup table of OpenAlex institutions, including institution_name, institution_id, and institution_url. It then renames university_name to institution_name and university to institution in df so that the columns align with the lookup, and performs a left_join() on institution_name. After the join, it filters out any rows where institution_id is missing (i.e., cases where no OpenAlex match was found) and removes duplicate (uid, institution_id) combinations to ensure each person–institution pair is unique. The resulting tibble retains the original information along with harmonized institution identifiers suitable for linkage to OpenAlex metadata.
add_institution_id = function(df){
  # Attach OpenAlex institution IDs to normalized university records.
  #
  # Args:
  #   df: tibble with at least uid, university and university_name.
  # Returns:
  #   Tibble with harmonized institution columns; rows without an OpenAlex
  #   match are dropped and (uid, institution_id) pairs are unique.

  # create a vector of all the unique, non-missing university names
  institutions = df$university_name |> unique() |> na.omit()
  institutions = oa_fetch_institutions(institutions)
  # merge the institutions data with df and harmonize the data
  out = df |>
    rename(
      "institution_name" = "university_name",
      "institution" = "university"
    ) |>
    # explicit key: the bare left_join() guessed the key from common column
    # names, which happened to be institution_name — now stated directly
    left_join(institutions, by = join_by(institution_name)) |>
    filter(!is.na(institution_id)) |>
    distinct(uid, institution_id, .keep_all=TRUE)
  return(out)
}
normalize_name()
Normalize personal or institutional names for robust matching.
Arguments
x
Character vector (or coercible to character) with raw names or labels.
Method
The function first coerces x to character. It then detects which elements contain Cyrillic characters using stri_detect_charclass(). For those elements only, it transliterates Cyrillic to Latin script via stri_trans_general(x, "Cyrillic-Latin"), preserving the others as-is. Next, it removes accents/diacritics from all entries using "Latin-ASCII", converts the result to lowercase, and collapses multiple spaces into a single space with str_squish(). Finally, leading and trailing whitespace is removed using trimws(). The output is a character vector of normalized, ASCII-only, lowercase names that are suitable for fuzzy matching and string distance calculations across languages and scripts.
normalize_name = function(x) {
  # Normalize names to lowercase ASCII for fuzzy matching: transliterate
  # Cyrillic to Latin, strip diacritics, lowercase, and squish whitespace.
  #
  # Args:
  #   x: character vector (or coercible to character) of raw names.
  # Returns:
  #   Character vector of normalized names (NA inputs stay NA).
  x = as.character(x)
  # mark Cyrillic entries; coerce NA detections (from NA inputs) to FALSE,
  # since a logical subscript containing NA errors on assignment
  has_cyr = stri_detect_charclass(x, "\\p{Script=Cyrillic}")
  has_cyr = !is.na(has_cyr) & has_cyr
  # transliterate only Cyrillic -> Latin
  x[has_cyr] = stri_trans_general(x[has_cyr], "Cyrillic-Latin")
  # strip accents for all, lowercase, collapse internal whitespace
  x = stri_trans_general(x, "Latin-ASCII")
  x = tolower(x)
  # str_squish() already removes leading/trailing whitespace as well, so the
  # previous trailing trimws() call was redundant and has been dropped
  str_squish(x)
}
add_query_similarity()
Compute a Jaro–Winkler similarity score between OpenAlex display names and query names.
Arguments
data
Tibble or data frame containing at least
`display_name` (e.g., the OpenAlex entity label) and `query_name` (the original search string).
Method
The function creates two temporary normalized versions of the names: dn as the normalized display_name and qn as the normalized query_name, both produced by normalize_name(). It then calculates the Jaro–Winkler string distance between dn and qn using stringdist(..., method = "jw") and converts this distance to a similarity measure via 1 - distance. The resulting value is stored in a new column query_similarity, which ranges from 0 (completely dissimilar) to 1 (identical after normalization). Finally, the temporary helper columns dn and qn are dropped, and the augmented data frame is returned with an interpretable similarity metric that can be used for ranking or filtering matches.
unnest_institutions()
Unnest and clean OpenAlex institution data per author in a nested list of tibbles
Arguments
data
Named list of tibbles, where each element corresponds to a
`uid` and contains at least `uid`, `author_id`, and `last_known_institutions` (a list-column of data frames with OpenAlex institution info).
Method
The function iterates over each element of the input list data, treating the list names as author uids. For every element, it first checks whether the object is a non-empty tibble; invalid entries are skipped. For valid tibbles, it filters rows to the current uid, removes duplicate (uid, author_id) combinations, and then processes the last_known_institutions list-column. Within each element of last_known_institutions, it keeps unique rows, derives a compact institution_id by stripping the "https://openalex.org/" prefix from id, and creates a readable institution_name from display_name, retaining id, institution_id, institution_name, and country_code. In addition, it constructs an institution_ids column that stores, per row, the unique set of associated institution IDs as a character vector. The modified tibble is written back into the corresponding position in the hold list. Finally, the function returns the updated list, preserving its original structure but with institution information consistently unpacked and enriched.
clean_last_known_institutions <- function(tab){
  # Ensure every row of `tab` carries a well-formed last_known_institutions
  # list-column: a missing column, or NULL/empty/NA entries, are replaced by
  # an empty tibble with the expected OpenAlex institution schema.
  #
  # Args:
  #   tab: tibble of author records (possibly without the column).
  # Returns:
  #   The same tibble with a normalized last_known_institutions list-column.
  empty_inst <- tibble(
    id = character(),
    display_name = character(),
    country_code = character()
  )
  # 1. If the column is not present, create it (one empty tibble per row)
  if (!"last_known_institutions" %in% names(tab)) {
    tab <- tab |>
      mutate(last_known_institutions = replicate(
        n(), empty_inst, simplify = FALSE))
    # keep the column in its conventional position when `topics` exists;
    # without this guard, relocate() errored on tables lacking that column
    if ("topics" %in% names(tab)) {
      tab <- tab |> relocate(last_known_institutions, .before = topics)
    }
  }
  # 2. If entries are missing/NULL/NA/list(NA), replace them with empty_inst
  tab <- tab |>
    mutate(
      last_known_institutions = map(last_known_institutions, \(x) {
        if (is.null(x) || length(x) == 0 || (length(x) == 1 && all(is.na(x)))) {
          empty_inst
        } else {
          x
        }
      })
    )
  return(tab)
}
unnest_institutions = function(data){
  # Unpack OpenAlex institution info per author in a named list of tibbles.
  # List names are treated as author uids; each valid element has its
  # last_known_institutions list-column cleaned and enriched, and gains an
  # institution_ids list-column with the unique compact ids per row.
  # Invalid elements (NULL, non-tibble, zero rows) are left untouched.
  hold = data
  for (id in names(data)) {
    # extract information for each author and skip if invalid
    tab = data[[id]]
    if (is.null(tab) || !is_tibble(tab) || !(nrow(tab) > 0)) next
    # clean authors data and unnests institution information
    hold[[id]] = tab |>
      filter(uid == id) |>
      distinct(uid, author_id, .keep_all=TRUE) |>
      clean_last_known_institutions() |>
      mutate(
        # unpacks the dataframes found in last_known_institutions
        last_known_institutions = map(
          last_known_institutions,
          ~ .x |>
            distinct(.keep_all = TRUE) |>
            # NOTE(review): drop_na() removes rows with NA in ANY column,
            # so e.g. a missing country_code drops the whole institution —
            # confirm this is intended rather than drop_na(id)
            drop_na() |>
            mutate(
              # `id` below is the column of .x (dplyr data masking), not
              # the loop variable `id` from the enclosing for-loop
              institution_id = str_remove(id, "^https://openalex.org/"),
              institution_name = display_name
            ) |>
            select(id, institution_id, institution_name, country_code)
        ),
        # unnests the institution_ids
        institution_ids = map(
          last_known_institutions,
          ~ .x |> pull(institution_id) |> unique()
        )
      )
  }
  return(hold)
}match_institutions()
Filter OpenAlex author candidates by institutional match and name similarity
Arguments
data
Named list of tibbles with OpenAlex author candidates per
`uid`, typically already enriched with `institution_ids`, `query_name`, `display_name`, `query_similarity`, and `orcid`.
df
Tibble or data frame containing at least
`uid` and `institution_id`, representing the verified institution mappings from your own data.
Method
For each element in data, the function first checks that the entry is a non-empty tibble and skips invalid ones. It then looks up all unique institution_id values in df for the current uid, skipping authors without any known institutions. For valid entries, it augments the candidate table by computing institution_match, which is TRUE when at least one of the candidate’s institution_ids overlaps with the known institution IDs for that uid, and institution_dutch, which is TRUE when any of the candidate’s last known institutions has country code "NL". It then defines keep using a staged rule: candidates with query_similarity == 1.0 are always kept; candidates without a Dutch institution are dropped; otherwise, candidates are kept if they have high similarity without an institution match (query_similarity ≥ 0.60) or moderate similarity with an institution match (query_similarity ≥ 0.45). In a second pass over keep, the function applies a series of hard-coded exceptions that force specific problematic display_name and query_name combinations to be dropped, even if they otherwise pass the thresholds. The orcid column is coerced to character for consistency, and finally only rows with keep == TRUE are retained. The filtered tibble is stored back in the corresponding list element, and the updated list of disambiguated author matches is returned.
match_institutions = function(data, df){
  # Filter OpenAlex author candidates per uid by institutional overlap and
  # name similarity.
  #
  # Args:
  #   data: named list of candidate tibbles per uid, with institution_ids,
  #         last_known_institutions, query_name, display_name,
  #         query_similarity and orcid columns.
  #   df:   tibble with verified uid -> institution_id mappings.
  # Returns:
  #   The input list with each valid element filtered to kept candidates.
  hold = data
  for(id in names(data)){
    # extract information for each author and skip if invalid
    tab = data[[id]]
    if (is.null(tab) || !is_tibble(tab) || !(nrow(tab) > 0)) next
    # create a list of institution_ids per author and skip if invalid
    institutions = df |>
      filter(uid == id) |>
      pull(institution_id) |>
      unique()
    if ((length(institutions) < 1)) next
    # flag institution/country matches, apply the keep rules, then filter
    hold[[id]] = tab |>
      mutate(
        # indicates whether there is a match in institution_id between OpenAlex
        # and the original dataframe
        institution_match = map_lgl(
          institution_ids,
          ~ {x = .x; if (is.null(x)) x = character(0)
          any(as.character(x) %in% institutions)}
        ),
        # TRUE when any last known institution is located in the Netherlands
        institution_dutch = map_lgl(
          last_known_institutions,
          ~ {
            x = .x; if (is.null(x)) x = character(0)
            any(x$country_code == 'NL')
          }
        ),
        # staged keep rule: perfect name matches always pass; otherwise a
        # Dutch institution is required, with a lower similarity threshold
        # when the institution also matches
        keep = case_when(
          query_similarity == 1.0 ~ TRUE,
          institution_dutch == FALSE ~ FALSE,
          !institution_match & query_similarity >= 0.60 ~ TRUE,
          institution_match & query_similarity >= 0.45 ~ TRUE,
          .default = FALSE
        ),
        # hard-coded exceptions: known-bad matches are always dropped, even
        # when they pass the thresholds above
        keep = case_when(
          display_name == 'Thijs W. de Vos' ~ FALSE,
          display_name == 'Kees van Kersbergen' ~ FALSE,
          display_name == 'René W. van der Hulst' ~ FALSE,
          display_name == 'Younes Zoughlami' ~ FALSE,
          display_name == 'Younes Zeboudj' ~ FALSE,
          display_name == 'Younes Saramifar' ~ FALSE,
          display_name == 'van Dijk' ~ FALSE,
          display_name == 'Stephanie Maas' ~ FALSE,
          display_name == 'A.J.J. Nijhuis' ~ FALSE,
          (query_name == 'Meindert Fennema')
          & (display_name == 'Wouter van der Brug') ~ FALSE,
          .default = keep
        ),
        orcid = as.character(orcid)
      ) |>
      filter(keep)
  }
  return(hold)
}
oa_fetch_works = function(data, pause = 0){
  # Fetch OpenAlex works for each author's candidate ids, with an on-disk
  # cache so already-processed uids are skipped across runs.
  #
  # Args:
  #   data:  named list of candidate tibbles per uid.
  #   pause: seconds to sleep between API calls (default 0).
  # Returns:
  #   Named list (the cache) of works tibbles per uid; a NULL entry marks a
  #   uid that had no valid author ids.
  # configure iterator and other looping variables
  k = length(data)
  cache_path = file.path('data', '_cache', 'oaworks.Rds')
  cache = load_cache(cache_path)
  # get new APIKEY
  dotenv::load_dot_env()
  APIKEY <- Sys.getenv("OPENALEX_API_KEY2")
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping works", total = k, clear = FALSE)
  for (id in names(data)){
    cli_progress_update()
    if (id %in% names(cache)) next
    # throttle requests when a pause is set (the pause argument was
    # previously accepted but never used)
    if (pause > 0) Sys.sleep(pause)
    # extract information for each author and skip if invalid
    tab = data[[id]]
    # extract a list of author ids;
    # if more than 10 ids are present, limit to the 10 most relevant.
    # BUG FIX: arrange(relevance_score) sorted ascending, so head(10) kept
    # the LEAST relevant ids — sort descending instead.
    oa_ids = tab |>
      mutate(author_id = str_remove(author_id, 'https://openalex.org/')) |>
      arrange(desc(relevance_score)) |>
      distinct(author_id) |> drop_na() |>
      head(10) |> pull(author_id) |> unique()
    # if the scholar has no valid oa_ids then skip this scholar.
    # `cache[id] = list(NULL)` stores a named NULL entry; the previous
    # `cache[[id]] = NULL` DELETED the entry, so this deterministic skip was
    # never cached and the uid was re-processed on every run.
    if (any(is.null(oa_ids), length(oa_ids) == 0)) {
      cache[id] = list(NULL)
      next
    }
    # fetch works where the author was part of the authorship
    res = tryCatch(
      oa_fetch(
        entity = 'works',
        authorships.author.id = oa_ids,
        api_key = APIKEY
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    # if there is a valid result then extract author_id from authorships
    if (all(!is.null(res), is_tibble(res), nrow(res) > 0)){
      res = res |>
        rename(work_id = id) |>
        mutate(authors = authorships) |>
        # expand the data table with a new row per author
        unnest(authors, keep_empty = TRUE, names_sep = "__") |>
        mutate(uid = id, author_id = authors__id, .before = work_id) |>
        mutate(across(
          c(work_id, author_id),
          ~ str_remove(as.character(.x), "^https://openalex.org/")
        )) |>
        # select the rows where the author_id is in the selection
        filter(author_id %in% oa_ids) |>
        select(-starts_with('authors__')) |>
        # drop duplicates (very rare)
        distinct(uid, work_id, .keep_all = TRUE)
    }
    # when res is NULL (failed fetch) this assignment is a no-op, so the
    # transient failure is NOT cached and will be retried on the next run
    cache[[id]] = res
  }
  saveRDS(cache, cache_path)
  return(cache)
}
Application
load_cache <- function(path) {
  # Read a cached named list from an .Rds file; any failure (missing file,
  # read error/warning, unexpected structure) yields a fresh empty list so
  # callers can always treat the result as a valid cache.
  #
  # Args:
  #   path: file path of the .Rds cache.
  # Returns:
  #   A (possibly empty) named list.
  if (!file.exists(path)) return(list())
  cache <- tryCatch(
    readRDS(path),
    error = function(e) list(),
    warning = function(w) list()
  )
  # Validate expected structure (named list)
  if (!is.list(cache)) return(list())
  if (is.null(names(cache))) names(cache) <- character(0)
  cache
}
oa_fetch_institutions = function(institutions, pause = 0){
  # Cached variant: fetch one OpenAlex record per institution name, keep the
  # first (most relevant) hit, and persist results across runs.
  #
  # Args:
  #   institutions: character vector of institution names.
  #   pause: seconds to sleep between API calls (default 0).
  # Returns:
  #   Tibble with institution_name (the query) and institution_id (the full
  #   OpenAlex URL id).
  k = length(institutions)
  cache_path = file.path('data', '_cache', 'oainstitutions.Rds')
  cache <- load_cache(cache_path)
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping institutions", total = k, clear = FALSE)
  # 1) iterate over all institutions
  for (institution in institutions){
    # skip institution if cached
    if(institution %in% names(cache)) next
    # throttle requests when a pause is set (the pause argument was
    # previously accepted but never used)
    if (pause > 0) Sys.sleep(pause)
    # 2) fetch result for this institution; errors/warnings become NULL
    res = tryCatch(
      oa_fetch(
        entity = "institutions", search = institution, api_key = APIKEY
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
    cli_progress_update()
    if(is_tibble(res)) {
      res = res |>
        mutate(institution_name = institution) |>
        relocate(institution_name, .before = id) |>
        # keep only the first (most relevant) row; slice(1) states this
        # intent explicitly, where first() relied on vctrs row-slicing of
        # data frames
        slice(1)
    }
    # when res is NULL this assignment is a no-op, so failed fetches are
    # not cached and will be retried on the next run
    cache[[institution]] = res
  }
  # write cache
  saveRDS(cache, cache_path)
  out = cache |>
    bind_rows() |>
    select(institution_name, id) |>
    rename(institution_id = id)
  return(out)
}
add_institution_id = function(df){
  # Cached variant: attach OpenAlex institution ids to df via the cached
  # institution lookup, keeping one row per (clean_name, institution_id).
  #
  # Args:
  #   df: tibble with at least clean_name and university_name.
  # Returns:
  #   Tibble of author rows with an institution_id column, restricted to
  #   universities that were successfully resolved in OpenAlex.

  # fetch institution ids for each university
  institutions = df$university_name |> unique() |> na.omit()
  results = oa_fetch_institutions(institutions)
  # create author table with institution_ids
  authors = df |>
    left_join(results, by = join_by(university_name == institution_name)) |>
    distinct(clean_name, institution_id, .keep_all = TRUE) |>
    filter(university_name %in% results$institution_name)
  # explicit return: the bare assignment above returned the value only
  # invisibly, which hid the result in interactive use
  return(authors)
}