Data Preparation OpenAlex-ID

Published

October 21, 2025

Tip

rough first working code for the collection of scholar_ids using openalexr

I optimized the algorithm to not just select the top case, but allows for multiple ids (in rows) per person, in the decision rules i use: - semantic similarity - matches of university_id and institution_id

Getting Started

# clear the global environment
rm(list = ls())
gc()
source("src/utils/custom_functions.r")

# load and activate packages
fpackage.check(c(
  'tidyverse', 'readxl',  'stringr', 
  'lubridate', 'openalexR', 'rvest', 'jsonlite',
  'cli'
))

options(openalexR.mailto = "jos.slabbekoorn@ru.nl")
dir = file.path('data', 'processed')
file = list.files(dir, pattern = 'names.Rds')
names = freadRDS(file.path(dir, file))


file = list.files(dir, pattern = 'scholarid.Rds')
scholar = freadRDS(file.path(dir, file)) |>
  select(naam, university) |>
  distinct(.keep_all = TRUE) |>
  unnest_longer(university) 
  # |>
  # left_join(names) |>
  # unite(
  #   initials:last_name, 
  #   col=name, sep = ' ', 
  #   na.rm=TRUE, remove=FALSE) |>
  # unite(
  #   c(name, maiden_name), 
  #   col=name2, sep = '-', 
  #   na.rm=TRUE, remove=FALSE)
scholar = scholar |>
  mutate(
    university = university |>
      str_replace("(Uni |uni)", "University "),
    university_name = case_when(
      str_detect(university, "VU") ~ "Free University Amsterdam",
      str_detect(university, "(UvA|Uva)") ~ "University of Amsterdam",
      str_detect(university, "Leiden") ~ "Leiden University",
      university == 'RU' ~ "Radboud University Nijmegen",
      str_detect(university, "(UU|UCU)") ~ "Utrecht University",
      university == 'EUR' ~ "Erasmus University Rotterdam",
      str_detect(university, '(RUG|UvG)') ~ "University of Groningen",
      str_detect(university, '(UvT|Uvt|Tilburg)') ~ "University of Tilburg",
      str_detect(university, 'WUR') ~ "Wageningen University & Research",
      university == 'Uni Gothenburg' ~ "University of Gothenburg",
      university == 'TU Delft' ~ "Technical University Delft",
      str_detect(university, 'Milano') ~ 'University of Milan',
      # fix for mistake in affilation for Sabine Mokry,
      str_detect(university, 'Berlijn') ~ 'University of Hamburg',
      str_detect(university, 'Trento') ~ 'University of Trento',
      str_detect(university, 'Stockholm') ~ 'Stockholm University',
      str_detect(university, 'Gent') ~ "Universiteit Gent",
      university == "Politie" ~ NA_character_,
      university == "UvH" ~ NA_character_,
      .default = university
    ),
    university_name2 = case_when(
      str_detect(university, "VU") ~ "Vrije Universiteit Amsterdam",
      str_detect(university, "(UvA|Uva)") ~ "Universiteit van Amsterdam",
      str_detect(university, "Leiden") ~ "Leiden Universiteit",
      university == 'RU' ~ "Radboud Universiteit Nijmegen",
      str_detect(university, "(UU|UCU)") ~ "Universiteit Utrecht",
      university == 'EUR' ~ "Erasmus Universiteit Rotterdam",
      str_detect(university, '(RUG|UvG)') ~ "Rijksuniversiteit Groningen",
      str_detect(university, '(UvT|Uvt|Tilburg)') ~ "Universiteit van Tilburg",
      str_detect(university, 'WUR') ~ "Wageningen University & Research",
      university == 'Uni Gothenburg' ~ "Universiteit van Gothenburg",
      university == 'TU Delft' ~ "Technische Universiteit Delft",
      str_detect(university, 'Milano') ~ 'Università degli Studi di Milano Statale',
      # fix for mistake in affilation for Sabine Mokry,
      str_detect(university, 'Berlijn') ~ 'Universität Hamburg',
      str_detect(university, 'Trento') ~ 'Università degli Studi di Trento',
      str_detect(university, 'Cologne') ~ 'Universität zu Köln',
      str_detect(university, 'Stockholm') ~ 'Stockholms Universitet',
      str_detect(university, 'Bocconi') ~ "Università Bocconi",
      str_detect(university, 'Koc') ~ "Koç Üniversitesi",
      str_detect(university, 'Gent') ~ "Ghent University",
      str_detect(university, 'Gothenburg') ~ "Göteborgs universitet",
      str_detect(university, "Turku") ~ "Turun Yliopisto",
      str_detect(university, 'Lausane') ~ "Université de Lausanne",
      str_detect(university, "Leipzig") ~ "Universität Leipzig",
      str_detect(university, "Linköping") ~ "Linköpings universitet",
      .default = NA_character_
    )
  ) |>
  drop_na() |>
  pivot_longer(
    cols = university_name:university_name2, 
    names_to = "var",
    values_to = "university_name"
  ) |>
  select(-var)


institutions = scholar$university_name |> unique()
# scholar = scholar |>
#   mutate(
#     university_name = university |>
#       str_replace("(Uni |uni)", "University ") |>
#       str_replace('(UU)', 'Utrecht University') |>
#       str_replace('(UCU)', 'Utrecht University') |>
#       str_replace('(Leiden|Leiden University)', 'Leiden University') |>
#       str_replace('(RUG)', 'University of Groningen') |>
#       str_replace('(RU)', 'Radboud University Nijmegen') |>
#       str_replace('(UvA|Uva)', 'University of Amsterdam') |>
#       str_replace('(VU)', 'Free University Amsterdam') |>
#       str_replace('(EUR)', 'Erasmus University Rotterdam') |>
#       str_replace('(UvT|Tilburg|Uvt)', 'University of Tilburg') |>
#       str_replace('(TU Delft)', 'Technical University Delft') |>
#       str_replace('(WUR)', 'Wageningen University & Research') |>
#       str_replace('(Universita degli studi di Milano)', 'University of Milan') |>
#       str_replace('(Politie)', NA_character_) |>
#       str_replace('(UvH)', NA_character_) |>
#       str_replace('(UvG)', "University of Groningen") |>
#       # fixing mistake for Sabine Mokry
#       str_replace('University Berlijn', "University of Hamburg") |>
#       str_replace('Trento University', "University of Trento") |>
      
#   ) |>
#   filter(!is.na(university_name))

# # institutions = c(
# #   scholar$university_name |> unique(),
# # )

# institutions = scholar$university_name |> unique()
oa_fetch_institution = function(institution, pause=0){
  if (pause > 0) Sys.sleep(pause)
  oa_fetch(
    entity = "institutions",
    search=institution,
    mailto = "jos.slabbekoorn@ru.nl"
  )$id[1]
}

hold = c()
for (institution in institutions){
  id = oa_fetch_institution(institution)
  hold[[institution]] = tibble::tibble(
    university_name = institution,
    university_url = id, 
    university_id = str_remove(id, 'https://openalex.org/')
  )
}

institutions = bind_rows(hold) |>
  drop_na()
scholar = scholar |>
  left_join(institutions) |>
  distinct(naam, university_id, .keep_all = TRUE)
# library(dplyr)
library(stringdist)
library(stringi)

normalize_name = function(x) {
  x = as.character(x)

  # mark Cyrillic
  has_cyr = stri_detect_charclass(x, "\\p{Script=Cyrillic}")

  # transliterate only Cyrillic -> Latin
  x[has_cyr] = stri_trans_general(x[has_cyr], "Cyrillic-Latin")

  # strip accents for all, lowercase, trim, squish
  x = stri_trans_general(x, "Latin-ASCII")
  x = tolower(x)
  x = str_squish(x)
  trimws(x)
}

add_query_similarity = function(data){
  data |>
    mutate(
      dn = normalize_name(display_name),
      qn = normalize_name(query_name),
      query_similarity   = 1 - stringdist(dn, qn, method = "jw")
    ) |>  # Jaro–Winkler
    select(-dn, -qn)
}

# dubbel check op achternaam
# als daar voor nog wat voor staat komt dat overeen met de voornaam
# als de voornaam, komt dit overeen met initalen
oa_fetch_scholar = function(
    scholar_name, 
    university_ids,
    pause = 0
  ){
  # fetch oa scholar information for all name id pairs
  hold = list()
  for (id in university_ids){
    if (pause > 0) Sys.sleep(pause)
    
    hold[[id]] = tryCatch(
      oa_fetch(
        entity = 'author',
        search = scholar_name,
        affiliations.institution.id = id,
        mailto = "jos.slabbekoorn@ru.nl"
      ),
      error = function(e) NULL,
      warning = function(w) NULL
    )
  }
  res = bind_rows(hold)
  # combine the results and drop duplicates
  if (ncol(res) > 2){
    res = res |> 
      distinct(.keep_all = TRUE) |>
      mutate(query_name = scholar_name) |>
      add_query_similarity() |>
      arrange(-query_similarity) |>
      head(15)
  }

  return(res)
}
scholar_name = 'Amy Verdun'
id = "I121797337"
res = oa_fetch_scholar(scholar_name, id)
library(cli)
scholars = scholar$naam |> unique()

scrape_scholars = function(scholars){
  # configure loop
  k = length(scholars)
  hold = list()

  # loop over all scholars
  cli_alert("Starting now, at {Sys.time()}")
  cli_progress_bar("Scraping Scholars", total = k, clear = FALSE)
  for (scholar_name in scholars) {
    tab = scholar |> 
      filter(naam == scholar_name) |>
      select(naam:university_id)

    university_ids = as.list(na.omit(tab$university_id))
    if (length(university_ids) > 0){
      hold[[scholar_name]] = oa_fetch_scholar(scholar_name, university_ids)
    }
    cli_progress_update()
  }

  return(hold)
}


hold = scrape_scholars(scholars)

clean institutions data

hold2 = list()

for (name_ in names(hold)){
  if (is.na(name_) || !nzchar(trimws(name_))) next
  tab = hold[[name_]] 

  # if there is a table, than update the table
  if (nrow(tab) >= 1){
    tab = tab |>
    filter(query_name == name_) |>
    distinct(.keep_all = TRUE) |>
    mutate(
      last_known_institutions = map(
        last_known_institutions,
        ~ .x |>
          distinct(.keep_all = TRUE) |>
          mutate(
            institution_id   = str_remove(id, "^https://openalex.org/"),
            institution_name = display_name
          ) |>
          select(id, institution_id, institution_name, country_code)
      ),
      institution_ids = map(last_known_institutions, ~ .x |> pull(institution_id) |> unique())
    )
  }

  hold2[[name_]] = tab
}

# hold2

sanity checks for institution_ids

hold3 = list()
for (name_ in names(hold2)){
  tab = hold2[[name_]]

  university_ids = scholar |> 
    filter(naam == name_) |>
    select(university_id) |> 
    pull()

  # for valid tables, match the institution ids with university ids
  if (nrow(tab)> 0){
    # ensure we have a list-column of character vectors (no NULLs)
    ids_list = map(tab$institution_ids, ~ if (is.null(.x)) character(0) else as.character(.x))

    # for each row, return indices of rows (excluding self) sharing any institution_id
    matching_rows = map2(ids_list, seq_along(ids_list), \(ids, i){
      if (length(ids) == 0) return(integer(0))
      hits = which(map_lgl(ids_list, ~ length(intersect(ids, .x)) > 0))
      hits[hits != i]
    })

    tab = tab |>
      mutate(
        # any match between this row's institutions and this scholar's uni_ids?
        institution_matched = map_lgl(institution_ids, ~ {
          x = .x; if (is.null(x)) x = character(0)
          any(as.character(x) %in% university_ids)
        }),
        # vector of row numbers with overlapping institution_ids (across all other rows)
        has_multiple_records = matching_rows
      ) |>
      ## decision rule which rows to keep
      mutate(
        keep = case_when(
          query_similarity == 1.0 ~ TRUE,
          !institution_matched & query_similarity >= 0.60 ~ TRUE,
          institution_matched & query_similarity >= 0.45 ~ TRUE,
          display_name == 'Thijs W. de Vos' ~ FALSE,
          display_name == 'Kees van Kersbergen' ~ FALSE,
          display_name == 'René W. van der Hulst' ~ FALSE,
          display_name == 'Younes Zoughlami' ~ FALSE,
          display_name == 'Younes Zeboudj' ~ FALSE,
          display_name == 'Younes Saramifar' ~ FALSE,
          display_name == 'van Dijk' ~ FALSE,
          display_name == 'Stephanie Maas' ~ FALSE,
          display_name == 'A.J.J. Nijhuis' ~ FALSE,
          .default = FALSE
        ),
        orcid = as.character(orcid)
      ) |>
      filter(keep) |> select(-counts_by_year)
  }
  
  hold3[[name_]] = tab 
}
fsaveRDS(hold3, 'oascholars', location = "./data/raw_data/")
dir = file.path('data', 'raw_data')
file = list.files(dir, pattern = 'oascholars.Rds')[1]
data = freadRDS(file.path(dir, file ))

data = bind_rows(data)