Data Preparation OpenAlex-ID

Published

October 21, 2025

Tip

Rough first working version of the code that collects scholar_ids using openalexR.

I optimized the algorithm so that it no longer selects only the top case but allows for multiple IDs (one per row) per person. The decision rules I use are:

- semantic similarity
- matches of university_id and institution_id

Getting Started

# Start from an empty global environment and trigger garbage collection.
rm(list = ls())
gc()

# Project helper functions (this script calls fpackage.check(), freadRDS()
# and fsaveRDS(); presumably they are defined in this file — confirm).
source("src/utils/custom_functions.r")

# Load and activate the packages this script relies on.
fpackage.check(c(
  "tidyverse", "readxl", "stringr", "lubridate",
  "openalexR", "rvest", "jsonlite", "cli"
))

# Contact address that openalexR reads from this option.
options(openalexR.mailto = "jos.slabbekoorn@ru.nl")

# Earlier scholars snapshot. load() restores the stored objects into the
# global environment; `old` only holds the names of those objects.
# NOTE(review): absolute, machine-specific path — will not resolve on
# another computer.
old_file_ <- "/Users/josslabbekoorn/Downloads/scholars_20240925.rda"
old <- load(old_file_)

# Current scholars data, row-bound into a single data frame.
file <- file.path("data", "raw_data", "20251015oascholars.Rds")
scholars <- bind_rows(freadRDS(file))

# NOTE(review): single-bracket indexing on a data frame selects the first
# five COLUMNS, not rows — confirm this is the intended subset.
splits <- scholars[1:5]

#' Fetch the OpenAlex works of every author listed in `scholars`.
#'
#' Every distinct, non-missing value of `scholars$id` is requested exactly
#' once via `oa_fetch(entity = "works", author.id = ...)`. Rows that repeat
#' an author id do not trigger additional requests, because they would only
#' download the same result again.
#'
#' @param scholars Data frame with an `id` column of author ids. A leading
#'   "https://openalex.org/" is stripped from the id before the request.
#' @param mailto E-mail address forwarded to `oa_fetch()`.
#' @return A named list keyed by the original `id` value. Each element is the
#'   `oa_fetch()` result with an `author_id` column placed before `id`.
#'   Authors whose request raised an error or a warning are reported on the
#'   console and left out of the list.
oa_fetch_works <- function(scholars, mailto = "jos.slabbekoorn@ru.nl") {
    ids <- unique(scholars$id)
    # A missing id cannot be queried, so it is dropped up front.
    ids <- ids[!is.na(ids)]
    works <- list()

    cli_alert("Starting now, at {Sys.time()}")
    cli_progress_bar("Scraping Works", total = length(ids), clear = FALSE)
    for (id_ in ids) {
        # A failed or warned request yields NULL so one bad author does not
        # abort the whole run; the reason is logged instead of being hidden.
        res <- tryCatch(
            oa_fetch(
                entity = "works",
                author.id = str_remove(id_, "https://openalex.org/"),
                mailto = mailto
            ),
            error = function(e) {
                cli_alert_warning("Skipping {id_}: {conditionMessage(e)}")
                NULL
            },
            warning = function(w) {
                cli_alert_warning("Skipping {id_}: {conditionMessage(w)}")
                NULL
            }
        )

        if (!is.null(res)) {
            # Tag every work with the author it was requested for.
            works[[id_]] <- res |>
                mutate(author_id = id_) |>
                relocate(author_id, .before = id)
        }
        cli_progress_update()
    }

    works
}

# Collect the works of all scholars, then hand the result to the project's
# fsaveRDS() helper with "./data/raw_data/" as the target location.
works <- scholars |> oa_fetch_works()

fsaveRDS(works, "oaworks", location = "./data/raw_data/")