Combine Scrape Data

Published

March 31, 2026

Getting Started

# load custom functions (provides .clear_global_environment(),
# .load_quarto_dependencies(), freadRDS2(), fsaveRDS() used below)
source("src/utils/custom_functions.r")

# clear the global environment and set dependencies
# NOTE(review): clearing the global environment is a session-wide side
# effect — confirm this is intended when this document is rendered as
# part of a larger Quarto project.
.clear_global_environment()
.load_quarto_dependencies()
# load and activate packages
library(tidyverse)
library(stringr)   # attached by tidyverse as well; kept for explicitness
library(lubridate) # attached by tidyverse (>= 2.0) as well

Functions

reshape_demographics()

Note

Reshapes the demographics table from long format (one row per scholar-year) to wide format (one row per scholar), producing year-suffixed columns for university, function, and discipline, and harmonising the Dutch labels along the way.

# Reshape the demographics table from long (one row per scholar-year)
# to wide (one row per scholar, year-suffixed columns).
#
# - `year` is recoded to a two-digit integer offset from 2000.
# - Lecturer/Researcher job titles are collapsed into one category.
# - Dutch discipline labels are translated to English; any other
#   discipline is deliberately set to NA.
# - Identifier columns that are no longer needed are dropped before
#   pivoting; duplicate uid rows are removed afterwards.
#
# @param data A named list containing a `demographics` tibble with (at
#   least) the columns referenced below.
# @return `data` with `demographics` replaced by its wide version.
reshape_demographics <- function(data) {
  # Shared collapser for pivot_wider cells that hold multiple values:
  # keep the first non-missing one, or NA when all are missing.
  first_non_na <- function(x) {
    if (all(is.na(x))) NA_character_ else first(na.omit(x))
  }

  data[["demographics"]] <- data[["demographics"]] |>
    mutate(
      # two-digit year, e.g. 2023 -> 23L
      year = as.integer(year(date) - 2000),
      functie = case_when(
        str_detect(functie, 'Lecturer') ~ 'Researcher or Lecturer',
        str_detect(functie, 'Researcher') ~ 'Researcher or Lecturer',
        .default = functie
      ),
      discipline = case_when(
        str_detect(discipline, 'Sociologie') ~ "Sociology",
        str_detect(discipline, 'Politicologie') ~ 'Political Sciences',
        .default = NA_character_
      )
    ) |>
    arrange(clean_name, year) |>
    select(
      -university, -google_scholar_id, -date,
      -naam, -clean_name_full, -email_adres
    ) |>
    pivot_wider(
      names_from  = year,
      values_from = c(universiteit, functie, functie2, discipline),
      names_glue  = "{.value}_{year}",
      values_fill = NA,
      # one named helper instead of four identical lambdas
      values_fn   = list(
        universiteit = first_non_na,
        functie      = first_non_na,
        functie2     = first_non_na,
        discipline   = first_non_na
      )
    ) |>
    distinct(uid, .keep_all = TRUE)

  data
}

parse_works()

Extract and clean per-UID works tables from the raw works list.

Note

Deduplicates each valid works table, then expands the works list to cover every uid in the demographics data, inserting a one-row placeholder (has_works = FALSE) for scholars without a usable works table.

# Normalise the raw per-scholar works list:
#  1. deduplicate rows within every valid (data.frame) result,
#  2. expand the list to cover every uid in `demographics`,
#  3. replace missing or failed entries with a one-row placeholder so
#     that EVERY element is a tibble carrying a `has_works` flag.
#
# @param data A named list with `demographics` (has a `uid` column) and
#   `works` (list keyed by uid; elements may be data frames or scrape
#   failures of some other type).
# @return `data` with `works` replaced by the normalised list.
parse_works <- function(data) {
  uids  <- data[["demographics"]] |> pull(uid) |> unique()
  works <- data[["works"]]

  # 1. Deduplicate valid results; leave non-data.frame elements as-is.
  #    (map, not imap: the element name is never used here.)
  works <- map(works, \(x) if (inherits(x, "data.frame")) distinct(x) else x)

  # 2. One-row stand-in for uids without a usable works table.
  placeholder <- function(uid) tibble(uid = uid, has_works = FALSE)

  # 3. Expand to the full uid set, ensuring EVERY element is a tibble.
  data[["works"]] <- setNames(
    map(uids, function(uid) {
      if (uid %in% names(works) && inherits(works[[uid]], "data.frame")) {
        works[[uid]] |>
          mutate(uid = uid, .before = 1, has_works = TRUE) |>
          distinct(work_id, .keep_all = TRUE)
      } else {
        placeholder(uid)
      }
    }),
    uids
  )

  data
}

add_coauthor_uids()

Attach scholar UIDs to nested OpenAlex authorship records within works.

Note

Builds a uid/author_id lookup from the OpenAlex scholar tables and right-joins it onto every nested authorships tibble, so co-authors who are themselves tracked scholars carry their uid (NA otherwise).

# Attach scholar UIDs to nested OpenAlex authorship records.
#
# Builds a uid/author_id lookup from `scholars_oa`, then right-joins it
# onto every nested `authorships` tibble inside every works table, so
# co-authors that are themselves tracked scholars carry their `uid`.
#
# @param data A named list with `scholars_oa` (list of tibbles holding
#   `uid` and `author_id`) and `works` (list of tibbles with a nested
#   `authorships` list-column whose elements have an `id` column).
# @return `data` with the augmented `works` list.
add_coauthor_uids <- function(data) {
  lookup <- data[["scholars_oa"]] |>
    bind_rows() |>
    distinct(uid, author_id) |>
    drop_na()

  # Right join keeps every authorship row; `uid` stays NA for
  # co-authors that do not appear in the lookup.
  attach_uids <- function(works_tbl) {
    works_tbl |>
      mutate(
        authorships = map(
          authorships,
          \(a) right_join(lookup, a, by = join_by(author_id == id))
        )
      )
  }

  for (key in names(data[["works"]])) {
    data[["works"]][[key]] <- attach_uids(data[["works"]][[key]])
  }
  return(data)
}

parse_scholars()

Split a combined OpenAlex author table into a per-UID list.

Note

Deduplicates each valid OpenAlex author table, then expands the list to cover every uid in the demographics data, inserting a one-row placeholder (has_author_id = FALSE) for scholars without a matched OpenAlex author record.

# Normalise the raw per-scholar OpenAlex author list:
#  1. deduplicate rows within every valid (data.frame) result,
#  2. expand the list to cover every uid in `demographics`,
#  3. replace missing or failed entries with a one-row placeholder so
#     that EVERY element is a tibble carrying a `has_author_id` flag.
#
# Mirrors parse_works(), but keyed on `author_id` instead of `work_id`.
#
# @param data A named list with `demographics` (has a `uid` column) and
#   `scholars_oa` (list keyed by uid).
# @return `data` with `scholars_oa` replaced by the normalised list.
parse_scholars <- function(data) {
  uids     <- data[["demographics"]] |> pull(uid) |> unique()
  scholars <- data[["scholars_oa"]]

  # 1. Deduplicate valid results; leave non-data.frame elements as-is.
  #    (map, not imap: the element name is never used here.)
  scholars <- map(scholars, \(x) if (inherits(x, "data.frame")) distinct(x) else x)

  # 2. One-row stand-in for uids without a usable author table.
  placeholder <- function(uid) tibble(uid = uid, has_author_id = FALSE)

  # 3. Expand to the full uid set, ensuring EVERY element is a tibble.
  data[["scholars_oa"]] <- setNames(
    map(uids, function(uid) {
      if (uid %in% names(scholars) && inherits(scholars[[uid]], "data.frame")) {
        scholars[[uid]] |>
          mutate(uid = uid, .before = 1, has_author_id = TRUE) |>
          distinct(author_id, .keep_all = TRUE)
      } else {
        placeholder(uid)
      }
    }),
    uids
  )

  data
}

Application

# Load the three raw inputs into one named list, then run the combine
# pipeline defined above.
data = list(
  demographics = freadRDS2('ethnicity'),
  scholars_oa = freadRDS2('oaauthors', location = 'raw_data'),
  works = freadRDS2('oaworks', location = 'raw_data')
)
# NOTE(review): order matters — add_coauthor_uids() runs before
# parse_works() because the placeholder rows parse_works() inserts have
# no `authorships` column; confirm before reordering.
scholars = data |>
  reshape_demographics() |>
  add_coauthor_uids() |> # takes a considerable amount of time
  parse_works() |>
  parse_scholars()

# persist the combined result for downstream analysis documents
fsaveRDS(scholars, file="scholars", location = "./data/combined/")
Back to top