Data Processing Scholars

Published

October 21, 2025

Getting Started

# clear the global environment
rm(list = ls())
gc()
          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells  605262 32.4    1371427 73.3         NA   715785 38.3
Vcells 1122026  8.6    8388608 64.0      16384  2012131 15.4
# load custom functions
source("src/utils/custom_functions.r")

# load and activate packages
fpackage.check(c(
  'tidyverse', 'readxl', 'renv', 'stringr', 
  'janitor', 'lubridate'
))

Functions

Loading in Raw Data

This function loads in Excel files containing information on scholars working in Dutch sociology and political science departments. The data have been collected at three points in time — 19 December 2022, 19 April 2024, and 1 October 2025 — to capture changes in staff composition over time. Each Excel file includes two sheets, one for Sociologie and one for Politicologie, which are read in separately, harmonized, and then concatenated into a single dataset that combines all disciplines and time points.

# define tailored functions
read_spreadsheet = function(filepath, sheet){  
  readxl::read_excel(filepath, sheet = sheet) 
}

harmonize_columns = function(data, discipline, date){
  data = data |>
    clean_names() |>
    mutate(
      discipline = discipline,
      date = date
    ) |>
    relocate(c(discipline, date), .before = 1)
}

fread = function(
    files, 
    source,
    disciplines = c( "Politicologie", "Sociologie"),
    language = 'nl'
  ){
  # read in both sheets from excel files
  hold = c()
  for (file in files){
    filepath = file.path(source, file)
    date = ymd(str_split(file, pattern = "_")[[1]][1])

    # each file contains two sheets, for each discipline
    # read in both sheets and combine these data
    disciplines = c( "Politicologie", "Sociologie")
    chunks = c()
    for (discipline in disciplines){
      chunk = read_spreadsheet(filepath, discipline) |>
        harmonize_columns(discipline, date)

      chunks[[discipline]] = chunk
    }

    hold[[file]] =  bind_rows(chunks)
  }

  data = bind_rows(hold)

  if (!language %in% c('nl', 'en')) stop("language should be nl or en")
  # implement rename function for english column namesac
  

  return(data)
}

Fix Scholar Names

This step standardizes scholar names to ensure consistent matching across sources. It first normalizes name particles common in Dutch and related languages (e.g., van, de, der, ten, ’t, le, el, op den) by converting them to lowercase when they appear between spaces (e.g., “Jan van Dijk”). Next, it corrects a curated set of frequent typos and formatting inconsistencies - such as fixing misplaced or missing initials, diacritics, and misspellings (e.g., “AJGM van Montfort” to “A.J.G.M. van Montfort,” “Lea Kroner” to “Lea Kröner”). The result is a cleaned naam field with harmonized capitalization and corrected names, improving join accuracy and downstream deduplication.

fix_scholar_names = function(data){
  # fix the particles in the names
  particles = c(
    "van", "in",
    "de", "den", "der", "den", "del",
    "te", "ten", "ter", "tes", "'t",
    "la", "le", "les", "los", "el", "el-",
    "op den"
  )
  pattern = regex(
    paste0("(?<=\\s)(", paste(particles, collapse = "|"), ")(?=\\s)"),
    ignore_case = TRUE
  )

  data |>
    mutate(
      # replace capitalized particles
      naam = str_replace_all(naam, pattern, ~ tolower(.x)),

      # replace typos and mistakes in names
      naam = case_when(
        naam == "Andrea Forstrer"      ~ "Andrea Forster",
        naam == "AJGM van Montfort"    ~ "A.J.G.M. van Montfort",
        naam == "FP Wagenaar"          ~ "F.P. Wagenaar",
        naam == "JP Presley"           ~ "J.P. Presley",
        naam == "JS Timmer"            ~ "J.S. Timmer",
        naam == "ilya Lavrov"          ~ "Ilya Lavrov",
        naam == "p. Vila Soler"        ~ "P. Vila Soler",
        naam == "Z Dong"               ~ "Z. Dong",
        naam == "Renae  Loh"           ~ "Renae Loh",
        naam == "Paulina Pankowski"    ~ "Paulina Pankowska",
        naam == "M.M Cuperus"          ~ "M.M. Cuperus",
        naam == "Lea Kroner"           ~ "Lea Kröner",
        naam == "L Slot"               ~ "L. Slot",
        naam == "Jan Willen Duyvendak" ~ "Jan Willem Duyvendak",
        .default = naam
      )
    )
}

Fixing Google Scholar ID

This step propagates known Google Scholar IDs across time for the same person. The data are first ordered by naam and date, then grouped by naam so that each individual’s records form a sequence. Within each group, missing values in google_scholar_id are filled using the nearest available value, with the fill direction controlled by the .direction argument (default “updown” fills forward and backward; alternatives like “down” or “up” restrict the fill to one direction). Groups are then ungrouped and the dataset is tidied by universiteit and date. This reduces missing IDs while ensuring values never leak across different people; it assumes that identical names refer to the same scholar, so remaining homonyms should be checked upstream.

fix_google_scholar_id = function(data, .direction='updown'){
  data |>
    arrange(naam, date) |>
    group_by(naam) |>
    # fill missing values with available information
    fill(google_scholar_id, .direction = .direction) |>  
    ungroup() |>
    arrange(universiteit, date)
}

Fixing Email Addresses

This step standardizes and completes the email address information for each scholar. It first applies a practical, case-insensitive regular expression that captures valid email formats, including subdomains, to extract clean addresses from the email_adres field. All extracted emails are then converted to lowercase to ensure consistency. Next, the data are grouped by universiteit and naam, and missing email values are filled using the nearest available information within each group (by default in both directions, controlled by the .direction argument). Finally, the dataset is ungrouped and ordered by universiteit and date, resulting in a harmonized and more complete set of email addresses that align across time points for the same scholar

fix_email_adresses = function(data, .direction = "updown"){
  # practical email regex (case-insensitive), supports subdomains
  email_pattern = regex(
    "\\b[[:alnum:]._%+-]+@[[:alnum:]-]+(?:\\.[[:alnum:]-]+)+\\b", 
    ignore_case = TRUE
  )

  # clean email variable
  data |>
    mutate(
      email_adres = str_extract(
        email_adres, email_pattern
        ) |> tolower()
    ) |>
    group_by(universiteit, naam) |>
    # fill 
    fill(email_adres, .direction = "updown") |>
    ungroup()|>
    arrange(universiteit, date)
}

Fixing Universities

This step harmonizes university affiliations and adds a canonical code alongside the raw label. It first splits multi-valued entries in universiteit (e.g., “UU / UvA”) into separate rows, then trims and normalizes each label. Using a case-insensitive pattern, it maps recognized names/abbreviations to a standard set (EUR, RU, RUG, UU, VU, UvA, UvT, Leiden). If a canonical code is still missing, it infers the affiliation from the email domain (e.g., …@essb.eur.nl -> EUR, …@vu.nl -> VU, …@uva.nl -> UvA, etc.). After removing duplicates, affiliations are re-aggregated per person and date into a list column university, joined back to the original data, and positioned next to universiteit. The result is a consistent, machine-readable university code that supports reliable grouping, filtering, and longitudinal comparison.

clean_universities = function(data){
  universities = c("EUR", "RU", "RUG", "UU", "VU", "UvA", "UvT", "LU")
  pat   = regex("\\b(EUR|RU|RUG|UU|VU|UvA|UvT|Leiden)\\b", ignore_case = TRUE)
  canon = setNames(universities, universities)

  # clean universities
  uni = data |>
    # split university strings on '\s' , '/', '\.', and '?'
    mutate(
      universiteit = str_split(universiteit, "/+|\\?+|\\."),
    ) |>
    unnest_longer(universiteit) |>
    # clean the university labels
    mutate(
      universiteit = str_replace(str_squish(universiteit), 'Leiden uni', 'Leiden'),
      university = str_replace(universiteit, pat, \(m) canon[str_to_lower(m)]),
      universiteit = case_when(
        (is.na(university) & str_detect(email_adres, 'essb.eur.nl')) ~ 'EUR',
        (is.na(university) & str_detect(email_adres, 'vu.nl'))  ~ 'VU',
        (is.na(university) & str_detect(email_adres, 'uva.nl'))  ~  'UVA',
        (is.na(university) & str_detect(email_adres, 'leidenuni'))  ~  'Leiden',
        (is.na(university) & str_detect(email_adres, 'ru.nl'))  ~  'RU',
        (is.na(university) & str_detect(email_adres, 'rug.nl'))  ~  'RUG',
        (is.na(university) & str_detect(email_adres, 'tilburguni'))  ~  'UvT',
        (is.na(university) & str_detect(email_adres, 'uu.nl'))  ~  'UU',
        .default = university
      ),
      university = ifelse("" == university, NA_character_, university)
    ) |>
    distinct(.keep_all=TRUE) |>
    group_by(naam, date) |> 
    summarise(university = list(unlist(university))) |>
    ungroup()

  data |>
    left_join(uni) |>
    relocate(university, .after=universiteit)
}

Clean functie

This step parses free-text job titles in functie into a set of consistent role flags. It first keeps only date, naam, and functie, then lowercases functie for keyword matching. Using targeted patterns in Dutch and English, it creates boolean indicators for common categories:

  • visiting (e.g., gast, visit),
  • external (buiten, external),
  • professor (incl. typos like proffessor),
  • associate professor (hoofddocent, associate, uhd),
  • assistant professor (universitair docent, assistant),
  • postdoc (postdoc, doctoral),
  • lecturer (docent, lecturer, teacher),
  • researcher (onderzoeker, research),
  • PhD (phd, promovend),
  • senior/junior,
  • emeritus (professor + emiri) or endowed (professor + bijzon),
  • fellow,
  • and a broad staff (e.g., advisor, secretary, assistent, medewerker, manager, coordinator, director/directeur).

To avoid double counting, several flags are explicitly set to FALSE when a more specific academic rank applies (e.g., assistant/associate/full professor precedence over lecturer/researcher/staff). Missing titles propagate as NA in the corresponding flags. The result is a tidy, machine-readable set of role indicators that standardizes heterogeneous job titles for downstream classification and analysis.

parse_job_titles = function(data){
  data |>
    select(date, naam, functie) |>
    mutate(
      is_visiting = case_when(
        str_detect(str_to_lower(functie), 'gast') ~ TRUE,
        str_detect(str_to_lower(functie), 'visit') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_external = case_when(
        str_detect(str_to_lower(functie), 'external') ~ TRUE,
        str_detect(str_to_lower(functie), 'buiten') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_associate_professor = case_when(
        str_detect(str_to_lower(functie), 'hoofddocent') ~ TRUE,
        str_detect(str_to_lower(functie), 'associate ') ~ TRUE,
        str_detect(str_to_lower(functie), 'uhd') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_assistant_professor = case_when(
        is_associate_professor ~ FALSE,
        str_detect(str_to_lower(functie), 'universitair docent') ~ TRUE,
        str_detect(str_to_lower(functie), 'assistant ') ~ TRUE, 
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_postdoc = case_when(
        str_detect(str_to_lower(functie), 'postdoc') ~ TRUE,
        str_detect(str_to_lower(functie), 'doctoral') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_senior = case_when(
        str_detect(str_to_lower(functie), 'senior') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_junior = case_when(
        str_detect(str_to_lower(functie), 'junior') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_lecturer = case_when(
        is_associate_professor ~ FALSE, 
        is_assistant_professor ~ FALSE, 
        str_detect(str_to_lower(functie), 'lecturer') ~ TRUE,
        str_detect(str_to_lower(functie), 'docent') ~ TRUE,
        str_detect(str_to_lower(functie), 'teacher') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_researcher = case_when(
        is_associate_professor ~ FALSE, 
        is_assistant_professor ~ FALSE,
        is_postdoc ~ FALSE, 
        str_detect(str_to_lower(functie), 'onderzoeker') ~ TRUE,
        str_detect(str_to_lower(functie), 'research') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_phd = case_when(
        str_detect(str_to_lower(functie), 'phd') ~ TRUE,
        str_detect(str_to_lower(functie), 'promovend') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_professor = case_when(
        is_associate_professor ~ FALSE, 
        is_assistant_professor ~ FALSE,
        is_postdoc ~ FALSE, 
        str_detect(str_to_lower(functie), 'hoogleraar') ~ TRUE,
        str_detect(str_to_lower(functie), 'professor') ~ TRUE,
        str_detect(str_to_lower(functie), 'proffessor') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_emeritus = case_when(
        is_professor & str_detect(str_to_lower(functie), 'emiri') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_endowed = case_when(
        is_professor & str_detect(str_to_lower(functie), 'bijzon') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      ),
      is_staff = case_when(
        # make sure that people with other positions are not falsely
        # been configured to be a staff member.
        is_associate_professor ~ FALSE, 
        is_assistant_professor ~ FALSE,
        is_lecturer ~ FALSE,
        is_postdoc ~ FALSE,
        is_professor ~ FALSE,
        # staff members have wildly varying job titles.
        str_detect(str_to_lower(functie), 'advisor') ~ TRUE,
        str_detect(str_to_lower(functie), 'secretary') ~ TRUE,
        str_detect(str_to_lower(functie), 'assistent') ~ TRUE,
        str_detect(str_to_lower(functie), 'medewerk') ~ TRUE,
        str_detect(str_to_lower(functie), 'market') ~ TRUE,
        str_detect(str_to_lower(functie), 'managing') ~ TRUE,
        str_detect(str_to_lower(functie), 'manager') ~ TRUE,
        str_detect(str_to_lower(functie), 'coordinator') ~ TRUE,
        str_detect(str_to_lower(functie), 'director') ~ TRUE,
        str_detect(str_to_lower(functie), 'directeur') ~ TRUE,
      ),
      is_fellow = case_when(
        str_detect(str_to_lower(functie), 'fellow') ~ TRUE,
        is.na(functie) ~ NA,
        .default = FALSE
      )
    )
}

Building on the role flags derived in Clean functie, this step converts those boolean indicators into human-readable position labels and, optionally, a compact string that also captures distinctions.

  • construct_positions() maps the mutually exclusive academic roles to a canonical position (e.g., Full Professor, Associate Professor, Assistant Professor, Postdoctoral Researcher, PhD Candidate, Lecturer, Researcher, Staff) while separately flagging distinctions as short text labels: Visiting, External, Senior, Junior, Emeritus, Endowed, and Fellow.

After creating these columns, it drops the original is_ flags and constructs position2 by uniting any present distinctions (from visiting through fellow) into a single, comma-separated string (leaving the per-column distinction flags intact for auditing).

  • clean_functie() then orchestrates the full cleaning. It first runs parse_job_titles() to produce the role flags, passes the result through construct_positions(), and finally writes the output back into the original data:
  • .what = "complete" (default) sets functie to the canonical position (rank only).
  • .what = "simplified" sets functie to position2, which includes any distinctions (e.g., Assistant Professor, Visiting; Fellow).

The result is a consistent functie column suitable either for strict rank analyses (complete) or for descriptive reporting that preserves visiting/external/fellow/etc. qualifiers (simplified).

construct_positions = function(data) {
  data |>
    mutate(
      # make flags for people with one of the following distinctions
      visiting = ifelse(is_visiting, 'Visiting', NA_character_),
      external = ifelse(is_external, 'External', NA_character_),
      senior = ifelse(is_senior, 'Senior', NA_character_),
      junior = ifelse(is_junior, 'Junior', NA_character_),
      emeritus = ifelse(is_emeritus, 'Emeritus', NA_character_),
      endowed = ifelse(is_endowed, 'Endowed', NA_character_),
      # create a basic positions variable, excluding distinctions
      position = case_when(
        is_professor ~ "Full Professor",
        is_associate_professor ~ "Associate Professor",
        is_assistant_professor ~ "Assistant Professor",
        is_postdoc ~ "Postdoctoral Researcher",
        is_phd ~ "PhD Candidate",
        is_lecturer ~ "Lecturer",
        is_researcher ~ "Researcher",
        is_staff ~ "Staff",
        .default = NA_character_
      ),
      fellow = ifelse(is_fellow, 'Fellow', NA_character_)
    ) |>
    select(!starts_with('is_')) |>
    unite('position2', visiting:fellow, na.rm=TRUE, remove=FALSE)
}

clean_functie = function(data, .what='complete'){
  test = data |> 
    parse_job_titles() |>
    construct_positions()

  if (.what == 'complete'){
    data['functie'] = test$position
  } else if (.what == 'simplified'){
    data['functie'] = test$position2
  }

  return(data)
}

Application

# identified scholarid files
source = file.path("data", "raw_data")
files = list.files(source, pattern = "scholarid.xlsx")

# load and process data
data = fread(files, source) |> 
  fix_scholar_names() |>
  clean_universities() |>
  fix_email_adresses() |>
  fix_google_scholar_id() |>
  select(-specialisatie, -notitie, -additional, -checked) |>
  arrange(discipline, date, naam, university) |>
  clean_functie()
`summarise()` has grouped output by 'naam'. You can override using the
`.groups` argument.
Joining with `by = join_by(date, naam)`
# save data
fsaveRDS(data, 'scholarid')
[1] "SAVING: ./data/processed/20251021scholarid.Rds"