Clean Data

Published

March 31, 2026

Getting Started

# load custom functions
source("src/utils/custom_functions.r")

# clear the global environment and set dependencies
.clear_global_environment()
.load_quarto_dependencies()
# load and activate packages
library(tidyverse)
library(stringr)
library(lubridate)
library(purrr)

Functions

prune_works()

Filter each scholar’s works to a set of sufficiently common work types and a minimum publication year.

Arguments

data

Named list that contains a works element (list of works tibbles). Each works tibble is expected to include at least type and publication_year (and typically uid, work_id, etc.).

Method

The function first binds all works together to compute type frequencies. It keeps only types with at least 50 occurrences and explicitly excludes dataset. It then iterates over data[['works']], coercing each element to a tibble, returning an empty tibble when required columns are missing, and filtering to valid types and publication_year >= 1980. Scholars whose filtered works tibble ends up empty are retained as zero-row tibbles (downstream functions skip them), and the updated data is returned.

prune_works <- function(data, min_count = 50, min_year = 1980,
                        excluded_types = "dataset") {
  # Filter each scholar's works to sufficiently common work types and a
  # minimum publication year.
  #
  # Args:
  #   data:           named list with a `works` element (list of tibbles,
  #                   each expected to carry `type` and `publication_year`).
  #   min_count:      minimum number of occurrences for a type to be kept
  #                   (default 50; drops retraction: 1, libguides: 18,
  #                   reference-entry: 39).
  #   excluded_types: types removed regardless of frequency
  #                   (default "dataset": 991 occurrences).
  #   min_year:       earliest publication year retained (default 1980).
  #
  # Returns: `data` with `works` filtered in place.

  # Determine the valid types from the pooled frequency table.
  valid_types <- data[["works"]] |>
    bind_rows() |>
    count(type) |>
    filter(n >= min_count, !type %in% excluded_types) |>
    pull(type)

  # Filter each scholar's works tibble. Tibbles missing the required
  # columns are emptied (zero rows, same columns) rather than dropped.
  data[["works"]] <- data[["works"]] |>
    map(\(df) {
      df <- as_tibble(df)
      if (!all(c("type", "publication_year") %in% names(df))) {
        return(df[0, , drop = FALSE])
      }
      df |>
        filter(type %in% valid_types, publication_year >= min_year) |>
        arrange(publication_year)
    })

  return(data)
}

clean_demographics()

Attach region-of-origin metadata and standardize/rename demographics variables to a compact schema.

Arguments

data

Named list that contains a demographics element (a tibble with at least uid and ethnicity, plus the columns that are renamed/selected in the function).

Method

The function reads a country-to-region lookup table from data/utils/country_conversion.csv (keeping distinct alpha_2 and region) and harmonizes the region labels. It then left-joins this lookup onto data[['demographics']] by matching country_id to alpha_2, moves the joined region next to country_id, and renames a set of variables (e.g., region→origin, universiteit_22→university_22, functie_22→position_22). The updated list is returned.

clean_demographics <- function(data) {
  # Attach region-of-origin metadata and rename demographics variables
  # to an English schema.
  #
  # Args:
  #   data: named list with a `demographics` tibble that carries at least
  #         `country_id` plus the Dutch-named columns renamed below.
  #
  # Returns: `data` with `demographics` updated in place.

  # Country-to-region lookup: one row per ISO alpha-2 code, with the
  # combined "(Asia|Oceania)" label harmonized to a readable form.
  country_to_region <- read_csv(
      file.path("data", "utils", "country_conversion.csv"),
      show_col_types = FALSE
    ) |>
    distinct(alpha_2, region) |>
    mutate(
      region = if_else(region == "(Asia|Oceania)", "Asia and Oceania", region)
    )

  # Join the region onto demographics and translate column names.
  data[["demographics"]] <- data[["demographics"]] |>
    left_join(country_to_region, by = join_by(country_id == alpha_2)) |>
    relocate(region, .before = country_id) |>
    rename(
      origin = region,
      university_22 = universiteit_22,
      university_24 = universiteit_24,
      university_25 = universiteit_25,
      position_22 = functie_22,
      position_24 = functie_24,
      position_25 = functie_25,
      position2_22 = functie2_22,
      position2_24 = functie2_24,
      position2_25 = functie2_25
    )

  return(data)
}

get_citation_counts()

Expand nested per-work citation histories to a complete annual panel and compute cumulative citation counts.

Arguments

data

Named list that contains a works element (list of works tibbles with at least uid, work_id, publication_year, and counts_by_year).

Method

The function binds all works together and unnests counts_by_year into long form, widening it into counts_by_year_year and counts_by_year_cited_by_count, which are renamed to year and citations. It coerces publication_year, year, and citations to integers, replaces missing citation counts with 0, and falls back to publication_year when year is missing. It then completes a full sequence of years from the earliest observed year through 2026 per (uid, work_id), filling missing citations with 0, fills publication_year and cited_by_count down/up within each work, and computes cum_citations as the cumulative sum of annual citations per work, shifted by a per-work offset so that the final cumulative value equals the reported cited_by_count total. The resulting long tibble is returned (not re-attached to data).

# Expand nested per-work citation histories into a complete annual panel
# with cumulative citation counts.
#
# Args:
#   data: named list with a `works` element (list of tibbles carrying at
#         least uid, work_id, publication_year, counts_by_year,
#         cited_by_count).
#
# Returns: a long tibble with one row per (uid, work_id, year) from each
#   work's earliest observed year through 2026, with `citations` (annual)
#   and `cum_citations` (cumulative, anchored to cited_by_count).
#   Note: the result is returned directly, NOT re-attached to `data`.
get_citation_counts <- function(data){
  data[['works']] |>
    # drop scholars whose works tibble is empty
    keep(~ nrow(.x) > 0) |>
    bind_rows() |>
    # unpack the list-column of per-year citation counts into long form
    unnest_longer(counts_by_year) |>
    unnest_wider(counts_by_year, names_sep = "_") |>
    rename(
      year = counts_by_year_year,
      citations = counts_by_year_cited_by_count
    ) |>
    mutate(
      publication_year = as.integer(publication_year),
      year = as.integer(year),
      citations = as.integer(citations),
      citations = coalesce(citations, 0L),
      # works with no counts_by_year entries get a row anchored at their
      # publication year, so complete() below still expands them
      year = coalesce(year, publication_year)
    ) |>
    arrange(uid, publication_year, year) |>
    select(uid, work_id, publication_year, year, citations, cited_by_count) |>
    # expand each work into a gapless annual series up to 2026;
    # min(year) is evaluated per (uid, work_id) group
    group_by(uid, work_id) |>
    complete(
      year = full_seq(min(year):2026, 1),
      fill = list(citations = 0L)
    ) |>
    # rows created by complete() have NA metadata; copy it from neighbors
    fill(c(publication_year, cited_by_count), .direction = "downup") |>
    arrange(publication_year, year, .by_group = TRUE) |>
    mutate(
      cum_citations = cumsum(citations),
      # shift the cumulative series so its final value matches the
      # work's reported cited_by_count total (the per-year history from
      # the API covers fewer years than the lifetime total)
      cited_offset = cited_by_count - max(cum_citations),
      cum_citations = cum_citations + cited_offset
    ) |>
    ungroup()
}

add_annual_statistics()

Compute yearly publication/citation metrics (h-index, i10-index, citations, publications) per scholar and attach them to demographics.

Arguments

data

Named list that contains works (list of works tibbles with citation histories) and demographics (tibble with uid).

Method

The function first calls get_citation_counts() to produce a per-work, per-year panel, then groups by (uid, year) to compute per-year cumulative publication counts. It derives the yearly i10-index by counting works with cum_citations > 10 and the yearly h-index using a simple per-year summary based on the relationship between cumulative citations and cumulative publications. It also aggregates total cumulative citations and number of distinct publications per year, and computes one- and two-year forward differences (d_ and d2_ columns) of these metrics. These statistics are merged, filtered to years 2016–2026, nested into stats_by_year per uid, and then — along with a few aggregated summaries (career start, selected h/i10 snapshots) — left-joined onto data$demographics. The updated list is returned.

# Compute yearly publication/citation metrics (h-index proxy, i10-index,
# citations, publications) per scholar and attach them to demographics.
#
# Args:
#   data: named list with `works` (list of works tibbles with citation
#         histories) and `demographics` (tibble with uid).
#
# Returns: `data` with per-year statistics nested into
#   demographics$stats_by_year and aggregated snapshots joined on uid.
add_annual_statistics <- function(data){
  # per-(uid, year) panel; cum_publications counts the works that have a
  # panel row in that year. NOTE: the result stays grouped by (uid, year),
  # and the join results below inherit grouping — the lead()/roll_sum()
  # calls further down rely on that implicit grouping.
  citation_counts <- data |> 
    get_citation_counts() |>
    group_by(uid, year) |>
    mutate(cum_publications = n_distinct(work_id))

  # calculate i10_indices
  # NOTE(review): `> 10` counts works with MORE than 10 cumulative
  # citations; the conventional i10-index uses >= 10 — confirm intent.
  i10_index_by_year <- citation_counts  |>
    mutate(i10 = cum_citations > 10) |>
    group_by(uid, year) |>
    summarise(i10_index = sum(i10, na.rm = TRUE)) |>
    mutate(year = as.integer(year))

  # calculate h_indices
  # NOTE(review): counting works whose cumulative citations exceed the
  # scholar's cumulative publication count is a proxy, not the textbook
  # h-index (largest h with h papers having >= h citations) — confirm.
  h_index_by_year <- citation_counts  |> 
    group_by(uid, year) |>
    summarise(h_index = sum(cum_citations > cum_publications, na.rm = TRUE)) |>
    mutate(year = as.integer(year))
  # (translated from Dutch) add cum_citations - max(citations) = a

  # aggregate citations and publications per (uid, year)
  counts <- citation_counts  |> 
    group_by(uid, year) |>
    summarize(
      citations = sum(cum_citations),
      publications = n_distinct(work_id)
    )
  
  # merge the three summaries and add one-year (d_) and two-year rolling
  # (d2_) forward differences; d2_ falls back to d_ where the two-year
  # window is incomplete. NOTE(review): d_i10_index defaults to NA at the
  # series end while d_citations/d_publications default to 0 — confirm
  # the asymmetry is intended.
  df <- h_index_by_year |>
    full_join(i10_index_by_year, by = c("uid", "year")) |>
    full_join(counts, by = c("uid", "year")) |>
    mutate(
      d_i10_index = lead(i10_index, 1, default = NA) - i10_index,
      d2_i10_index = RcppRoll::roll_sum(x = d_i10_index, 2, align = "right", fill = NA),
      d2_i10_index = coalesce(d2_i10_index, d_i10_index),
      d_citations = lead(citations, 1, default = 0) - citations,
      d2_citations = RcppRoll::roll_sum(x = d_citations, 2, align = "right", fill = NA),
      d2_citations = coalesce(d2_citations, d_citations),
      d_publications = lead(publications, 1, default = 0) - publications,
      d2_publications = RcppRoll::roll_sum(x = d_publications, 2, align = "right", fill = NA),
      d2_publications = coalesce(d2_publications, d_publications)
    )

  # nest per-year statistics (2016-2026) into one list-column row per uid
  statistics <- df |>
      filter(year > 2015, year <= 2026) |>
      arrange(uid, year) |>
      group_by(uid) |>
      nest(stats_by_year = c(
        year, h_index, 
        i10_index, d_i10_index, d2_i10_index,
        citations, d_citations, d2_citations, 
        publications, d_publications, d2_publications
      )) |>
      ungroup()

  # aggregate scalar snapshots per uid (career start, selected years)
  aggregated <- df |>
    group_by(uid) |>
    summarise(
      career_start = min(year, na.rm = TRUE),
      h_index_22  = h_index[year == 2022][1],
      i10_index_21 = i10_index[year == 2021][1],
      i10_index_23 = i10_index[year == 2023][1],
      i10_index_25 = i10_index[year == 2025][1],
      .groups = "drop"
    )

  # attach both to demographics (joins fall back to the natural key, uid)
  data$demographics <- data$demographics |>
    left_join(statistics) |>
    left_join(aggregated)

  return(data)
}

add_edge_table()

Build an edge table of co-authorships from nested authorship data.

Arguments

data

Named list that contains a works element (list of works tibbles with uid, work_id, authorships, publication_date, publication_year, type).

Method

The function binds all per-scholar works together, selecting uid, work_id, authorships, publication_date, publication_year, and type. It renames authorships to a and fully unnests a into separate columns with a names separator. It then filters to rows where a__uid (the co-author’s uid) is non-missing and publication_date is on or after start_date (default 1 January 1980; character inputs are parsed as day/month/year). The resulting tibble, stored as data[['edge_table']], represents a long edge list that links focal uid to their co-authors per work, along with basic publication metadata. The updated list is returned.

add_edge_table <- function(data, start_date = NA) {
  # Build a co-authorship edge table from the nested authorship data.
  #
  # Args:
  #   data:       named list with a `works` element (list of tibbles with
  #               uid, work_id, authorships, publication_date,
  #               publication_year, type).
  #   start_date: cutoff for inclusion; NA means 1 Jan 1980, character
  #               input is parsed as day/month/year, Date passes through.
  #
  # Returns: `data` with an `edge_table` element added.

  # Resolve the cutoff date from the three accepted input forms.
  if (is.na(start_date)) {
    cutoff <- dmy("01/01/1980")
  } else if (is.character(start_date)) {
    cutoff <- dmy(start_date)
  } else {
    cutoff <- start_date
  }

  # Pool all non-empty works tibbles into one flat table.
  works_flat <- data[["works"]] |>
    keep(\(w) nrow(w) > 0) |>
    bind_rows()

  # Unnest authorships into one row per (work, co-author) and keep edges
  # with a known co-author uid published on or after the cutoff.
  data[["edge_table"]] <- works_flat |>
    select(uid, work_id, authorships, publication_date, publication_year, type) |>
    rename(a = authorships) |>
    unnest(a, names_sep = "__", keep_empty = FALSE) |>
    filter(!is.na(a__uid), publication_date >= cutoff)

  return(data)
}
# Flag OpenAlex author ids that are not connected to any valid (in-sample)
# co-author, and join these flags onto each scholars_oa tibble.
#
# Args:
#   data:       named list with `works` (list of works tibbles) and
#               `scholars_oa` (list of tibbles keyed by author_id).
#   start_date: upper bound used in the publication-date filter below.
#   end_date:   strict upper bound on publication_date.
#
# Returns: `data` with `n_coauthors`, `n_valid_coauthors`, `n_oaids`,
#   and `valid_author_id` joined onto each scholars_oa element.
check_for_valid_coauthors = function(
    data, 
    start_date = dmy("19/12/2020"),
    end_date = dmy("01/02/2026")
  ){
  tab = data[['works']] |>
    # create a coauthor table
    keep(~ nrow(.x) > 0) |>
    bind_rows() |>
    select(uid, author_id, work_id, authorships, publication_date, publication_year, type) |>
    rename(a = authorships) |>
    unnest(a, names_sep = "__", keep_empty=FALSE) |>
    # NOTE(review): `<= start_date` keeps works published ON OR BEFORE
    # start_date, which makes the `< end_date` condition redundant (every
    # date <= 2020-12-19 is also < 2026-02-01). If a [start_date, end_date)
    # window was intended, the first comparison should be `>=` — confirm.
    filter(
      (publication_date <= start_date),
      (publication_date <  end_date),
    ) |>
    # one row per (focal uid, co-author); co-authors without an in-sample
    # uid fall back to their OpenAlex author_id as the identity key
    distinct(uid, coalesce(a__uid, a__author_id), .keep_all = TRUE) |>
    # calculate the number of unique coauthors per author_id
    group_by(author_id) |>
    summarize(
      uid = first(uid),
      n_coauthors = n_distinct(coalesce(a__uid, a__author_id)),
      # valid = co-authors who are themselves in the sample (a__uid set)
      n_valid_coauthors = n_distinct(a__uid, na.rm = TRUE)
    ) |>
    arrange(uid, author_id) |>
    # calculate the number of author_ids and indicate whether author_id is not
    # connected to valid co_authors; an id is flagged invalid only when the
    # scholar has multiple OpenAlex ids AND this one has no valid co-authors
    group_by(uid) |>
    mutate(
      # restore the full OpenAlex URL so the join key below matches
      author_id = paste0('https://openalex.org/', author_id),
      n_oaids = n_distinct(author_id),
      valid_author_id = !((n_oaids > 1) & (n_valid_coauthors == 0))
    ) |>
    ungroup() |>
    select(-uid)

  # join the flags onto each scholars_oa tibble that carries author_id;
  # tibbles without that column are left untouched
  data[['scholars_oa']] = map(data[['scholars_oa']], \(df) {
    if ("author_id" %in% names(df)) {
      left_join(df, tab, by = "author_id")
    } else {
      df
    }
  })

  return(data)
}
exclude_works_from_invalid_authors <- function(data) {
  # Drop works attributed to OpenAlex author ids flagged invalid by
  # check_for_valid_coauthors().
  #
  # Args:
  #   data: named list with `scholars_oa` (tibbles carrying author_id and
  #         valid_author_id) and `works` (list of works tibbles).
  #
  # Returns: `data` with `works` filtered to valid author ids.

  # Keep ids flagged TRUE or never flagged (NA); exclude only explicit
  # FALSE. (Replaces the original `%in% c(TRUE, NA_character_)`, which
  # relied on logical-to-character coercion for the same effect.)
  valid_author_ids <- data[["scholars_oa"]] |>
    bind_rows() |>
    filter(coalesce(valid_author_id, TRUE)) |>
    mutate(author_id = str_remove(author_id, "https://openalex.org/")) |>
    pull(author_id) |>
    unique() |>
    na.omit()

  # Filter each works tibble; tibbles without an author_id column are
  # left untouched.
  data[["works"]] <- data[["works"]] |>
    map(\(df) {
      if ("author_id" %in% names(df)) {
        filter(df, author_id %in% valid_author_ids)
      } else {
        df
      }
    })

  return(data)
}

Application

# Load the combined scholars object and run the full cleaning pipeline:
# flag invalid author ids, prune works by type/year, drop works from
# invalid ids, clean demographics, attach annual statistics, and build
# the co-authorship edge table (edges from 2020 onward).
data <- freadRDS2(file = "scholars", location = "combined") |>
  check_for_valid_coauthors() |>
  prune_works() |>
  exclude_works_from_invalid_authors() |>
  clean_demographics() |>
  add_annual_statistics() |>
  add_edge_table(start_date = '01/01/2020')

# Persist the cleaned dataset.
fsaveRDS(data, 'scholars', location = "./data/clean/")
Back to top