Clean Data
Getting Started
Functions
prune_works()
Filter each scholar’s works to a set of sufficiently common work types and a minimum publication year.
Arguments
data
Named list that contains a
`works` element (list of works tibbles). Each works tibble is expected to include at least `type` and `publication_year` (and typically `uid`, `work_id`, etc.).
Method
The function first binds all works together to compute type frequencies. It keeps only types with at least 50 occurrences and explicitly excludes dataset. It then iterates over data[['works']], coercing each element to a tibble, returning an empty tibble when required columns are missing, and filtering to valid types and publication_year >= 1980, sorting each result by publication_year. Scholars whose filtered works tibble is empty are retained (with an empty tibble), and the updated data is returned.
prune_works <- function(data){
  # Restrict every scholar's works to sufficiently common work types
  # published in 1980 or later.
  #
  # `data` is a named list with a `works` element (list of works tibbles);
  # each tibble should carry at least `type` and `publication_year`.
  # Returns `data` with the filtered `works` list.

  # Tally work types across all scholars. Types with fewer than 50 hits
  # (retraction: 1, libguides: 18, reference-entry: 39) are excluded, as
  # are datasets (991).
  valid_types <- data[['works']] |>
    bind_rows() |>
    count(type, name = "n") |>
    filter(n >= 50, type != 'dataset') |>
    pull(type)
  data[["works"]] <- map(data[["works"]], \(works) {
    works <- as_tibble(works)
    # without the required columns we cannot filter; keep the schema but
    # return zero rows
    needed <- c("type", "publication_year")
    if (!all(needed %in% names(works))) {
      return(works[0, , drop = FALSE])
    }
    works |>
      filter(type %in% valid_types, publication_year >= 1980) |>
      arrange(publication_year)
  })
  return(data)
}clean_demographics()
Attach region-of-origin metadata and standardize/rename demographics variables to a compact schema.
Arguments
data
Named list that contains a demographics element (a tibble with at least
`uid` and `country_id`, plus the columns that are renamed in the function.
Method
The function reads a country-to-region lookup table from data/utils/country_conversion.csv (keeping distinct alpha_2 and region) and harmonizes the region labels. It then left-joins this lookup onto data[['demographics']] by matching country_id to alpha_2, moves the joined region directly before country_id and renames it to origin, and renames the Dutch variables to English equivalents (e.g., universiteit_22 → university_22, functie_22 → position_22). The updated list is returned.
clean_demographics <- function(data){
  # Attach region-of-origin metadata and rename the Dutch demographics
  # variables to an English schema.
  #
  # `data` is a named list with a `demographics` tibble containing at least
  # `country_id` plus the universiteit_* / functie_* columns renamed below.
  # Returns `data` with the cleaned `demographics` tibble.
  #
  # Fixes vs. previous revision: `<-` instead of `=` for top-level
  # assignment (consistent with the other definitions in this file), and
  # the identity branch `region == 'Netherlands' ~ 'Netherlands'` removed
  # (`.default = region` already produces it).
  # NOTE(review): confirm whether a `prob -> gender_prob` rename and a
  # final `select()` were intended here; neither is present.

  # get country (alpha-2 code) to region lookup table
  country_to_region <- read_csv(
    file.path('data', 'utils', 'country_conversion.csv'),
    show_col_types = FALSE
  ) |>
    distinct(alpha_2, region) |>
    mutate(
      # harmonize the combined '(Asia|Oceania)' label into a readable one
      region = case_when(
        region == '(Asia|Oceania)' ~ 'Asia and Oceania',
        .default = region
      )
    )
  # add region to demographics and clean the dataframe
  data[['demographics']] <- data[['demographics']] |>
    left_join(country_to_region, by = join_by(country_id == alpha_2)) |>
    relocate(region, .before = country_id) |>
    rename(
      origin = region,
      university_22 = universiteit_22,
      university_24 = universiteit_24,
      university_25 = universiteit_25,
      position_22 = functie_22,
      position_24 = functie_24,
      position_25 = functie_25,
      position2_22 = functie2_22,
      position2_24 = functie2_24,
      position2_25 = functie2_25
    )
  return(data)
}get_citation_counts()
Expand nested per-work citation histories to a complete annual panel and compute cumulative citation counts.
Arguments
data
Named list that contains a works element (list of works tibbles with at least
`uid`, `work_id`, `publication_year`, and `counts_by_year`).
Method
The function binds all works together and unnests counts_by_year into long form, widening it into counts_by_year_year and counts_by_year_cited_by_count, which are renamed to year and citations. It coerces publication_year, year, and citations to integers, replaces missing citation counts with 0, and falls back to publication_year when year is missing. It then completes a full sequence of years from the earliest observed year through 2026 per (uid, work_id), filling missing citations with 0, fills publication_year and cited_by_count down/up within each work, and computes cum_citations as the cumulative sum of annual citations per work, shifted by a per-work offset so that the final cumulative value matches the work's lifetime cited_by_count. The resulting long tibble is returned (not re-attached to data).
get_citation_counts <- function(data){
# Expand each work's nested `counts_by_year` into a long (uid, work_id,
# year) panel, complete the year sequence through 2026, and compute
# cumulative citations per work. Returns the long tibble directly; it is
# NOT re-attached to `data`.
data[['works']] |>
# drop scholars whose works tibble is empty before row-binding
keep(~ nrow(.x) > 0) |>
bind_rows() |>
# unpack citation counts
unnest_longer(counts_by_year) |>
unnest_wider(counts_by_year, names_sep = "_") |>
rename(
year = counts_by_year_year,
citations = counts_by_year_cited_by_count
) |>
mutate(
publication_year = as.integer(publication_year),
year = as.integer(year),
citations = as.integer(citations),
# missing annual counts mean no recorded citations that year
citations = coalesce(citations, 0L),
# works with no counts_by_year entries keep one row anchored at their
# publication year
year = coalesce(year, publication_year) # <-- key line
) |>
arrange(uid, publication_year, year) |>
select(uid, work_id, publication_year, year, citations, cited_by_count) |>
# expand time series
# per (uid, work_id): fill in every year from the first observed year
# through 2026, with zero citations for the inserted years
group_by(uid, work_id) |>
complete(
year = full_seq(min(year):2026, 1),
fill = list(citations = 0L)
) |>
# inserted rows have NA metadata; copy it from the observed rows
fill(c(publication_year, cited_by_count), .direction = "downup") |>
arrange(publication_year, year, .by_group = TRUE) |>
mutate(
cum_citations = cumsum(citations),
# add the offset to cumulative citation counts: the annual counts only
# cover recent years, so shift the running total to end at the work's
# lifetime total (cited_by_count)
cited_offset = cited_by_count - max(cum_citations),
cum_citations = cum_citations + cited_offset
) |>
ungroup()
}add_annual_statistics()
Compute yearly publication/citation metrics (h-index, i10-index, citations, publications) per scholar and attach them to demographics.
Arguments
data
Named list that contains works (list of works tibbles with citation histories) and demographics (tibble with
uid).
Method
The function first calls get_citation_counts() to produce a per-work, per-year panel, then groups by (uid, year) to compute per-year cumulative publication counts. It derives the yearly i10-index by counting works with cum_citations > 10 and the yearly h-index using a simple per-year summary based on the relationship between cumulative citations and cumulative publications. It also aggregates total cumulative citations and number of distinct publications per year. These statistics are merged, augmented with one- and two-year forward differences, filtered to years 2016–2026, nested into stats_by_year per uid, and then left-joined onto data$demographics together with per-scholar aggregates (career_start and selected yearly index values). The updated list is returned.
add_annual_statistics <- function(data){
# Compute yearly publication/citation statistics per scholar and attach
# them (nested per year, plus a few scalar aggregates) to
# data$demographics. Returns the updated list.
citation_counts <- data |>
get_citation_counts() |>
group_by(uid, year) |>
# number of distinct works a scholar has in the panel by this year;
# constant within each (uid, year) group
mutate(cum_publications = n_distinct(work_id))
# calculate i10_indices
# NOTE(review): the conventional i10-index counts works with AT LEAST 10
# citations (>= 10); this uses a strict > 10 — confirm which is intended.
i10_index_by_year <- citation_counts |>
mutate(i10 = cum_citations > 10) |>
group_by(uid, year) |>
summarise(i10_index = sum(i10, na.rm = TRUE)) |>
mutate(year = as.integer(year))
# # calculate h_indices
# NOTE(review): this is not the standard h-index (largest h such that h
# works have >= h citations each); it counts works whose cumulative
# citations exceed the scholar's cumulative publication count. The Method
# text above describes it as a "simple summary" — confirm intent.
h_index_by_year <- citation_counts |>
group_by(uid, year) |>
summarise(h_index = sum(cum_citations > cum_publications, na.rm = TRUE)) |>
mutate(year = as.integer(year))
# add cum_citations - max(citations) = a
# aggregate citations and publications
counts <- citation_counts |>
group_by(uid, year) |>
summarize(
citations = sum(cum_citations),
publications = n_distinct(work_id)
)
# merge the yearly series and add forward differences. `df` inherits the
# `uid` grouping from the summarise() outputs, so lead()/roll_sum() run
# within each scholar; rows are ordered by (uid, year) from summarise().
df <- h_index_by_year |>
full_join(i10_index_by_year, by = c("uid", "year")) |>
full_join(counts, by = c("uid", "year")) |>
mutate(
# d_* = next year's value minus this year's (forward difference);
# d2_* = two-year forward change, falling back to d_* where the
# rolling window is incomplete.
# NOTE(review): the i10 delta defaults to NA at the series edge while
# citations/publications default to 0 — confirm the inconsistency.
d_i10_index = lead(i10_index, 1, default = NA) - i10_index,
d2_i10_index = RcppRoll::roll_sum(x = d_i10_index, 2, align = "right", fill = NA),
d2_i10_index = coalesce(d2_i10_index, d_i10_index),
d_citations = lead(citations, 1, default = 0) - citations,
d2_citations = RcppRoll::roll_sum(x = d_citations, 2, align = "right", fill = NA),
d2_citations = coalesce(d2_citations, d_citations),
d_publications = lead(publications, 1, default = 0) - publications,
d2_publications = RcppRoll::roll_sum(x = d_publications, 2, align = "right", fill = NA),
d2_publications = coalesce(d2_publications, d_publications)
)
# nest statistics per uid, keeping years 2016 through 2026
statistics <- df |>
filter(year > 2015, year <= 2026) |>
arrange(uid, year) |>
group_by(uid) |>
nest(stats_by_year = c(
year, h_index,
i10_index, d_i10_index, d2_i10_index,
citations, d_citations, d2_citations,
publications, d_publications, d2_publications
)) |>
ungroup()
# aggregate statistics: first panel year plus selected yearly index values
aggregated <- df |>
group_by(uid) |>
summarise(
career_start = min(year, na.rm = TRUE),
h_index_22 = h_index[year == 2022][1],
i10_index_21 = i10_index[year == 2021][1],
i10_index_23 = i10_index[year == 2023][1],
i10_index_25 = i10_index[year == 2025][1],
.groups = "drop"
)
# add statistics to demographics; joins fall back to the shared `uid`
# column (dplyr prints a "Joining with ..." message without an explicit by)
data$demographics <- data$demographics |>
left_join(statistics) |>
left_join(aggregated)
return(data)
}add_edge_table()
Build an edge table of co-authorships from nested authorship data.
Arguments
data
Named list that contains a
`works` element (list of works tibbles with `uid`, `work_id`, `authorships`, `publication_date`, `publication_year`, `type`).
Method
The function binds all per-scholar works together, selecting uid, work_id, authorships, publication_date, publication_year, and type. It renames authorships to a and fully unnests a into separate columns with a names separator. It then filters to rows where a__uid (the co-author's uid) is non-missing and publication_date is on or after start_date (defaulting to 1980-01-01 when start_date is NA; character values are parsed with dmy()). The resulting tibble, stored as data[['edge_table']], represents a long edge list that links focal uid to their co-authors per work, along with basic publication metadata. The updated list is returned.
add_edge_table <- function(data, start_date = NA){
  # Build a long co-authorship edge list from the nested `authorships`
  # column and store it as data[['edge_table']]. Each row links a focal
  # scholar (uid) to one co-author (a__uid) on one work, with basic
  # publication metadata. Works dated before `start_date` are dropped.
  #
  # `start_date` may be NA (defaults to 1980-01-01), a "dd/mm/yyyy"
  # string (parsed with dmy()), or an already-parsed date.
  if (is.na(start_date)) {
    start_date <- dmy("01/01/1980")
  } else if (is.character(start_date)) {
    start_date <- dmy(start_date)
  }
  edges <- data[['works']] |>
    # ignore scholars with no remaining works
    keep(~ nrow(.x) > 0) |>
    bind_rows() |>
    select(uid, work_id, authorships, publication_date, publication_year, type) |>
    rename(a = authorships) |>
    unnest(a, names_sep = "__", keep_empty = FALSE) |>
    # keep only rows with a known co-author uid inside the date window
    filter(!is.na(a__uid), publication_date >= start_date)
  data[['edge_table']] <- edges
  return(data)
}check_for_valid_coauthors = function(
# data: named list with `works` and `scholars_oa` elements
data,
# date window bounds, parsed with lubridate::dmy ("dd/mm/yyyy")
start_date = dmy("19/12/2020"),
end_date = dmy("01/02/2026")
){
# Flag OpenAlex author_ids that have no valid co-authors, and join the
# per-author_id co-author counts onto each scholars_oa tibble.
# NOTE(review): `publication_date <= start_date` looks inverted — as
# written the filter keeps everything up to start_date, which makes the
# `< end_date` condition redundant (2020 < 2026). Presumably `>=` was
# intended to select the [start_date, end_date) window — confirm.
tab = data[['works']] |>
# create a coauthor table
keep(~ nrow(.x) > 0) |>
bind_rows() |>
select(uid, author_id, work_id, authorships, publication_date, publication_year, type) |>
rename(a = authorships) |>
unnest(a, names_sep = "__", keep_empty=FALSE) |>
# trailing comma below is tolerated by dplyr's dynamic dots
filter(
(publication_date <= start_date),
(publication_date < end_date),
) |>
# one row per (scholar, co-author) pair; this also materializes a column
# literally named "coalesce(a__uid, a__author_id)" that is never used
distinct(uid, coalesce(a__uid, a__author_id), .keep_all = TRUE) |>
# calculate the number of unique coauthors per author_id
group_by(author_id) |>
summarize(
uid = first(uid),
n_coauthors = n_distinct(coalesce(a__uid, a__author_id)),
# co-authors with a known uid (i.e., scholars in our own sample)
n_valid_coauthors = n_distinct(a__uid, na.rm = TRUE)
) |>
arrange(uid, author_id) |>
# calculate the number of author_ids and indicate whether author_id is not
# connected to valid co_authors
group_by(uid) |>
mutate(
# restore the full OpenAlex URL so it matches scholars_oa's author_id
author_id = paste0('https://openalex.org/', author_id),
n_oaids = n_distinct(author_id),
# an author_id is suspect only when the scholar has several OA ids and
# this one connects to zero valid co-authors
valid_author_id = !((n_oaids > 1) & (n_valid_coauthors == 0))
) |>
ungroup() |>
select(-uid)
# attach the counts to each scholars_oa tibble that carries an author_id
data[['scholars_oa']] = map(data[['scholars_oa']], \(df) {
if ("author_id" %in% names(df)) {
left_join(df, tab, by = "author_id")
} else {
df
}
})
return(data)
}exclude_works_from_invalid_authors = function(data){
# Keep only works recorded under author_ids not flagged as invalid by
# check_for_valid_coauthors(). An author_id passes when valid_author_id is
# TRUE or NA (never checked); only explicit FALSE is excluded.
# Fixes vs. previous revision: the filter used
# `valid_author_id %in% c(TRUE, NA_character_)`, which only worked via a
# silent logical->character coercion; the explicit test below is
# behavior-identical. Local assignments use `<-` for consistency.
valid_author_ids <- data[['scholars_oa']] |>
  bind_rows() |>
  filter(is.na(valid_author_id) | valid_author_id) |>
  # strip the URL prefix so ids match the bare author_id in works tibbles
  mutate(author_id = str_remove(author_id, "https://openalex.org/")) |>
  pull(author_id) |>
  unique() |>
  na.omit()
data[['works']] <- data[['works']] |>
  map(\(df) {
    # works tibbles without an author_id column are left untouched
    if ('author_id' %in% names(df)) {
      df |> filter(author_id %in% valid_author_ids)
    } else {
      df
    }
  })
return(data)
}