Combine Scrape Data
Getting Started
Functions
reshape_demographics()
Note
Reshapes the demographics table from long format (one row per scholar-year) to wide format (one row per scholar, with per-year columns for university, function, and discipline).
#' Reshape the demographics table from long to wide format.
#'
#' Collapses lecturer/researcher job titles into a single category, maps
#' Dutch discipline labels to English, then pivots so that each scholar
#' (uid) has one row with per-year columns for universiteit, functie,
#' functie2 and discipline.
#'
#' @param data A list holding a `demographics` data frame with at least the
#'   columns referenced below (date, functie, discipline, uid, ...).
#' @return The same list, with `data$demographics` replaced by the wide table.
reshape_demographics <- function(data) {
  # Per (uid, year) cell: keep the first non-missing observation. Shared by
  # all pivoted value columns (replaces four identical formula lambdas).
  first_non_na <- function(x) {
    if (all(is.na(x))) NA_character_ else first(na.omit(x))
  }

  data[['demographics']] <- data[['demographics']] |>
    mutate(
      # Two-digit year, e.g. 2021 -> 21L, used as the wide-column suffix.
      year = as.integer(year(date) - 2000),
      functie = case_when(
        str_detect(functie, 'Lecturer') ~ 'Researcher or Lecturer',
        str_detect(functie, 'Researcher') ~ 'Researcher or Lecturer',
        .default = functie
      ),
      discipline = case_when(
        str_detect(discipline, 'Sociologie') ~ "Sociology",
        str_detect(discipline, 'Politicologie') ~ 'Political Sciences',
        # Any other discipline is deliberately dropped to NA.
        .default = NA_character_
      )
    ) |>
    arrange(clean_name, year) |>
    select(
      -university, -google_scholar_id, -date,
      -naam, -clean_name_full, -email_adres
    ) |>
    pivot_wider(
      names_from = year,
      values_from = c(universiteit, functie, functie2, discipline),
      names_glue = "{.value}_{year}",
      values_fill = NA,
      # A single function here is applied to every values_from column,
      # equivalent to the previous per-column list of identical lambdas.
      values_fn = first_non_na
    ) |>
    distinct(uid, .keep_all = TRUE)
  data
}

parse_works()
Extract and clean per-UID works tables from the raw works list.
Note
Each entry in the raw works list is deduplicated; uids without a valid works table receive a single-row placeholder flagged has_works = FALSE, so every uid in demographics is represented by a tibble.
#' Normalize the raw works list to one tibble per uid.
#'
#' Deduplicates each scraped works table, then expands the list to the full
#' uid set from demographics: valid results are stamped with their uid,
#' flagged has_works = TRUE and deduplicated on work_id; missing or invalid
#' entries become a single-row placeholder with has_works = FALSE.
#'
#' @param data A list with `demographics` (supplying uid) and `works`
#'   (a named list of scrape results; entries may be non-data-frames).
#' @return The same list with `data$works` replaced by the normalized,
#'   uid-named list of tibbles.
parse_works <- function(data) {
  uids <- data[['demographics']] |> pull(uid) |> unique()
  works <- data[['works']]

  # 1. Deduplicate valid results; non-data-frame entries (e.g. scrape
  #    failures) pass through untouched. map() suffices: the previous
  #    imap() never used the index (.y).
  works <- works |>
    map(\(x) if (inherits(x, "data.frame")) distinct(x) else x)

  # 2. Placeholder row for uids with no (valid) works table.
  placeholder <- function(uid) {
    tibble(uid = uid, has_works = FALSE)
  }

  # 3. Expand to the full uid set, ensuring EVERY element is a tibble.
  data[['works']] <- setNames(
    map(uids, function(uid) {
      if (uid %in% names(works) && inherits(works[[uid]], "data.frame")) {
        works[[uid]] |>
          # .env$uid pins the loop variable; a bare `uid` on the RHS would
          # resolve to a same-named column if the tibble already has one.
          mutate(uid = .env$uid, .before = 1, has_works = TRUE) |>
          distinct(work_id, .keep_all = TRUE)
      } else {
        placeholder(uid)
      }
    }),
    uids
  )
  data
}

parse_scholars()
Split a combined OpenAlex author table into a per-UID list.
Note
Deduplicates each scraped OpenAlex author table and guarantees one tibble per uid; uids without a valid author match receive a single-row placeholder flagged has_author_id = FALSE.
#' Normalize the OpenAlex author list to one tibble per uid.
#'
#' Deduplicates each scraped author table, then expands the list to the full
#' uid set from demographics: valid results are stamped with their uid,
#' flagged has_author_id = TRUE and deduplicated on author_id; missing or
#' invalid entries become a placeholder with has_author_id = FALSE.
#'
#' @param data A list with `demographics` (supplying uid) and `scholars_oa`
#'   (a named list of scrape results; entries may be non-data-frames).
#' @return The same list with `data$scholars_oa` replaced by the normalized,
#'   uid-named list of tibbles.
parse_scholars <- function(data) {
  uids <- data[['demographics']] |> pull(uid) |> unique()
  scholars <- data[['scholars_oa']]

  # 1. Deduplicate valid results; non-data-frame entries (e.g. failed
  #    lookups) pass through untouched. map() suffices: the previous
  #    imap() never used the index (.y).
  scholars <- scholars |>
    map(\(x) if (inherits(x, "data.frame")) distinct(x) else x)

  # 2. Placeholder row for uids with no (valid) author table.
  placeholder <- function(uid) {
    tibble(uid = uid, has_author_id = FALSE)
  }

  # 3. Expand to the full uid set, ensuring EVERY element is a tibble.
  data[['scholars_oa']] <- setNames(
    map(uids, function(uid) {
      if (uid %in% names(scholars) && inherits(scholars[[uid]], "data.frame")) {
        scholars[[uid]] |>
          # .env$uid pins the loop variable; a bare `uid` on the RHS would
          # resolve to a same-named column if the tibble already has one.
          mutate(uid = .env$uid, .before = 1, has_author_id = TRUE) |>
          distinct(author_id, .keep_all = TRUE)
      } else {
        placeholder(uid)
      }
    }),
    uids
  )
  data
}