Data Preparation OpenAlex-ID
Tip
Rough first working code for collecting scholar IDs with openalexR.
I optimized the algorithm so that it no longer selects only the top match but allows multiple IDs (one per row) per person. The decision rules I use are: - semantic similarity - matches between university_id and institution_id
Getting Started
# Folder with the processed inputs; `files` holds the bare file names found
# in it, which are matched by keyword below.
dir <- file.path('data', 'processed')
files <- list.files(dir)

# Collect every input table in one named list.
data <- c(
  # Processed tables: each entry is read from the file in `dir` whose name
  # contains the given keyword.
  lapply(
    c(
      scholars = 'scholarid',
      ethnicity = 'ethnicity',
      gender = 'gender',
      names = 'names'
    ),
    \(keyword) readRDS(file.path(dir, files[str_detect(files, keyword)]))
  ),
  # Raw files with fixed, dated names (presumably the OpenAlex author and
  # works extracts -- confirm).
  list(
    id = readRDS(file.path('data', 'raw_data', '20251015oascholars.Rds')),
    works = readRDS(file.path('data', 'raw_data', '20251016oaworks.Rds'))
  )
)
# scholars = fread(file.)
# Reshape the scholar records from long to wide: one column per field and
# two-digit year (e.g. `functie.22`), keeping only the 22, 24 and 25 columns.
dem <- data[['scholars']] |>
  # two-digit year (calendar year minus 2000), used as the column suffix
  mutate(year = as.integer(year(date) - 2000)) |>
  # rows are ordered by date within each name before pivoting (presumably so
  # that "first non-missing" below means the earliest record -- confirm)
  arrange(naam, date) |>
  select(-c(university, date, google_scholar_id)) |>
  pivot_wider(
    names_from = year,
    values_from = c(email_adres, universiteit, functie, discipline),
    names_glue = "{.value}.{year}",
    values_fill = NA,
    # how to collapse several records that land in the same output cell
    values_fn = list(
      # e-mail: every distinct non-missing address, joined with "; "
      email_adres = \(x) {
        if (any(!is.na(x))) {
          str_c(unique(na.omit(x)), collapse = "; ")
        } else {
          NA_character_
        }
      },
      # remaining fields: the first non-missing value, NA if there is none
      universiteit = \(x) {
        if (any(!is.na(x))) first(na.omit(x)) else NA_character_
      },
      functie = \(x) {
        if (any(!is.na(x))) first(na.omit(x)) else NA_character_
      },
      discipline = \(x) {
        if (any(!is.na(x))) first(na.omit(x)) else NA_character_
      }
    )
  ) |>
  select(naam, ends_with('22'), ends_with('24'), ends_with('25'))
# Add ids: attach one id per queried name to the wide demographics table
# `dem`, then merge in name parts, gender and ethnicity.
dems <- data$id |>
  bind_rows() |>
  # Sort each name's candidates by works_count, largest first. desc() is
  # required here: distinct() below keeps the FIRST row per name, and an
  # ascending sort would put the candidate with the fewest works first.
  arrange(query_name, desc(works_count)) |>
  select(query_name, id) |>
  # keep the row with the most works for every queried name
  distinct(query_name, .keep_all = TRUE) |>
  # merge in demographics; the right join keeps every row of `dem`, and
  # `naam` is the only column the two tables share
  rename(naam = query_name) |>
  right_join(dem, by = "naam") |>
  # merge in names (join keys are inferred from the shared column names)
  left_join(data$names) |>
  relocate(initials:maiden_name, .after = id) |>
  # merge in gender
  left_join(data$gender |> select(-count, -prob)) |>
  relocate(gender, .after = maiden_name) |>
  # merge in ethnicity
  left_join(data$ethnicity |> select(-name_count)) |>
  relocate(origin:dutch, .after = gender) |>
  # the joins above may duplicate rows; keep one row per name/id pair
  distinct(naam, id, .keep_all = TRUE) |>
  # NOTE(review): drop_na(id) on the next line removes every row for which
  # this flag would be FALSE, so has_oa_id is TRUE in all remaining rows --
  # confirm that dropping scholars without an id is intended.
  mutate(has_oa_id = !is.na(id)) |>
  drop_na(id) |>
  # if several names resolved to the same id, keep only the first of them
  distinct(id, .keep_all = TRUE)