Data Preparation OpenAlex-ID
Tip
rough first working code for the collection of scholar_ids using openalexr
I optimized the algorithm to not just select the top case, but allows for multiple ids (in rows) per person, in the decision rules i use: - semantic similarity - matches of university_id and institution_id
Getting Started
dir = file.path('data', 'processed')
file = list.files(dir, pattern = 'names.Rds')
names = freadRDS(file.path(dir, file))
file = list.files(dir, pattern = 'scholarid.Rds')
scholar = freadRDS(file.path(dir, file)) |>
select(naam, university) |>
distinct(.keep_all = TRUE) |>
unnest_longer(university)
# |>
# left_join(names) |>
# unite(
# initials:last_name,
# col=name, sep = ' ',
# na.rm=TRUE, remove=FALSE) |>
# unite(
# c(name, maiden_name),
# col=name2, sep = '-',
# na.rm=TRUE, remove=FALSE)scholar = scholar |>
mutate(
university = university |>
str_replace("(Uni |uni)", "University "),
university_name = case_when(
str_detect(university, "VU") ~ "Free University Amsterdam",
str_detect(university, "(UvA|Uva)") ~ "University of Amsterdam",
str_detect(university, "Leiden") ~ "Leiden University",
university == 'RU' ~ "Radboud University Nijmegen",
str_detect(university, "(UU|UCU)") ~ "Utrecht University",
university == 'EUR' ~ "Erasmus University Rotterdam",
str_detect(university, '(RUG|UvG)') ~ "University of Groningen",
str_detect(university, '(UvT|Uvt|Tilburg)') ~ "University of Tilburg",
str_detect(university, 'WUR') ~ "Wageningen University & Research",
university == 'Uni Gothenburg' ~ "University of Gothenburg",
university == 'TU Delft' ~ "Technical University Delft",
str_detect(university, 'Milano') ~ 'University of Milan',
# fix for mistake in affilation for Sabine Mokry,
str_detect(university, 'Berlijn') ~ 'University of Hamburg',
str_detect(university, 'Trento') ~ 'University of Trento',
str_detect(university, 'Stockholm') ~ 'Stockholm University',
str_detect(university, 'Gent') ~ "Universiteit Gent",
university == "Politie" ~ NA_character_,
university == "UvH" ~ NA_character_,
.default = university
),
university_name2 = case_when(
str_detect(university, "VU") ~ "Vrije Universiteit Amsterdam",
str_detect(university, "(UvA|Uva)") ~ "Universiteit van Amsterdam",
str_detect(university, "Leiden") ~ "Leiden Universiteit",
university == 'RU' ~ "Radboud Universiteit Nijmegen",
str_detect(university, "(UU|UCU)") ~ "Universiteit Utrecht",
university == 'EUR' ~ "Erasmus Universiteit Rotterdam",
str_detect(university, '(RUG|UvG)') ~ "Rijksuniversiteit Groningen",
str_detect(university, '(UvT|Uvt|Tilburg)') ~ "Universiteit van Tilburg",
str_detect(university, 'WUR') ~ "Wageningen University & Research",
university == 'Uni Gothenburg' ~ "Universiteit van Gothenburg",
university == 'TU Delft' ~ "Technische Universiteit Delft",
str_detect(university, 'Milano') ~ 'Università degli Studi di Milano Statale',
# fix for mistake in affilation for Sabine Mokry,
str_detect(university, 'Berlijn') ~ 'Universität Hamburg',
str_detect(university, 'Trento') ~ 'Università degli Studi di Trento',
str_detect(university, 'Cologne') ~ 'Universität zu Köln',
str_detect(university, 'Stockholm') ~ 'Stockholms Universitet',
str_detect(university, 'Bocconi') ~ "Università Bocconi",
str_detect(university, 'Koc') ~ "Koç Üniversitesi",
str_detect(university, 'Gent') ~ "Ghent University",
str_detect(university, 'Gothenburg') ~ "Göteborgs universitet",
str_detect(university, "Turku") ~ "Turun Yliopisto",
str_detect(university, 'Lausane') ~ "Université de Lausanne",
str_detect(university, "Leipzig") ~ "Universität Leipzig",
str_detect(university, "Linköping") ~ "Linköpings universitet",
.default = NA_character_
)
) |>
drop_na() |>
pivot_longer(
cols = university_name:university_name2,
names_to = "var",
values_to = "university_name"
) |>
select(-var)
institutions = scholar$university_name |> unique()# scholar = scholar |>
# mutate(
# university_name = university |>
# str_replace("(Uni |uni)", "University ") |>
# str_replace('(UU)', 'Utrecht University') |>
# str_replace('(UCU)', 'Utrecht University') |>
# str_replace('(Leiden|Leiden University)', 'Leiden University') |>
# str_replace('(RUG)', 'University of Groningen') |>
# str_replace('(RU)', 'Radboud University Nijmegen') |>
# str_replace('(UvA|Uva)', 'University of Amsterdam') |>
# str_replace('(VU)', 'Free University Amsterdam') |>
# str_replace('(EUR)', 'Erasmus University Rotterdam') |>
# str_replace('(UvT|Tilburg|Uvt)', 'University of Tilburg') |>
# str_replace('(TU Delft)', 'Technical University Delft') |>
# str_replace('(WUR)', 'Wageningen University & Research') |>
# str_replace('(Universita degli studi di Milano)', 'University of Milan') |>
# str_replace('(Politie)', NA_character_) |>
# str_replace('(UvH)', NA_character_) |>
# str_replace('(UvG)', "University of Groningen") |>
# # fixing mistake for Sabine Mokry
# str_replace('University Berlijn', "University of Hamburg") |>
# str_replace('Trento University', "University of Trento") |>
# ) |>
# filter(!is.na(university_name))
# # institutions = c(
# # scholar$university_name |> unique(),
# # )
# institutions = scholar$university_name |> unique()oa_fetch_institution = function(institution, pause=0){
if (pause > 0) Sys.sleep(pause)
oa_fetch(
entity = "institutions",
search=institution,
mailto = "jos.slabbekoorn@ru.nl"
)$id[1]
}
hold = c()
for (institution in institutions){
id = oa_fetch_institution(institution)
hold[[institution]] = tibble::tibble(
university_name = institution,
university_url = id,
university_id = str_remove(id, 'https://openalex.org/')
)
}
institutions = bind_rows(hold) |>
drop_na()
scholar = scholar |>
left_join(institutions) |>
distinct(naam, university_id, .keep_all = TRUE)# library(dplyr)
library(stringdist)
library(stringi)
normalize_name = function(x) {
x = as.character(x)
# mark Cyrillic
has_cyr = stri_detect_charclass(x, "\\p{Script=Cyrillic}")
# transliterate only Cyrillic -> Latin
x[has_cyr] = stri_trans_general(x[has_cyr], "Cyrillic-Latin")
# strip accents for all, lowercase, trim, squish
x = stri_trans_general(x, "Latin-ASCII")
x = tolower(x)
x = str_squish(x)
trimws(x)
}
add_query_similarity = function(data){
data |>
mutate(
dn = normalize_name(display_name),
qn = normalize_name(query_name),
query_similarity = 1 - stringdist(dn, qn, method = "jw")
) |> # Jaro–Winkler
select(-dn, -qn)
}
# dubbel check op achternaam
# als daar voor nog wat voor staat komt dat overeen met de voornaam
# als de voornaam, komt dit overeen met initalenoa_fetch_scholar = function(
scholar_name,
university_ids,
pause = 0
){
# fetch oa scholar information for all name id pairs
hold = list()
for (id in university_ids){
if (pause > 0) Sys.sleep(pause)
hold[[id]] = tryCatch(
oa_fetch(
entity = 'author',
search = scholar_name,
affiliations.institution.id = id,
mailto = "jos.slabbekoorn@ru.nl"
),
error = function(e) NULL,
warning = function(w) NULL
)
}
res = bind_rows(hold)
# combine the results and drop duplicates
if (ncol(res) > 2){
res = res |>
distinct(.keep_all = TRUE) |>
mutate(query_name = scholar_name) |>
add_query_similarity() |>
arrange(-query_similarity) |>
head(15)
}
return(res)
}library(cli)
scholars = scholar$naam |> unique()
scrape_scholars = function(scholars){
# configure loop
k = length(scholars)
hold = list()
# loop over all scholars
cli_alert("Starting now, at {Sys.time()}")
cli_progress_bar("Scraping Scholars", total = k, clear = FALSE)
for (scholar_name in scholars) {
tab = scholar |>
filter(naam == scholar_name) |>
select(naam:university_id)
university_ids = as.list(na.omit(tab$university_id))
if (length(university_ids) > 0){
hold[[scholar_name]] = oa_fetch_scholar(scholar_name, university_ids)
}
cli_progress_update()
}
return(hold)
}
hold = scrape_scholars(scholars)clean institutions data
hold2 = list()
for (name_ in names(hold)){
if (is.na(name_) || !nzchar(trimws(name_))) next
tab = hold[[name_]]
# if there is a table, than update the table
if (nrow(tab) >= 1){
tab = tab |>
filter(query_name == name_) |>
distinct(.keep_all = TRUE) |>
mutate(
last_known_institutions = map(
last_known_institutions,
~ .x |>
distinct(.keep_all = TRUE) |>
mutate(
institution_id = str_remove(id, "^https://openalex.org/"),
institution_name = display_name
) |>
select(id, institution_id, institution_name, country_code)
),
institution_ids = map(last_known_institutions, ~ .x |> pull(institution_id) |> unique())
)
}
hold2[[name_]] = tab
}
# hold2sanity checks for institution_ids
hold3 = list()
for (name_ in names(hold2)){
tab = hold2[[name_]]
university_ids = scholar |>
filter(naam == name_) |>
select(university_id) |>
pull()
# for valid tables, match the institution ids with university ids
if (nrow(tab)> 0){
# ensure we have a list-column of character vectors (no NULLs)
ids_list = map(tab$institution_ids, ~ if (is.null(.x)) character(0) else as.character(.x))
# for each row, return indices of rows (excluding self) sharing any institution_id
matching_rows = map2(ids_list, seq_along(ids_list), \(ids, i){
if (length(ids) == 0) return(integer(0))
hits = which(map_lgl(ids_list, ~ length(intersect(ids, .x)) > 0))
hits[hits != i]
})
tab = tab |>
mutate(
# any match between this row's institutions and this scholar's uni_ids?
institution_matched = map_lgl(institution_ids, ~ {
x = .x; if (is.null(x)) x = character(0)
any(as.character(x) %in% university_ids)
}),
# vector of row numbers with overlapping institution_ids (across all other rows)
has_multiple_records = matching_rows
) |>
## decision rule which rows to keep
mutate(
keep = case_when(
query_similarity == 1.0 ~ TRUE,
!institution_matched & query_similarity >= 0.60 ~ TRUE,
institution_matched & query_similarity >= 0.45 ~ TRUE,
display_name == 'Thijs W. de Vos' ~ FALSE,
display_name == 'Kees van Kersbergen' ~ FALSE,
display_name == 'René W. van der Hulst' ~ FALSE,
display_name == 'Younes Zoughlami' ~ FALSE,
display_name == 'Younes Zeboudj' ~ FALSE,
display_name == 'Younes Saramifar' ~ FALSE,
display_name == 'van Dijk' ~ FALSE,
display_name == 'Stephanie Maas' ~ FALSE,
display_name == 'A.J.J. Nijhuis' ~ FALSE,
.default = FALSE
),
orcid = as.character(orcid)
) |>
filter(keep) |> select(-counts_by_year)
}
hold3[[name_]] = tab
}