used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 605304 32.4 1371547 73.3 NA 715785 38.3
Vcells 1118046 8.6 8388608 64.0 16384 2012131 15.4
Data Processing Gender
Getting Started
Functions
Call Meertens Voornamen Databank
is_ok = function(resp) resp_status(resp) >= 200 && resp_status(resp) < 300
request_gender = function(
first_name,
base = "https://nvb.meertens.knaw.nl/naam/is/",
pause = 0.5
){
# configure url for scraping
url = paste0(base, URLencode(tolower(first_name), reserved = TRUE))
# configure user agent
ua = paste(
"Mozilla/5.0 (Macintosh; Intel Mac OS X 15_5)",
"AppleWebKit/537.36 (KHTML, like Gecko)",
"Chrome/129.0.0.0 Safari/537.36"
)
req = request(url) |>
req_user_agent(ua) |>
req_timeout(30) |>
req_headers(
"Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language" = "nl,en;q=0.8"
) |>
# Retry on 429/5xx, and *also* on network hiccups:
req_retry(
max_tries = 4,
backoff = ~ runif(1, 0.5, 1.2) * (2 ^ (.x - 1)), # jittered exponential
is_transient = function(resp) {
code <- resp_status(resp)
isTRUE(code == 429L || (code >= 500L & code < 600L))
}
)
# polite pause + jitter
if (pause > 0) Sys.sleep(pause + runif(1, 0, 0.4))
# CRITICAL: don't throw on transport errors
resp <- tryCatch(
req_perform(req),
error = function(e) {
attr(e, "nvb_url") <- url
e
}
)
# Return a uniform list the caller can inspect
if (inherits(resp, "error")) {
return(list(ok = FALSE, status = NA_integer_, url = url, resp = NULL, error = resp))
}
list(ok = is_ok(resp), status = resp_status(resp), url = url, resp = resp, error = NULL)
}
Extract Gender Information
extract_gender_information = function(
resp,
first_name
){
# extract all the tables on the page
html = read_html(resp_body_string(resp))
tables = html_table(html, header=TRUE)
# select the first table if a table if found
if (length(tables) == 0) stop("No tables were found")
tab = tables[[1]]
# extract information from table
male_count = tab[1, 3] |> pull()
male_count = ifelse(male_count == '--', 0, as.numeric(male_count))
female_count = tab[5, 3] |> pull()
female_count = ifelse(female_count == '--', 0, as.numeric(female_count))
probability_male = male_count / (female_count + male_count)
# configure results table
res = tibble::tribble(
~first_name, ~male_count, ~female_count, ~probability_male,
first_name, male_count, female_count, probability_male
)
return(res)
}
Helper functions
safe_extract = purrr::possibly(
extract_gender_information,
otherwise = tibble::tibble(
first_name = NA_character_,
male_count = NA_integer_,
female_count = NA_integer_,
probability_male = NA_real_
)
)
get_gender_row = function(name, gender) {
# If cached, return from cache
if (name %in% gender$first_name) {
return(gender |> filter(first_name == name))
}
r <- request_gender(name)
# If transport error or HTTP not OK, surface status & keep going
if (!isTRUE(r$ok)) {
return(tibble(
first_name = NA_character_,
male_count = NA_integer_,
female_count = NA_integer_,
probability_male = NA_real_
))
}
out <- safe_extract(r$resp, name) |>
mutate(first_name = name)
out
}
Patch Difficult Names
patch_gender_on_splits = function(gender){
# set gender cache
gender_cache = gender |> drop_na()
# select authors without gender, and split their names
selection = gender |>
filter(is.na(male_count)) |>
mutate(first_name_split = str_split(first_name, ' ')) |>
unnest_longer(first_name_split) |>
select(first_name, first_name_split)
# get first names from selection
first_names = selection |>
select(first_name_split) |>
pull() |>
unique()
# patch gender --------------------------------------
gender_patch = purrr::map_dfr(
first_names, get_gender_row, gender = gender_cache
)
# aggregate gender information
gender_patch = selection |>
left_join(
gender_patch,
by=join_by(first_name_split == first_name)
) |>
drop_na() |>
# take the average gender count and probablity
# for names where both splits yielded a gender
# result
group_by(first_name) |>
summarise(
male_count = as.integer(mean(male_count)),
female_count = as.integer(mean(female_count)),
probability_male = mean(probability_male)
) |>
ungroup()
gender |> rows_update(gender_patch)
}
Clean Gender Information
Patch Missing Gender
patch_missing_gender = function(data){
female_names = c(
"Alaxandra", "Alinson", "Avyanthi",
"Brunilda", "Busisiwe", "Diliara",
"Dolive", "Echo", "Guangyu",
# mistake in name, has been patched with _create_name_corrections
"Guangye",
"Gul-i-Hina", "Haebin", "Haisu",
# Phoebe Kisibi Mbasalaki was incorrectly coded, has been patched with _create_name_corrections
"Kisubi", "Pheobe",
"Liubov", "Madalina", "Majolijn",
"Mansoureh", "Nankyung", "Nilmawati",
"Nodira", "Noyonika", "Radostina",
"Rojika", "Rozenmarijn", "Sayoni",
"Seonoki", "Shelliann", "Shiming",
"Siggie", "Siztine", "Sungmi",
"Talinta", "Teana", "Xingna",
"Yuliia", "Zhiyi"
)
male_names = c(
"Alborno", "Chenchen", "Chendi",
"Chunglin", "Chuyu", "Diliara",
"Gjovalin", "Kirils", "Kyohee",
"Madhud", "Quichen", "Soeren",
"Teana", "Tanzhe", "Vishwesh",
"Weverthon"
)
data |>
mutate(
gender = case_when(
!is.na(gender) ~ gender,
first_name %in% female_names ~ 'female',
first_name %in% male_names ~ 'male',
.default = gender
)
)
}
genderize_names = function(idx) {
# load cached gender information
gender_cache = readRDS(file.path('data', 'utils', "genderizer_cache.Rds"))
# load api key from secrets file
dotenv::load_dot_env()
APIKEY <- Sys.getenv("GENDERIZE_API_KEY")
# select uncached names
idx = idx |> filter(!term %in% gender_cache$name)
# select first_names
first_names = idx$term |> na.omit() |> unique()
first_names = first_names
# fetch gender results
hold = c()
for (name in first_names){
resp = genderizeAPI(name, apikey = APIKEY)
hold[[name]] = resp$response
}
# combine cache with results and put new cache results
res = bind_rows(gender_cache, bind_rows(hold)) |>
distinct(.keep_all = TRUE)
saveRDS(res, file.path('data', 'utils', "genderizer_cache.Rds"))
return(res)
}
scrape_gender = function(idx) {
# load gender cache
gender_cache = readRDS(file.path("data", "utils", "nvb_gender.Rds")) |>
drop_na()
first_names = idx |> pull(first_name) |> unique() |> na.omit()
# scrape gender results
res = purrr::map_dfr(
first_names,
get_gender_row,
gender = gender_cache
) |>
patch_gender_on_splits()
# put gender scrape results
saveRDS(res |> drop_na(), file.path('data', 'utils', "nvb_gender.Rds"))
# clean gender results
res = res |>
clean_gender() |>
patch_missing_gender()
return(res)
}
harmonize_gender = function(gender) {
gender |>
select(
first_name, term, starts_with('gender'),
starts_with('count'), starts_with('prob')
) |>
distinct(first_name, term, .keep_all=TRUE) |>
mutate(
has_multiple = str_detect(first_name, '( |-)'),
has_mismatch = gender.x != gender.y
) |>
drop_na(first_name) |>
filter(!(has_multiple & has_mismatch & str_detect(term, '.'))) |>
mutate(
gender = case_when(
is.na(gender.y) ~ gender.x,
.default = gender.y
),
prob = case_when(
is.na(probability) ~ prob,
.default = probability
),
count = case_when(
is.na(count.y) ~ count.x,
.default = count.y
)
) |>
group_by(first_name) |>
summarise(
gender = first(gender),
prob = mean(prob),
count = sum(count)
)
}
Application
dir = file.path('data', 'processed')
file = list.files(dir, pattern = 'names.Rds')[[1]]
names = readRDS(file.path(dir, file))
idx = names |>
mutate(term = first_name |>
str_to_lower() |>
str_split('( |-)')
) |>
unnest_longer(term)
res = scrape_gender(idx)
Matching, by = "first_name"
res2 = genderize_names(idx)
gender = idx |>
left_join(res) |>
left_join(res2, by=join_by(term == name), suffix = c(".x", ".y")) |>
harmonize_gender()
Joining with `by = join_by(first_name)`
[1] "SAVING: ./data/processed/20251021gender.Rds"