Data Processing Gender

Published

October 21, 2025

Getting Started

# clear the global environment
rm(list = ls())
gc()
          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells  605304 32.4    1371547 73.3         NA   715785 38.3
Vcells 1118046  8.6    8388608 64.0      16384  2012131 15.4
# load custom functions
source("src/utils/custom_functions.r")

# load and activate packages
fpackage.check(c(
  'tidyverse', 'readxl',  'stringr', 
  'lubridate', 'httr2', 'rvest', 'xml2',
  "purrr", "genderizeR", "dotenv"
))

Functions

Call Meertens Voornamen Databank

is_ok = function(resp) resp_status(resp) >= 200 && resp_status(resp) < 300

request_gender = function(
    first_name,
    base = "https://nvb.meertens.knaw.nl/naam/is/",
    pause = 0.5
  ){
  # configure url for scraping
  url  = paste0(base, URLencode(tolower(first_name), reserved = TRUE))

  # configure user agent
  ua = paste(
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 15_5)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/129.0.0.0 Safari/537.36"
  )

  req = request(url) |>
    req_user_agent(ua) |>
    req_timeout(30) |>
    req_headers(
      "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      "Accept-Language" = "nl,en;q=0.8"
    ) |>
    # Retry on 429/5xx, and *also* on network hiccups:
    req_retry(
      max_tries = 4,
      backoff = ~ runif(1, 0.5, 1.2) * (2 ^ (.x - 1)),  # jittered exponential
      is_transient = function(resp) {
        code <- resp_status(resp)
        isTRUE(code == 429L || (code >= 500L & code < 600L))
      }
    )

  # polite pause + jitter
  if (pause > 0) Sys.sleep(pause + runif(1, 0, 0.4))

  # CRITICAL: don't throw on transport errors
  resp <- tryCatch(
    req_perform(req),
    error = function(e) {
      attr(e, "nvb_url") <- url
      e
    }
  )

  # Return a uniform list the caller can inspect
  if (inherits(resp, "error")) {
    return(list(ok = FALSE, status = NA_integer_, url = url, resp = NULL, error = resp))
  }

  list(ok = is_ok(resp), status = resp_status(resp), url = url, resp = resp, error = NULL)
}

Extract Gender Information

extract_gender_information = function(
    resp,
    first_name
  ){
  # extract all the tables on the page
  html = read_html(resp_body_string(resp))
  tables = html_table(html, header=TRUE)

  # select the first table if a table if found
  if (length(tables) == 0) stop("No tables were found")
  tab = tables[[1]]

  # extract information from table
  male_count = tab[1, 3] |> pull()
  male_count = ifelse(male_count == '--', 0, as.numeric(male_count))
  female_count = tab[5, 3] |> pull() 
  female_count = ifelse(female_count == '--', 0, as.numeric(female_count))
  probability_male = male_count / (female_count + male_count) 

  # configure results table
  res = tibble::tribble(
    ~first_name, ~male_count, ~female_count, ~probability_male,
    first_name, male_count,  female_count,  probability_male
  )
  return(res)
}

Helper functions

safe_extract = purrr::possibly(
  extract_gender_information,
  otherwise = tibble::tibble(
    first_name       = NA_character_,
    male_count       = NA_integer_,
    female_count     = NA_integer_,
    probability_male = NA_real_
  )
)

get_gender_row = function(name, gender) {
  # If cached, return from cache
  if (name %in% gender$first_name) {
    return(gender |> filter(first_name == name))
  }

  r <- request_gender(name)

  # If transport error or HTTP not OK, surface status & keep going
  if (!isTRUE(r$ok)) {
    return(tibble(
      first_name       = NA_character_,
      male_count       = NA_integer_,
      female_count     = NA_integer_,
      probability_male = NA_real_
    ))
  }

  out <- safe_extract(r$resp, name) |>
    mutate(first_name = name)

  out
}

Patch Difficult Names

patch_gender_on_splits = function(gender){
  # set gender cache
  gender_cache = gender |> drop_na()
  # select authors without gender, and split their names
  selection = gender |>
    filter(is.na(male_count)) |>
    mutate(first_name_split = str_split(first_name, ' ')) |>
    unnest_longer(first_name_split) |>
    select(first_name, first_name_split)

  # get first names from selection
  first_names = selection |>
    select(first_name_split) |>
    pull() |>
    unique()

  # patch gender --------------------------------------
  gender_patch = purrr::map_dfr(
    first_names, get_gender_row, gender = gender_cache
  )

  # aggregate gender information
  gender_patch = selection |>
    left_join(
      gender_patch, 
      by=join_by(first_name_split == first_name)
    ) |>
    drop_na() |>
    # take the average gender count and probablity
    # for names where both splits yielded a gender
    # result
    group_by(first_name) |>
    summarise(
      male_count = as.integer(mean(male_count)),
      female_count = as.integer(mean(female_count)),
      probability_male = mean(probability_male)
    ) |>
    ungroup() 

  gender |> rows_update(gender_patch)
}

Clean Gender Information

clean_gender = function(data){
  data |>
    mutate(
      gender = ifelse(
        probability_male >= 0.5,
        'male','female'
      ),
      count = ifelse(
        gender == 'male',
        male_count, female_count
      ),
      prob = ifelse(
        gender == 'male',
        probability_male, 1 - probability_male
      )
    ) |>
    select(first_name, gender, prob, count)
}

Patch Missing Gender

patch_missing_gender = function(data){
  female_names = c(
    "Alaxandra",   "Alinson",     "Avyanthi",
    "Brunilda",    "Busisiwe",    "Diliara",      
    "Dolive",      "Echo",        "Guangyu",
    # mistake in name, has been patched with _create_name_corrections
    "Guangye", 
    "Gul-i-Hina",  "Haebin",      "Haisu",
    # Phoebe Kisibi Mbasalaki was incorrectly coded, has been patched with _create_name_corrections 
    "Kisubi",      "Pheobe",
    "Liubov",      "Madalina",    "Majolijn",    
    "Mansoureh",   "Nankyung",    "Nilmawati",   
    "Nodira",      "Noyonika",    "Radostina",   
    "Rojika",      "Rozenmarijn", "Sayoni",
    "Seonoki",     "Shelliann",   "Shiming",     
    "Siggie",      "Siztine",     "Sungmi",      
    "Talinta",     "Teana",       "Xingna",
    "Yuliia",      "Zhiyi"
  )

  male_names = c(
    "Alborno",     "Chenchen",    "Chendi",
    "Chunglin",    "Chuyu",       "Diliara",
    "Gjovalin",    "Kirils",      "Kyohee",
    "Madhud",      "Quichen",     "Soeren",
    "Teana",       "Tanzhe",     "Vishwesh",
    "Weverthon"
  )

  data |>
    mutate(
      gender = case_when(
        !is.na(gender) ~ gender,
        first_name %in% female_names ~ 'female',
        first_name %in% male_names ~ 'male',
        .default = gender
      )
    )
}
genderize_names = function(idx) {
  # load cached gender information
  gender_cache = readRDS(file.path('data', 'utils', "genderizer_cache.Rds"))

  # load api key from secrets file
  dotenv::load_dot_env()
  APIKEY <- Sys.getenv("GENDERIZE_API_KEY")

  # select uncached names
  idx =  idx |> filter(!term %in% gender_cache$name)

  # select first_names
  first_names = idx$term |> na.omit() |> unique()
  first_names = first_names

  # fetch gender results
  hold = c()
  for (name in first_names){
    resp = genderizeAPI(name, apikey = APIKEY)
    hold[[name]] = resp$response
  }

  # combine cache with results and put new cache results
  res = bind_rows(gender_cache, bind_rows(hold)) |>
    distinct(.keep_all = TRUE)
  saveRDS(res, file.path('data', 'utils', "genderizer_cache.Rds"))

  return(res)
}
scrape_gender = function(idx) {
  # load gender cache
  gender_cache = readRDS(file.path("data", "utils", "nvb_gender.Rds")) |> 
    drop_na()

  first_names = idx |> pull(first_name) |> unique() |> na.omit()

  # scrape gender results
  res = purrr::map_dfr(
      first_names, 
      get_gender_row, 
      gender = gender_cache
    ) |>
    patch_gender_on_splits() 

  # put gender scrape results
  saveRDS(res |> drop_na(), file.path('data', 'utils', "nvb_gender.Rds"))

  # clean gender results
  res = res |> 
    clean_gender() |>
    patch_missing_gender()

  return(res)
}
harmonize_gender = function(gender) {
  gender |>
    select(
      first_name, term, starts_with('gender'), 
      starts_with('count'), starts_with('prob')
    ) |>
    distinct(first_name, term, .keep_all=TRUE) |>
    mutate(
      has_multiple = str_detect(first_name, '( |-)'),
      has_mismatch = gender.x != gender.y
    ) |>
    drop_na(first_name) |>
    filter(!(has_multiple & has_mismatch & str_detect(term, '.'))) |>
    mutate(
      gender = case_when(
        is.na(gender.y) ~ gender.x,
        .default = gender.y
      ),
      prob = case_when(
        is.na(probability) ~ prob,
        .default = probability
      ),
      count = case_when(
        is.na(count.y) ~ count.x,
        .default = count.y
      )
    ) |>
    group_by(first_name) |>
    summarise(
      gender = first(gender),
      prob = mean(prob), 
      count = sum(count)
    )
}

Application

dir = file.path('data', 'processed')
file = list.files(dir, pattern = 'names.Rds')[[1]]
names = readRDS(file.path(dir, file))

idx = names |>
  mutate(term = first_name |> 
    str_to_lower() |> 
    str_split('( |-)')
  ) |>
  unnest_longer(term)
res = scrape_gender(idx)
Matching, by = "first_name"
res2 = genderize_names(idx)

gender = idx |>
  left_join(res) |>
  left_join(res2, by=join_by(term == name), suffix = c(".x", ".y")) |>
  harmonize_gender()
Joining with `by = join_by(first_name)`
fsaveRDS(gender, "gender")
[1] "SAVING: ./data/processed/20251021gender.Rds"