Data Processing Names

Published

October 21, 2025

Getting Started

# clear the global environment
rm(list = ls())
gc()
source("src/utils/custom_functions.r")

# load and activate packages
fpackage.check(c(
  'tidyverse', 'readxl',  'stringr', 
  'lubridate', 'stringi'
))

Functions

Parse Names

parse_names = function(names){
  particles <- c(
    "de","den","der","het","te","ten","ter",
    "van","van de","van den","van der","van 't","van ’t",
    "'t","’t",
    "von","von der","von den",
    "la","le","du","del","della","di","da","dos","das","de la","de los","de las",
    "zu","zum","zur", "op de"
  )

  # Normalization helper for matching
  .normalize <- function(x) {
    x |>
      str_squish() |>
      str_replace_all("’", "'") |>
      str_to_lower() |>
      stri_trans_general("Latin-ASCII")
    }

  p_norm <- .normalize(particles)

  names |>
    mutate(
      tokens_raw = str_split(naam, "\\s+"),
      tokens_norm = map(tokens_raw, ~ .normalize(.x)),
      n = map_int(tokens_raw, length),

      # find the longest particle match immediately before the final token
        particle_idx = map2_int(tokens_norm, n, function(tok, n_tok) {
          if (n_tok < 2) return(NA_integer_)
          # check 3, 2, 1-token particles that end at position n_tok-1
          for (k in 3:1) {
            start <- n_tok - k
            end   <- n_tok - 1
            if (start >= 1) {
              cand <- paste(tok[start:end], collapse = " ")
              if (cand %in% p_norm) return(start)
            }
          }
          NA_integer_
        }),

        has_particle = !is.na(particle_idx),

        first_name = pmap_chr(
          list(tokens_raw, particle_idx, n),
          function(tok, p_i, n_tok) {
            end_giv <- if (is.na(p_i)) n_tok - 1 else p_i - 1
            if (end_giv <= 0) tok[1] else paste(tok[seq_len(end_giv)], collapse = " ")
          }
        ),

        particle = pmap_chr(
            list(tokens_raw, tokens_norm, particle_idx, n),
            function(tok_raw, tok_norm, p_i, n_tok) {
              if (is.na(p_i)) return(NA_character_)
              # output particle in lowercase, canonicalised apostrophes
              out <- paste(tok_norm[p_i:(n_tok-1)], collapse = " ")
              out
            }
          ) |> str_to_lower(),

        last_name = pmap_chr(
          list(tokens_raw, n),
          function(tok, n_tok) tok[n_tok]
        )
    ) |>
    select(naam, first_name, particle, last_name)
}

Extract Initials

extract_initials = function(names){
  names |>
    mutate(
      initials = str_extract(
          naam, 
          "^((?:\\p{Lu}{1,2}\\.)+(?:-\\p{Lu}{1,2}\\.)*)(?=\\s)"
        ),
      first_name = ifelse(is.na(initials), first_name, NA_character_)) |>
    relocate(initials, .after=naam)
}

Patch Names

patch_names = function(names){
  # read in dataset with corrections for name information
  corrections = readRDS(file.path('data', 'utils', 'name_corrections.Rds'))

  names |> rows_update(corrections, by='naam', unmatched='ignore')
}
readRDS(file.path('data', 'utils', 'name_corrections.Rds')) |> head()

Extract Maiden Name

extract_maiden_name = function(names){
  # split last names
  last_name_splits = str_split(names$last_name, '-', simplify = TRUE)

  # add splits to names dataframe
  names['last_name'] = last_name_splits[,1]
  names['maiden_name'] = ifelse(
    last_name_splits[,2] == '',
    NA_character_,
    last_name_splits[,2]
  )

  return(names)
}
format_names = function(data){
  data |>
    mutate(
      initials = initials |> stri_trans_general("Latin-ASCII"),
      first_name = first_name |> stri_trans_general("Latin-ASCII"),
      particle = particle |> str_to_lower() |> stri_trans_general("Latin-ASCII"),
      last_name = last_name |> stri_trans_general("Latin-ASCII"),
      maiden_name = maiden_name |> stri_trans_general("Latin-ASCII")
    ) |> 
    unite("clean_name", initials:last_name, na.rm=TRUE, sep=" ", remove=FALSE) |>
    unite("clean_name_full", clean_name, maiden_name, na.rm=TRUE, sep="-", remove=FALSE) |>
    mutate(
      temp = str_remove(clean_name, paste0(initials, " ")),
      temp_full = str_remove(clean_name_full, paste0(initials, " ")),
      clean_name = case_when(
        !is.na(initials) & !is.na(first_name) ~ temp,
        .default=clean_name
      ),
      clean_name_full = case_when(
        !is.na(initials) & !is.na(first_name) ~ temp_full,
        .default=clean_name_full
      ),
    ) |>
    select(-starts_with("temp"))
    
}

Application

# load data 
dir = file.path("data", "processed")
file = list.files(dir, pattern = "scholarid.Rds", full.names = TRUE)

data = freadRDS(file) 

names = data |>
  select(naam) |>
  arrange(naam) |>
  distinct() |>
  drop_na() |>
  parse_names() |>
  extract_initials() |>
  patch_names() |>
  extract_maiden_name() |>
  format_names()

fsaveRDS(
  names, 
  'names'
)
[1] "SAVING: ./data/processed/20251021names.Rds"