Data Processing Names
Getting Started
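The functions below lean on the tidyverse family for data wrangling and string handling. The setup chunk itself is not shown here; a minimal sketch of the packages the code appears to rely on:
library(dplyr)    # mutate, select, case_when, distinct
library(tidyr)    # unite, drop_na
library(purrr)    # map, map_int, map2_int, pmap_chr
library(stringr)  # str_split, str_squish, str_replace_all, str_remove
library(stringi)  # stri_trans_general() for accent stripping
The freadRDS() and fsaveRDS() calls in the Application section look like project-specific wrappers around readRDS()/saveRDS() and are assumed to be loaded elsewhere.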
Functions
Parse Names
parse_names = function(names){
  # particles (tussenvoegsels) recognised between the given name and the surname
  particles <- c(
    "de", "den", "der", "het", "te", "ten", "ter",
    "van", "van de", "van den", "van der", "van 't", "van ’t",
    "'t", "’t",
    "von", "von der", "von den",
    "la", "le", "du", "del", "della", "di", "da", "dos", "das", "de la", "de los", "de las",
    "zu", "zum", "zur", "op de"
  )
  # Normalization helper for matching
  .normalize <- function(x) {
    x |>
      str_squish() |>
      str_replace_all("’", "'") |>
      str_to_lower() |>
      stri_trans_general("Latin-ASCII")
  }
  p_norm <- .normalize(particles)
  names |>
    mutate(
      tokens_raw = str_split(naam, "\\s+"),
      tokens_norm = map(tokens_raw, ~ .normalize(.x)),
      n = map_int(tokens_raw, length),
      # find the longest particle match immediately before the final token
      particle_idx = map2_int(tokens_norm, n, function(tok, n_tok) {
        if (n_tok < 2) return(NA_integer_)
        # check 3-, 2-, and 1-token particles that end at position n_tok - 1
        for (k in 3:1) {
          start <- n_tok - k
          end <- n_tok - 1
          if (start >= 1) {
            cand <- paste(tok[start:end], collapse = " ")
            if (cand %in% p_norm) return(start)
          }
        }
        NA_integer_
      }),
      has_particle = !is.na(particle_idx),
      first_name = pmap_chr(
        list(tokens_raw, particle_idx, n),
        function(tok, p_i, n_tok) {
          end_giv <- if (is.na(p_i)) n_tok - 1 else p_i - 1
          # no given-name tokens left (single-token name, or a name that starts with a particle)
          if (end_giv <= 0) NA_character_ else paste(tok[seq_len(end_giv)], collapse = " ")
        }
      ),
      particle = pmap_chr(
        list(tokens_raw, tokens_norm, particle_idx, n),
        function(tok_raw, tok_norm, p_i, n_tok) {
          if (is.na(p_i)) return(NA_character_)
          # output particle in lowercase with canonicalised apostrophes (via tokens_norm)
          paste(tok_norm[p_i:(n_tok - 1)], collapse = " ")
        }
      ),
      last_name = map2_chr(tokens_raw, n, function(tok, n_tok) tok[n_tok])
    ) |>
    select(naam, first_name, particle, last_name)
}
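A quick illustration of the parser on a hypothetical name with a two-token particle (not taken from the source data):
parse_names(tibble(naam = "Jan van der Berg"))
# expected: first_name "Jan", particle "van der", last_name "Berg"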
Extract Initials
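The body of extract_initials is not included in this document. Judging from how format_names later combines an initials column with first_name, a plausible sketch is shown below; the detection rule and the derivation of initials from full given names are assumptions, not the original implementation.
extract_initials = function(names){
  names |>
    mutate(
      # assumption: a first_name consisting only of dotted letters ("J.H.") is itself the initials
      is_dotted = str_detect(first_name, "^([A-Za-z]\\.)+$") %in% TRUE,
      initials = case_when(
        is_dotted ~ first_name,
        # assumption: otherwise derive initials from the given names
        !is.na(first_name) ~ first_name |>
          str_split("[\\s-]+") |>
          map_chr(\(x) paste0(str_sub(x, 1, 1), ".", collapse = "")),
        .default = NA_character_
      ),
      first_name = if_else(is_dotted, NA_character_, first_name),
      .before = first_name
    ) |>
    select(-is_dotted)
}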
Patch Names
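The patch_names body is likewise not shown; it presumably applies manual corrections for names the parsing heuristics get wrong. A hypothetical sketch of that pattern, with a placeholder correction rather than the real patches:
patch_names = function(names){
  # hypothetical corrections table; the actual patches are not shown in the source
  patches <- tibble(
    naam = "Voorbeeld van t Naam",  # placeholder row, not from the source data
    particle = "van 't",
    last_name = "Naam"
  )
  names |>
    rows_update(patches, by = "naam", unmatched = "ignore")
}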
Extract Maiden Name
extract_maiden_name = function(names){
  # split hyphenated last names into current name and maiden name
  last_name_splits = str_split(names$last_name, '-', simplify = TRUE)
  # guard for data without any hyphenated last names (the matrix would have a single column)
  if (ncol(last_name_splits) < 2) {
    last_name_splits = cbind(last_name_splits, '')
  }
  # add splits to names dataframe
  names['last_name'] = last_name_splits[, 1]
  names['maiden_name'] = ifelse(
    last_name_splits[, 2] == '',
    NA_character_,
    last_name_splits[, 2]
  )
  return(names)
}
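For example, with a hypothetical hyphenated entry (not from the source data):
extract_maiden_name(tibble(last_name = c("Jansen-Smit", "Bakker")))
# expected: last_name "Jansen" with maiden_name "Smit", and last_name "Bakker" with maiden_name NA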
Format Names
format_names = function(data){
  data |>
    mutate(
      # strip accents so the cleaned names are ASCII-only
      initials = initials |> stri_trans_general("Latin-ASCII"),
      first_name = first_name |> stri_trans_general("Latin-ASCII"),
      particle = particle |> str_to_lower() |> stri_trans_general("Latin-ASCII"),
      last_name = last_name |> stri_trans_general("Latin-ASCII"),
      maiden_name = maiden_name |> stri_trans_general("Latin-ASCII")
    ) |>
    unite("clean_name", initials:last_name, na.rm = TRUE, sep = " ", remove = FALSE) |>
    unite("clean_name_full", clean_name, maiden_name, na.rm = TRUE, sep = "-", remove = FALSE) |>
    mutate(
      # drop the leading initials when a full first name is also available;
      # fixed() treats the dots in the initials literally rather than as regex wildcards
      temp = str_remove(clean_name, fixed(paste0(initials, " "))),
      temp_full = str_remove(clean_name_full, fixed(paste0(initials, " "))),
      clean_name = case_when(
        !is.na(initials) & !is.na(first_name) ~ temp,
        .default = clean_name
      ),
      clean_name_full = case_when(
        !is.na(initials) & !is.na(first_name) ~ temp_full,
        .default = clean_name_full
      )
    ) |>
    select(-starts_with("temp"))
}
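To see what the two cleaned columns look like, here is a hypothetical row as it would arrive from the earlier steps (assuming initials sits before first_name in the column order, as the initials:last_name range implies):
row <- tibble(
  naam = "Jan Hendrik van der Berg-Smit",
  initials = "J.H.",
  first_name = "Jan Hendrik",
  particle = "van der",
  last_name = "Berg",
  maiden_name = "Smit"
)
format_names(row)
# expected: clean_name "Jan Hendrik van der Berg", clean_name_full "Jan Hendrik van der Berg-Smit"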
Application
# load data
dir = file.path("data", "processed")
file = list.files(dir, pattern = "scholarid.Rds", full.names = TRUE)
data = freadRDS(file)
names = data |>
  select(naam) |>
  arrange(naam) |>
  distinct() |>
  drop_na() |>
  parse_names() |>
  extract_initials() |>
  patch_names() |>
  extract_maiden_name() |>
  format_names()
fsaveRDS(
  names,
  'names'
)
[1] "SAVING: ./data/processed/20251021names.Rds"