Main Analysis

Published

March 20, 2026

Getting Started

# load custom functions
# (provides freadRDS2, .clear_global_environment, .load_quarto_dependencies)
source("src/utils/custom_functions.r")

# clear the global environment and set dependencies
# NOTE(review): helper contracts defined in custom_functions.r — assumed to
# reset session state for a reproducible render; confirm there.
.clear_global_environment()
.load_quarto_dependencies()

# Derive the project root by stripping a trailing '/results' from the current
# working directory (the document may be rendered from inside results/).
BASEDIR = stringr::str_replace_all(getwd(), '/results', '')
setwd(BASEDIR)
# load and activate packages
library(tidyverse)
library(RSiena)
library(igraph)
library(dplyr)
library(tibble)

Functions

Application

# load preprocessed analysis inputs: author/works data and topic matrices
# (freadRDS2 is a project helper from src/utils/custom_functions.r)
data <- freadRDS2('data', location = "./data/analysis/")
topics <-freadRDS2('topics', location = './data/analysis/')
# step 2: prune works for selection
# Build one wave's (uid x uid) co-authorship adjacency matrix.
#
# Args:
#   data:      list with a 'works' element: per-author tibbles of works,
#              indexed by uid (structure inferred from usage — confirm).
#   uids:      node set; rows/columns of the returned matrix.
#   type:      'all' -> ties among all co-author pairs on a work;
#              otherwise an author_position label (e.g. 'first') -> ego is
#              the author in that position, alters are the co-authors.
#   min_year, max_year: inclusive publication-year window for this wave.
#   weighted:  if FALSE, tie counts are collapsed to 0/1 incidence.
#
# Returns: numeric matrix dimnamed by as.character(uids); authors absent
#   from this window's edgelist get NA rows/columns, diagonal forced to 0.
make_adjacency_matrix <- function(
    data, uids, type, min_year = 2020, max_year = 2026, weighted = FALSE
  ) {
  
  # make edgelist from works: one row per (work, authorship) in the window;
  # distinct(work_id) keeps each work once before expanding authorships
  edges <- data[['works']][uids] |> 
    purrr::keep(~ nrow(.x) > 0) |>
    bind_rows() |>
    filter(publication_year >= min_year, publication_year <= max_year) |>
    select(uid, work_id, authorships, publication_year) |>
    rename(e_uid = uid) |>
    distinct(work_id, .keep_all = TRUE) |>
    unnest(authorships) 

  if (type != 'all') {
    # ego = the author holding position `type` on each work (first match);
    # one directed tie from ego to every other author on the work
    edges <- edges |>
      group_by(work_id) |>
      mutate(
        e_uid = uid[author_position == type][1]
      ) |> ungroup() |>
      filter(!is.na(e_uid), e_uid != uid) |>
      select(work_id, e_uid, uid)
  } else {
    # all ordered co-author pairs per work (self-pairs filtered out)
    edges <- edges |> 
      group_by(work_id) |>
      reframe(
        tidyr::crossing(
          e_uid = uid,
          uid   = uid
        )
      ) |>
      ungroup() |>
      filter(!is.na(e_uid), e_uid != uid) |>
      select(work_id, e_uid, uid)
  }

  # calculate the number of edges between ego and alters.
  edge_counts <- edges |> count(e_uid, uid, name="w")

  # convert counts to incidences
  if (!weighted) {
    edge_counts <- edge_counts %>% mutate(w = 1L)
  }

  # all nodes appearing as ego or alter
  nodes <- sort(unique(c(edge_counts$e_uid, edge_counts$uid)))
  
  # complete matrix grid and pivot to wide
  adj <- edge_counts %>%
    rename(from = e_uid, to = uid) %>%
    complete(from = nodes, to = nodes, fill = list(w = 0)) %>%
    pivot_wider(names_from = to, values_from = w) %>%
    arrange(from)
  
  # convert to matrix and set rownames
  mat <- adj %>% as.data.frame()
  
  row_ids <- mat$from
  mat$from <- NULL
  
  mat <- as.matrix(mat)
  rownames(mat) <- row_ids
  # no self-ties
  diag(mat) <- 0

  # make sure the matrices are consistent across waves
  all_uids <- as.character(uids)

  # indices of existing rows/cols in desired order
  ri <- match(all_uids, rownames(mat))
  ci <- match(all_uids, colnames(mat))

  # start with full NA matrix: NA marks authors with no works in this window
  # (treated as structurally missing downstream)
  mat_full <- matrix(NA, nrow = length(all_uids), ncol = length(all_uids),
                    dimnames = list(all_uids, all_uids))

  # copy overlap block
  mat_full[!is.na(ri), !is.na(ci)] <- mat[ri[!is.na(ri)], ci[!is.na(ci)], drop = FALSE]
  mat <- mat_full

  mat
} 


# Build collaboration networks per wave for a filtered author sample.
#
# Args:
#   data:       list with 'demographics' (author covariates) and 'works'.
#   university, position, discipline: optional filters; NULL keeps all
#               valid values; invalid values raise an error.
#   type:       authorship type for make_adjacency_matrix ('all' or a
#               position label such as 'first').
#   waves:      list of c(min_year, max_year) windows, one per wave.
#
# Returns: list(data = filtered author tibble, nets = list of adjacency
#   matrices, one per wave, all dimensioned on the same uid set).
fcolnet2 <- function(
    data,
    university = NULL,
    position   = NULL,
    discipline = NULL,
    type = 'all',
    waves = list(c(2019, 2022), c(2023, 2024), c(2025, 2026))
  ){

  valid_disciplines  <- c("Sociology", "Political Sciences")
  valid_universities <- c("UVA","EUR","UL","VU","UVG","UU","RU","UVT")
  valid_positions    <- c("Associate Professor","PhD Candidate","Full Professor",
                          "Researcher or Lecturer","Assistant Professor")

  # helper: default NULL to all valid values, then validate user input
  set_and_check <- function(x, valid, name = deparse(substitute(x))) {
    x <- if (is.null(x)) valid else x
    bad <- setdiff(unique(na.omit(x)), valid)
    if (length(bad)) stop("Invalid ", name, ": ", paste(bad, collapse = ", "), call. = FALSE)
    x
  }

  # set and check values
  discipline <- set_and_check(discipline, valid_disciplines,  "discipline")
  university <- set_and_check(university, valid_universities, "university")
  position   <- set_and_check(position,   valid_positions,    "position")

  # step 1: select authors matching the filters in any wave
  authors <- data[["demographics"]] |>
    filter(
      if_any(c(university_22_first, university_24_first, university_25_first),
            ~ .x %in% university),
      if_any(c(discipline_22, discipline_24, discipline_25),
            ~ .x %in% discipline),
      if_any(c(position_22, position_24, position_25),
            ~ .x %in% position)
    ) |>
    arrange(uid)
  uids <- authors |> pull(uid) |> sort()

  # step 2: one adjacency matrix per wave, all on the same uid set
  # (lapply/seq-free iteration also handles an empty wave list safely,
  # unlike the former 1:length(waves) loop)
  nets <- lapply(waves, function(window) {
    make_adjacency_matrix(
      data, uids, type,
      min_year = window[1], max_year = window[2]
    )
  })

  list(
    data = authors,
    nets = nets
  )
}
# Harmonize node covariates for RSiena modelling.
#
# Derives binary indicators (female, soc = Sociology) and recodes the
# per-wave university columns (university_XX_first -> uniXXp) as ordered
# factors with a consistent lower-case level set.
#
# Args:
#   data: list containing under `what` a tibble with gender, discipline_*,
#         and university_*_first columns.
#   what: name of the list element to harmonize (default 'data').
#
# Returns: `data` with the harmonized element written back in place.
harmonize_covariates <- function(data, what = 'data'){
  uni_levels <- c("EUR", "RU" , "UL", "UU", "UVA", "UVG", "UVT", "VU")

  data[[what]] <- data[[what]] |>
    mutate(
      # 1 = female, 0 = otherwise
      female = as.integer(gender == 'female'),
      # first non-missing discipline across waves, then 1 = Sociology
      soc = pmap_chr(
        list(discipline_22, discipline_24, discipline_25),
        ~ unique(na.omit(c(...)))[1]
      ),
      soc = as.integer(soc == 'Sociology')
    ) |>
    # e.g. university_22_first -> uni22p
    rename_with(
      ~ .x |>
        str_replace('university_', 'uni') |>
        str_replace('_first$', 'p')
    ) |>
    mutate(
      # consistent ordered factor levels across all wave columns
      across(
        starts_with('uni') & ends_with('p'),
        ~ factor(tolower(.x),
        levels = tolower(uni_levels),
        ordered = TRUE)
      )
    )

  return(data)
}
# s = samp |>
#   harmonize_covariates(what = 'nodes') |>
#   _[['nodes']] |>
#   group_by(gender, migrant, discipline) |>
#   summarize(n = n())
# full set: uni_levels = tolower(c("EUR", "RU", "UL", "UU", "UVA", "UVG", "UVT", "VU"))
# subset of universities included in the current analysis
uni_levels <- tolower(c(
  "RU", "UU", "UVA", "UVG"
))

# Build the per-wave collaboration networks for the selected universities
# (all disciplines; ego = first author), then harmonize node covariates.
# (Removed a stray trailing comma that passed an empty argument to fcolnet2.)
colnet <- fcolnet2(
  data,
  # discipline = "Sociology",
  university = toupper(uni_levels),
  type = 'first'
) |>
  harmonize_covariates(what = 'data')

# one adjacency matrix per wave
wave1 <- colnet[['nets']][[1]]
wave2 <- colnet[['nets']][[2]]
wave3 <- colnet[['nets']][[3]]

# quick diagnostic: per-node count of missing (NA) ties in each wave
for (net in colnet$nets){
  print(rowSums(is.na(net)))
}
# set dependent variable: stack the three wave matrices into an
# (n x n x 3) array as RSiena expects
net_array <- array(
    data = c(wave1, wave2, wave3),
    dim = c(dim(wave1), 3)
)
# one-mode network dependent variable
# (removed the stray trailing comma that passed an empty second argument)
net <- sienaDependent(net_array)

# set constant covariates (0/1 numeric vectors from harmonized data)
gender <- coCovar(colnet[['data']]$female)
soc <- coCovar(colnet[['data']]$soc)

# Time-varying university covariates: one 0/1 dummy varCovar per university,
# dropping the first level as the reference category. Note the deliberate
# shadowing: `uni` first holds the wave columns, then is replaced by the
# named list of covariates (the map runs before reassignment).
uni <- colnet[['data']] |> select(uni22p, uni24p, uni25p)
uni <- set_names(uni_levels[-1]) |>
  map(\(lvl) {
    dummy_mat <- uni |>
      transmute(
        # u22 = as.integer(uni22p == lvl), # excluded for model specification
        u24 = as.integer(uni24p == lvl),
        u25 = as.integer(uni25p == lvl)
      ) |>
      as.matrix()
    varCovar(dummy_mat, centered = FALSE)
  })

# Time-varying academic-position dummies (reference: 'PhD Candidate').
# `funcs` first holds the wave columns, then becomes the named list of
# varCovar objects (the map runs before reassignment).
funcs <- colnet[['data']] |> select(position_22, position_24, position_25)
funcs_levels <- c(
  "PhD Candidate", "Researcher or Lecturer",
  "Assistant Professor", "Associate Professor",
  "Full Professor"
)

funcs <- funcs_levels[-1] |>
  set_names() |>
  map(function(level){
    mat <- funcs |>
      transmute(
        # f22 = as.integer(position_22 == level),
        # FIX: f24 previously compared position_25 (copy-paste error);
        # it must use the wave-24 position column, mirroring the uni block.
        f24 = as.integer(position_24 == level),
        f25 = as.integer(position_25 == level)
      ) |>
      as.matrix()

    varCovar(mat, centered = FALSE)
  })
# Constant migrant-background dummies, one coCovar per non-reference group
# (reference category: 'NL').
migrant_groups <- c('NL', 'M4', 'other')
mig <- set_names(migrant_groups[-1]) |>
  map(\(g) coCovar(as.integer(colnet[['data']]$migrant == g)))
# Attach a (uid x topic) interdisciplinarity matrix to colnet, row-aligned
# to the sampled authors; authors without topic data receive NA rows.
#
# Args:
#   colnet: list with a 'data' tibble containing a uid column.
#   topics: list with topics$intdis$subfield, a matrix rownamed by uid.
#
# Returns: colnet with an 'intdis' matrix element added.
add_interdisciplinarity = function(colnet, topics){
  author_ids <- sort(unique(colnet[["data"]][["uid"]]))
  source_mat <- topics[["intdis"]][["subfield"]]

  # start from all-NA, then fill the rows we actually have topic data for
  aligned <- matrix(
    NA_real_,
    nrow = length(author_ids),
    ncol = ncol(source_mat),
    dimnames = list(author_ids, colnames(source_mat))
  )
  shared <- intersect(author_ids, rownames(source_mat))
  aligned[shared, ] <- source_mat[shared, , drop = FALSE]

  colnet[["intdis"]] <- aligned
  return(colnet)
}

# Attach per-wave (uid x uid) dissimilarity matrices to colnet, aligned to
# the sampled authors; pairs without data receive NA.
#
# Args:
#   colnet: list with a 'data' tibble containing a uid column.
#   topics: list with topics$dissim$subfield, a named list of square
#           matrices dimnamed by uid.
#
# Returns: colnet with a 'dissim' list element added (names preserved).
add_dissimilarity <- function(colnet, topics) {
  author_ids <- sort(unique(colnet[["data"]][["uid"]]))

  # align one source matrix onto the full author set
  align_one <- function(mat) {
    shared <- intersect(intersect(author_ids, rownames(mat)), colnames(mat))
    aligned <- matrix(
      NA_real_,
      nrow = length(author_ids),
      ncol = length(author_ids),
      dimnames = list(author_ids, author_ids)
    )
    aligned[shared, shared] <- mat[shared, shared, drop = FALSE]
    aligned
  }

  colnet[["dissim"]] <- lapply(topics[["dissim"]][["subfield"]], align_one)

  return(colnet)
}

# Attach topic-based covariates to colnet, aligned to the sampled authors.
colnet = colnet |>
  add_interdisciplinarity(topics) |>
  add_dissimilarity(topics)

# Time-varying covariates need one column/slice per period (3 waves -> 2):
# columns 1:2 of intdis, and the '2022'/'2024' dissimilarity matrices.
intdis = colnet[['intdis']][,1:2] |> varCovar(centered = TRUE)
dissim = colnet[['dissim']][c('2022', '2024')] |> simplify2array() |> varDyadCovar(centered = TRUE)
# combine variables into Rsiena data object
# NOTE(review): covariate names in the effects objects come from the deparsed
# arguments here (e.g. "mig$M4", "uni$uu", "funcs[[1]]") — the interaction1
# strings used later must match these exactly.
mydata <- sienaDataCreate(
  net, gender, soc,
  mig$M4, mig$other,
  # uni$ru, uni$ul, uni$uu, uni$uva, uni$uvg, uni$uvt, uni$vu,
  uni$uu, 
  uni$uva, uni$uvg,
  funcs[[1]], funcs[[2]], funcs[[3]], funcs[[4]],
  intdis, dissim
)

# write the initial RSiena data report into <BASEDIR>/results
setwd(BASEDIR)
if (!dir.exists("results")) dir.create("results")
# file.path() builds the path portably instead of manual '/'-concatenation
setwd(file.path(BASEDIR, "results"))
print01Report(mydata, modelname = "./init")

# model instability: perhaps throwing cases with full missingness out
# specify model
library(glue)

# Add the baseline effect set shared by all models: structural network
# effects (density, reciprocity, indegree popularity) plus discipline and
# university ego effects.
#
# Args:
#   myeff: an RSiena effects object from getEffects().
# Returns: the effects object with the default effects switched on.
includeDefaultEffects <- function(myeff){
  myeff <- includeEffects(myeff, density, recip, inPop)
  # myeff <- includeEffects(myeff, antiInIso) # does not work in later models

  # add node characteristics
  myeff <- includeEffects(myeff, egoX, interaction1 = "soc")
  # sameX soc collinear with university (uu has no pol-sci)
  for (name in names(uni)){
    # NOTE(review): in glue("uni${name}") the '$' is literal text (glue only
    # interpolates inside {}), yielding e.g. "uni$uu" — presumably matching
    # the covariate names sienaDataCreate derives from the uni$... arguments;
    # confirm against print01Report output.
    myeff <- includeEffects(myeff, egoX, interaction1 = glue("uni${name}"))
    # myeff <- includeEffects(myeff, simX, interaction1 = glue("uni${name}"))
  }

  return(myeff)
}

# baseline effects object (Model 1 specification)
myeff <- getEffects(mydata)
myeff <- includeDefaultEffects(myeff)

# estimation settings; removed the stray trailing comma that passed an
# empty argument to sienaAlgorithmCreate
myAlgorithm <- sienaAlgorithmCreate(
  projname = "init"
  # n3 = 3000 # for publication ready results
)
# estimate model
# Model 1: structural effects + soc/university ego effects only.
ansm1 <- siena07(
  myAlgorithm, 
  data = mydata, 
  effects = myeff, 
  returnDeps = TRUE
)

# print summary, then persist the fitted object
ansm1
save(ansm1, file = './ansm1')
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 2: adds gender ego/alter/same effects on top of Model 1.
# NOTE: myeff accumulates across sections — each model nests the previous.
myeff <- includeEffects(myeff, egoX, interaction1 = "gender") # altX, sameX
myeff <- includeEffects(myeff, altX, interaction1 = "gender") 
myeff <- includeEffects(myeff, sameX, interaction1 = "gender") 
# too few minority in sample leave out for now.
# myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'mig$M4') # altX, sameX
# myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'mig$other') # altX, sameX

ansm2 <- siena07(
  myAlgorithm, 
  data = mydata, 
  effects = myeff,
  returnDeps = TRUE
)

save(ansm2, file = './ansm2')
ansm2
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 3: adds ego/alter/similarity effects for the position dummies
# (names must match sienaDataCreate's deparsed arguments, e.g. 'funcs[[1]]').
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[1]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[2]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[3]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[4]]')

# removed the stray trailing comma that passed an empty argument to siena07
ansm3 <- siena07(
  myAlgorithm,
  data = mydata,
  effects = myeff,
  returnDeps = TRUE
)

save(ansm3, file = './ansm3')

ansm3
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 4: adds interdisciplinarity ego/alter effects.
myeff <- includeEffects(myeff, egoX, altX, interaction1 = 'intdis') # simX does not work
# myeff <- includeEffects(myeff, effFrom, interaction1 = 'dissim') # this does not work

# removed the stray trailing comma that passed an empty argument to siena07
ansm4 <- siena07(
  myAlgorithm,
  data = mydata,
  effects = myeff,
  returnDeps = TRUE
)

save(ansm4, file = './ansm4')
ansm4
# Convert a fitted siena07 result into a tidy results table.
#
# Combines rate parameters, included effects, and the overall maximum
# convergence ratio into one tibble with z-tests, significance stars, and
# formatted estimate/SE columns for reporting.
#
# Args:
#   ans: a sienaFit object returned by siena07().
# Returns: a tibble with one row per rate parameter, included effect, and
#   the convergence diagnostic.
tidy_rsiena <- function(ans) {
  # extract included effects with their estimates and convergence t-ratios
  effects <- tibble(
      depvar = ans$effects$name,
      effect = ans$effects$effectName,
      short  = ans$effects$shortName,
      interaction1 = ans$effects$interaction1,
      interaction2 = ans$effects$interaction2,
      include = ans$effects$include,
      estimate = ans$theta,
      se = ans$se,
      conv_t = ans$tconv
    ) |>
    filter(include)

  # rate parameters: one per period; labels generated from the actual number
  # of rates so the function also works for designs with != 3 waves
  # (previously hardcoded to exactly two periods)
  rate <- tibble(
    depvar = unique(ans$effects$name),
    effect = paste("Rate parameter period", seq_along(ans$rate)),
    short = 'rate',
    estimate = ans$rate,
    se = ans$vrate
  )

  # overall maximum convergence ratio (single diagnostic value, no SE)
  tconv.max <- tibble(
    depvar = unique(ans$effects$name),
    effect = 'Overall maximum convergence ratio',
    short = 'tconv.max',
    estimate = ans$tconv.max[[1]]
  )

  # combine and format
  rate |>
    bind_rows(effects) |>
    bind_rows(tconv.max) |>
    mutate(
      z = estimate / se,
      # no z-test for rate parameters or the convergence diagnostic
      z = ifelse(short %in% c('rate', 'tconv.max'), NA, z),
      p_value = 2 * pnorm(-abs(z)),
      stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01  ~ "**",
        p_value < 0.05  ~ "*",
        p_value < 0.1   ~ "+",
        TRUE            ~ ""
      ),
      festimate = sprintf("%.3f%s", estimate, stars),
      fse = sprintf("(%.3f)", se),
      fse = ifelse(short == 'tconv.max', '', fse),
      z = round(z, 3),
      p_value = round(p_value, 4),
      conv_t = round(conv_t, 3)
    ) |>
    select(
      depvar, effect, short, interaction1, interaction2,
      estimate, se, z, p_value, stars, conv_t, festimate, fse
    )
}
# Tidy every fitted model into a results table, keyed by model name.
res <- lapply(
  list(ansm1 = ansm1, ansm2 = ansm2, ansm3 = ansm3, ansm4 = ansm4),
  tidy_rsiena
)
setwd(BASEDIR)
Back to top