# Main Analysis ----
# Getting Started ----
# Functions ----
# Application ----
# step 2: prune works for selection
# Build a (possibly weighted) collaboration adjacency matrix for one wave.
#
# Args:
#   data     : list with a 'works' element — per-uid tables of publications
#              holding uid, work_id, authorships, publication_year.
#   uids     : author ids; defines both the selection and the row/column
#              order of the returned matrix.
#   type     : 'all' -> ties between every ordered pair of co-authors per
#              work; otherwise an author_position value (e.g. 'first') ->
#              ties from that work's author in this position to co-authors.
#   min_year, max_year : inclusive publication_year window for this wave.
#   weighted : FALSE -> 0/1 incidence; TRUE -> tie counts.
#
# Returns: length(uids) x length(uids) matrix, dimnames = as.character(uids),
#          diagonal 0, NA rows/cols for uids absent from this wave's edges.
make_adjacency_matrix <- function(
data, uids, type, min_year = 2020, max_year = 2026, weighted = FALSE
) {
# make edgelist from works: one row per (work, listed author) after limiting
# to the wave's year window and de-duplicating works shared across uids
edges <- data[['works']][uids] |>
purrr::keep(~ nrow(.x) > 0) |>
bind_rows() |>
filter(publication_year >= min_year, publication_year <= max_year) |>
select(uid, work_id, authorships, publication_year) |>
rename(e_uid = uid) |>
distinct(work_id, .keep_all = TRUE) |>
unnest(authorships)
if (type != 'all') {
# ego = the work's author in position `type` (first match per work);
# every other listed author becomes an alter
edges <- edges |>
group_by(work_id) |>
mutate(
e_uid = uid[author_position == type][1]
) |> ungroup() |>
filter(!is.na(e_uid), e_uid != uid) |>
select(work_id, e_uid, uid)
} else {
# all ordered pairs of co-authors within each work (self-pairs dropped below)
edges <- edges |>
group_by(work_id) |>
reframe(
tidyr::crossing(
e_uid = uid,
uid = uid
)
) |>
ungroup() |>
filter(!is.na(e_uid), e_uid != uid) |>
select(work_id, e_uid, uid)
}
# calculate the number of edges between ego and alters.
edge_counts <- edges |> count(e_uid, uid, name="w")
# convert counts to incidences (binary 0/1) unless weights were requested
if (!weighted) {
edge_counts <- edge_counts %>% mutate(w = 1L)
}
# all nodes appearing as ego or alter
nodes <- sort(unique(c(edge_counts$e_uid, edge_counts$uid)))
# complete matrix grid and pivot to wide (missing pairs filled with 0)
adj <- edge_counts %>%
rename(from = e_uid, to = uid) %>%
complete(from = nodes, to = nodes, fill = list(w = 0)) %>%
pivot_wider(names_from = to, values_from = w) %>%
arrange(from)
# convert to matrix and set rownames
mat <- adj %>% as.data.frame()
row_ids <- mat$from
mat$from <- NULL
mat <- as.matrix(mat)
rownames(mat) <- row_ids
diag(mat) <- 0
# make sure the matrices are consistent across waves: re-index onto the full
# uid set so every wave's matrix has identical dimensions and ordering
all_uids <- as.character(uids)
# indices of existing rows/cols in desired order
ri <- match(all_uids, rownames(mat))
ci <- match(all_uids, colnames(mat))
# start with full NA matrix (NA marks authors with no works in this window)
mat_full <- matrix(NA, nrow = length(all_uids), ncol = length(all_uids),
dimnames = list(all_uids, all_uids))
# copy overlap block
mat_full[!is.na(ri), !is.na(ci)] <- mat[ri[!is.na(ri)], ci[!is.na(ci)], drop = FALSE]
mat <- mat_full
mat
}
# Select authors by university/discipline/position and build one adjacency
# matrix per wave.
#
# Args:
#   data       : list with 'demographics' (per-author covariates, incl. uid)
#                and 'works' (per-uid publication tables).
#   university, position, discipline : optional filters; NULL means "all
#                valid values"; invalid values raise an error.
#   type       : passed through to make_adjacency_matrix.
#   waves      : list of c(min_year, max_year) windows, one per wave.
#
# Returns: list(data = selected author table, nets = list of adjacency
#          matrices, one per wave, all indexed by the same sorted uid set).
fcolnet2 <- function(
  data,
  university = NULL,
  position = NULL,
  discipline = NULL,
  type = 'all',
  waves = list(c(2019, 2022), c(2023, 2024), c(2025, 2026))
){
  valid_disciplines <- c("Sociology", "Political Sciences")
  valid_universities <- c("UVA","EUR","UL","VU","UVG","UU","RU","UVT")
  valid_positions <- c("Associate Professor","PhD Candidate","Full Professor",
                       "Researcher or Lecturer","Assistant Professor")
  # helper: default NULL to all valid values, then validate user input
  set_and_check <- function(x, valid, name = deparse(substitute(x))) {
    x <- if (is.null(x)) valid else x
    bad <- setdiff(unique(na.omit(x)), valid)
    if (length(bad)) stop("Invalid ", name, ": ", paste(bad, collapse = ", "), call. = FALSE)
    x
  }
  # set and check values
  discipline <- set_and_check(discipline, valid_disciplines, "discipline")
  university <- set_and_check(university, valid_universities, "university")
  position <- set_and_check(position, valid_positions, "position")
  # step 1: make selection of nodes — keep an author when any wave's value
  # matches the requested filter
  authors <- data[["demographics"]] |>
    filter(
      if_any(c(university_22_first, university_24_first, university_25_first),
             ~ .x %in% university),
      if_any(c(discipline_22, discipline_24, discipline_25),
             ~ .x %in% discipline),
      if_any(c(position_22, position_24, position_25),
             ~ .x %in% position)
    ) |>
    arrange(uid)
  uids <- authors |> pull(uid) |> sort()
  # steps 2-4: one adjacency matrix per wave, all aligned on the same uid set
  nets <- lapply(waves, function(wave) {
    make_adjacency_matrix(
      data, uids, type, min_year = wave[1], max_year = wave[2])
  })
  list(
    data = authors,
    nets = nets
  )
}

# Derive harmonized covariates on data[[what]]: a female dummy, a sociology
# dummy (first non-missing discipline across waves), and per-wave university
# factors with consistent lower-case ordered levels.
harmonize_covariates <- function(data, what = 'data'){
  uni_levels <- c("EUR", "RU" , "UL", "UU", "UVA", "UVG", "UVT", "VU")
  data[[what]] <- data[[what]] |>
    mutate(
      female = as.integer(gender == 'female'),
      # first non-missing discipline across the three waves
      soc = pmap_chr(
        list(discipline_22, discipline_24, discipline_25),
        ~ unique(na.omit(c(...)))[1]
      ),
      soc = as.integer(soc == 'Sociology')
    ) |>
    # e.g. university_22_first -> uni22p
    rename_with(
      ~ .x |>
        str_replace('university_', 'uni') |>
        str_replace('_first$', 'p')
    ) |>
    mutate(
      across(
        starts_with('uni') & ends_with('p'),
        ~ factor(tolower(.x),
                 levels = tolower(uni_levels),
                 ordered = TRUE)
      )
    )
  return(data)
}
# uni_levels = tolower(c("EUR", "RU" , "UL", "UU", "UVA", "UVG", "UVT", "VU"))
# Universities included in this analysis (lower case: matches the factor
# levels produced by harmonize_covariates).
uni_levels <- tolower(c(
  "RU", "UU", "UVA", "UVG"
))
# Build the first-author collaboration networks for the selected universities
# and harmonize the author covariates.
colnet <- fcolnet2(data,
  # discipline = "Sociology",
  university = toupper(uni_levels),
  type = 'first'  # trailing comma removed: it passed an empty argument
) |>
  harmonize_covariates(what = 'data')
# one adjacency matrix per wave
wave1 <- colnet[['nets']][[1]]
wave2 <- colnet[['nets']][[2]]
wave3 <- colnet[['nets']][[3]]
# set dependent variable
# Stack the three wave matrices into an n x n x 3 array and declare it as the
# RSiena dependent (network) variable.
net_array <- array(
  data = c(wave1, wave2, wave3),
  dim = c(dim(wave1), 3)
)
net <- sienaDependent(net_array)  # trailing comma removed (empty argument)
# set constant variables as numeric covariates
gender <- coCovar(colnet[['data']]$female)
soc <- coCovar(colnet[['data']]$soc)
# time-varying university dummies: one varCovar per university; the first
# element of uni_levels is dropped ([-1]) as the reference category, and the
# wave-1 (u22) column is excluded for model specification (see comment)
uni <- colnet[['data']] |> select(uni22p, uni24p, uni25p)
uni <- uni_levels[-1] |>
  set_names() |>
  map(function(level) {
    mat <- uni |>
      transmute(
        # u22 = as.integer(uni22p == level), # excluded for model specification
        u24 = as.integer(uni24p == level),
        u25 = as.integer(uni25p == level)
      ) |>
      as.matrix()
    varCovar(mat, centered = FALSE)
  })
# time-varying position dummies; "PhD Candidate" is the reference ([-1])
funcs <- colnet[['data']] |> select(position_22, position_24, position_25)
funcs_levels <- c(
  "PhD Candidate", "Researcher or Lecturer",
  "Assistant Professor", "Associate Professor",
  "Full Professor"
)
funcs <- funcs_levels[-1] |>
  set_names() |>
  map(function(level){
    mat <- funcs |>
      transmute(
        # f22 = as.integer(position_22 == level),
        f24 = as.integer(position_24 == level),  # was position_25: copy-paste bug
        f25 = as.integer(position_25 == level)
      ) |>
      as.matrix()
    varCovar(mat, centered = FALSE)
  })

# Attach a uids x subfield interdisciplinarity matrix to colnet, aligned to
# the network's sorted uid order; rows for uids missing from `topics` stay NA.
add_interdisciplinarity <- function(colnet, topics){
  uids <- colnet[["data"]] |>
    dplyr::pull(uid) |>
    unique() |> sort()
  intdis <- topics[["intdis"]][["subfield"]]
  colnet[["intdis"]] <- {
    out <- matrix(
      NA_real_,
      nrow = length(uids),
      ncol = ncol(intdis),
      dimnames = list(uids, colnames(intdis))
    )
    common_uids <- intersect(uids, rownames(intdis))
    out[common_uids, ] <- intdis[common_uids, , drop = FALSE]
    out
  }
  return(colnet)
}
# Attach dyadic dissimilarity matrices to `colnet`, one per element of
# topics$dissim$subfield, each re-indexed onto the full sorted uid set with
# NA for pairs the topic data does not cover.
add_dissimilarity <- function(colnet, topics) {
  all_ids <- sort(unique(dplyr::pull(colnet[["data"]], uid)))
  # align a single dissimilarity matrix onto the full uid x uid grid
  align_one <- function(mat) {
    # ids present in the selection AND in both margins of this matrix
    shared <- intersect(intersect(all_ids, rownames(mat)), colnames(mat))
    full <- matrix(
      NA_real_,
      nrow = length(all_ids),
      ncol = length(all_ids),
      dimnames = list(all_ids, all_ids)
    )
    full[shared, shared] <- mat[shared, shared, drop = FALSE]
    full
  }
  colnet[["dissim"]] <- purrr::map(topics[["dissim"]][["subfield"]], align_one)
  colnet
}
# Attach topic-based covariates, then register everything with RSiena.
colnet <- colnet |>
  add_interdisciplinarity(topics) |>
  add_dissimilarity(topics)
# interdisciplinarity: first two subfield columns as a varying covariate
intdis <- colnet[['intdis']][, 1:2] |> varCovar(centered = TRUE)
# dyadic topic dissimilarity for the 2022 and 2024 matrices
dissim <- colnet[['dissim']][c('2022', '2024')] |>
  simplify2array() |>
  varDyadCovar(centered = TRUE)
# combine variables into RSiena data object
# NOTE(review): `mig` is not defined in this file — presumably created in an
# earlier chunk; verify before running standalone
mydata <- sienaDataCreate(
  net, gender, soc,
  mig$M4, mig$other,
  # uni$ru, uni$ul, uni$uu, uni$uva, uni$uvg, uni$uvt, uni$vu,
  uni$uu,
  uni$uva, uni$uvg,
  funcs[[1]], funcs[[2]], funcs[[3]], funcs[[4]],
  intdis, dissim
)
# write the initial descriptive report into BASEDIR/results
results_dir <- file.path(BASEDIR, "results")
if (!dir.exists(results_dir)) dir.create(results_dir)
setwd(results_dir)
print01Report(mydata, modelname = "./init")
# model instability: perhaps throwing cases with full missingness out
# specify model
library(glue)
# Add the baseline structural and covariate effects shared by all models.
# Takes and returns an RSiena effects object (as built by getEffects()).
includeDefaultEffects <- function(myeff){
# structural effects: density, reciprocity, indegree popularity
myeff <- includeEffects(myeff, density, recip, inPop)
# myeff <- includeEffects(myeff, antiInIso) # does not work in later models
# add node characteristics
myeff <- includeEffects(myeff, egoX, interaction1 = "soc")
# sameX soc collinear with university (uu has no pol-sci)
# NOTE: the '$' in the glue template below is literal text, not glue syntax —
# glue("uni${name}") yields e.g. "uni$uu", which presumably matches the
# deparsed covariate names registered by sienaDataCreate(uni$uu, ...)
# (cf. interaction1 = 'funcs[[1]]' later in this file). Do not "fix" it.
for (name in names(uni)){
myeff <- includeEffects(myeff, egoX, interaction1 = glue("uni${name}"))
# myeff <- includeEffects(myeff, simX, interaction1 = glue("uni${name}"))
}
return(myeff)
}
# Model 1: structural effects plus default covariate effects.
myeff <- getEffects(mydata)
myeff <- includeDefaultEffects(myeff)
myAlgorithm <- sienaAlgorithmCreate(
  projname = "init"  # trailing comma removed (empty argument)
  # n3 = 3000 # for publication ready results
)
# estimate model
ansm1 <- siena07(
  myAlgorithm,
  data = mydata,
  effects = myeff,
  returnDeps = TRUE  # keep simulated networks with the result
)
ansm1
save(ansm1, file = './ansm1')
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 2: add gender effects on top of model 1's specification
# (myeff is carried over from the previous model, not rebuilt).
myeff <- includeEffects(myeff, egoX, interaction1 = "gender") # altX, sameX
myeff <- includeEffects(myeff, altX, interaction1 = "gender")
myeff <- includeEffects(myeff, sameX, interaction1 = "gender")
# too few minority in sample leave out for now.
# myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'mig$M4') # altX, sameX
# myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'mig$other') # altX, sameX
ansm2 <- siena07(
myAlgorithm,
data = mydata,
effects = myeff,
returnDeps = TRUE
)
save(ansm2, file = './ansm2')
ansm2
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 3: add academic-position (function) effects, extending model 2's
# effects object. The 'funcs[[k]]' strings match the deparsed covariate
# names registered by sienaDataCreate(funcs[[1]], ...).
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[1]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[2]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[3]]')
myeff <- includeEffects(myeff, egoX, altX, simX, interaction1 = 'funcs[[4]]')
ansm3 <- siena07(
  myAlgorithm,
  data = mydata,
  effects = myeff,
  returnDeps = TRUE  # trailing comma removed (empty argument)
)
save(ansm3, file = './ansm3')
ansm3
# myeff <- getEffects(mydata)
# myeff <- includeDefaultEffects(myeff)
# Model 4: add interdisciplinarity effects, extending model 3's effects.
myeff <- includeEffects(myeff, egoX, altX, interaction1 = 'intdis') # simX does not work
# myeff <- includeEffects(myeff, effFrom, interaction1 = 'dissim') # this does not work
ansm4 <- siena07(
  myAlgorithm,
  data = mydata,
  effects = myeff,
  returnDeps = TRUE  # trailing comma removed (empty argument)
)
save(ansm4, file = './ansm4')
ansm4

# Turn a fitted siena07 result into a tidy table: one row per included
# effect, plus rate-parameter rows and the overall maximum convergence ratio.
#
# Args: ans — object returned by siena07().
# Returns: tibble with estimate, se, z, p_value, significance stars, and
#          formatted columns (festimate, fse) for publication tables.
tidy_rsiena <- function(ans) {
  # extract included effects with estimates, SEs and convergence t-ratios
  effects <- tibble(
    depvar = ans$effects$name,
    effect = ans$effects$effectName,
    short = ans$effects$shortName,
    interaction1 = ans$effects$interaction1,
    interaction2 = ans$effects$interaction2,
    include = ans$effects$include,
    estimate = ans$theta,
    se = ans$se,
    conv_t = ans$tconv
  ) |>
    filter(include)
  # rate parameters: one per period, labels generalized from the previous
  # hard-coded "period 1"/"period 2" (identical output for two periods)
  rate <- tibble(
    depvar = unique(ans$effects$name),
    effect = paste('Rate parameter period', seq_along(ans$rate)),
    short = 'rate',
    estimate = ans$rate,
    se = ans$vrate
  )
  # overall maximum convergence ratio (single diagnostic value)
  tconv.max <- tibble(
    depvar = unique(ans$effects$name),
    effect = 'Overall maximum convergence ratio',
    short = 'tconv.max',
    estimate = ans$tconv.max[[1]]
  )
  # combine, add z / p / stars, and build formatted display columns
  rate |>
    bind_rows(effects) |>
    bind_rows(tconv.max) |>
    mutate(
      z = estimate / se,
      # z is meaningless for rate and convergence rows
      z = ifelse(short %in% c('rate', 'tconv.max'), NA, z),
      p_value = 2 * pnorm(-abs(z)),
      stars = case_when(
        p_value < 0.001 ~ "***",
        p_value < 0.01 ~ "**",
        p_value < 0.05 ~ "*",
        p_value < 0.1 ~ "+",
        TRUE ~ ""
      ),
      festimate = sprintf("%.3f%s", estimate, stars),
      fse = sprintf("(%.3f)", se),
      fse = ifelse(short == 'tconv.max', '', fse),
      z = round(z, 3),
      p_value = round(p_value, 4),
      conv_t = round(conv_t, 3)
    ) |>
    select(
      depvar, effect, short, interaction1, interaction2,
      estimate, se, z, p_value, stars, conv_t, festimate, fse
    )
}
# Collect tidy result tables for the four fitted models in one named list.
res <- list(
  ansm1 = tidy_rsiena(ansm1),
  ansm2 = tidy_rsiena(ansm2),
  ansm3 = tidy_rsiena(ansm3),
  ansm4 = tidy_rsiena(ansm4)
)