diff --git a/DESCRIPTION b/DESCRIPTION index ea56a6f..eac6171 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ARTnet -Version: 2.9.0 -Date: 2024-09-06 +Version: 2.9.1 +Date: 2026-06-08 Title: Epidemic and Network Model Parameterization with the ARTnet Dataset Description: Epidemic and Network Model Parameterization with the ARTnet Dataset. Maintainer: Samuel Jenness @@ -21,9 +21,9 @@ Suggests: survival, testthat (>= 3.0.0) VignetteBuilder: knitr -RoxygenNote: 7.3.3 Encoding: UTF-8 Remotes: github::EpiModel/ARTnetData@main Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 +Config/roxygen2/version: 8.0.0 diff --git a/R/NetStats.R b/R/NetStats.R index 22cc17d..db06bea 100644 --- a/R/NetStats.R +++ b/R/NetStats.R @@ -49,6 +49,43 @@ } +# Tabulate `x` over a fixed, complete set of `levels` so that downstream +# multiplication by a per-level netparams vector (nf.race, nf.age.grp, +# nf.risk.grp, nf.deg.tot, nf.diag.status) never silently recycles when a +# level is unobserved in the synthetic population. That can happen when a +# user passes a target_pop with empty cells (e.g. no nodes in a risk +# quintile or age group). Returns a "table" object so that the resulting +# nodefactor fields keep the same class/structure as the prior `table(x)` +# calls; with the default sampling path every level is populated and the +# `levels` are the sorted observed values, so this is byte-identical to the +# prior `table(x)` behavior (verified by the backward-compat snapshot). +.tab_levels <- function(x, levels) { + table(factor(x, levels = levels)) +} + +# Clamp a user-supplied target_pop attribute to the support the joint GLMs +# were fit on. Out-of-support values are a silent correctness hazard: the +# Poisson degree models extrapolate, and the nodefactor aggregations (which +# sum predicted degree only over the fitted levels) drop the out-of-support +# nodes' degree mass, breaking the sum(nodefactor) == 2 * edges invariant. +# Warns when clamping actually changes a value. +.clamp_support <- function(x, allowed, nm) { + oob <- !(x %in% allowed) + if (any(oob, na.rm = TRUE)) { + warning(sprintf( + paste0("target_pop$%s has %d value(s) outside the fitted support ", + "{%s}; clamping to range. Out-of-support values would otherwise ", + "break the sum(nodefactor_%s) == 2 * edges invariant. Supply ", + "integer-coded attributes within the fitted support."), + nm, sum(oob, na.rm = TRUE), + paste(range(allowed), collapse = "-"), nm), + call. = FALSE) + x <- pmin(pmax(x, min(allowed)), max(allowed)) + } + x +} + + # Aggregate synth-population stratum-level durations under joint_lm. Per-ego # predicted log(duration) from the joint_lm fit, marginalized over partner-race # uncertainty using joint_nm_race_model when race = TRUE, then exponentiated @@ -67,7 +104,8 @@ joint_nm_race_model, synth_data, n_age_grps, - sex_cess_extra_row = FALSE) { + sex_cess_extra_row = FALSE, + fallback_mean_dur_adj = NULL) { if (is.null(joint_dur_model)) return(NULL) if (!is.null(joint_nm_race_model)) { @@ -106,6 +144,27 @@ NA_real_, 1 / (1 - 2^(-1 / medians))) + # Per-stratum fallback to the within-ARTnet byage durations from + # build_netparams when a synth stratum is empty (no nodes in that age + # group) or yields a non-positive median. Without this, NA durations + # propagate into dissolution_coefs' `if (any(duration < 1))` check and + # abort the whole build. A restricted-age target_pop (which is common: + # forward-projected MSM populations often drop the oldest stratum) would + # otherwise crash. Mirrors build_netparams' own per-stratum empirical + # fallback in compute_alt_durs(). The fallback vector follows the same row + # layout (nonmatch first, then matched 1..N, plus an optional sex-cessation + # row), so drop the cessation row before substituting if present. + if (!is.null(fallback_mean_dur_adj)) { + fb <- fallback_mean_dur_adj + if (isTRUE(sex_cess_extra_row) && length(fb) == length(mean_dur_adj) + 1L) { + fb <- fb[-length(fb)] + } + if (length(fb) == length(mean_dur_adj)) { + na_idx <- which(is.na(mean_dur_adj)) + mean_dur_adj[na_idx] <- fb[na_idx] + } + } + if (isTRUE(sex_cess_extra_row)) mean_dur_adj <- c(mean_dur_adj, 1) mean_dur_adj } @@ -151,7 +210,15 @@ #' (derived from required columns if absent). When supplied, attribute sampling is #' bypassed entirely and `network.size` is set to `nrow(target_pop)`. This form is #' for users with a fully-specified joint synthetic population (e.g., post-stratified -#' to NHBS or AMIS demographics). +#' to NHBS or AMIS demographics). Note (`method = "joint"`): the supplied `deg.main` +#' and `deg.casl` columns are used **only as covariates** in the cross-layer degree +#' GLMs; the layer edge counts are re-predicted by g-computation, not taken from the +#' supplied degrees. Likewise `diag.status` enters the degree GLMs as a predictor, so +#' the HIV composition of the target population affects edges and not just +#' `nodefactor_diag.status`. Degree/role/risk columns outside the fitted support +#' (`deg.main` 0-2, `deg.casl`/`deg.tot` 0-3, `role.class` 0-2, `risk.grp` +#' 1-`nquants`) are clamped with a warning to preserve the +#' `sum(nodefactor) = 2 * edges` invariant. #' \item A **character string** naming a built-in reference population. Currently #' raises an informative error. The planned set is geography-specific general male #' population demographics (NCHS age pyramid + `ARTnetData::race.dist` by @@ -180,6 +247,11 @@ #' stratum. `nodefactor_risk.grp` and `diss.homog` still use the within-ARTnet #' univariate / aggregated values — those are not consumed by the standard #' EpiModelHIV-Template dissolution offset. +#' @param seed Optional integer. If supplied, `set.seed(seed)` is called at the start of the +#' function for reproducibility of the stochastic sampling path (default `target_pop` and +#' list forms). Defaults to `NULL` (no seeding), which preserves prior behavior. The +#' data.frame form of `target_pop` bypasses attribute sampling and is deterministic +#' regardless of this argument. #' @param browser If `TRUE`, run `build_netparams` in interactive browser mode. #' #' @details @@ -244,9 +316,17 @@ build_netstats <- function(epistats, netparams, young.prop = NULL, method = c("existing", "joint"), target_pop = NULL, + seed = NULL, browser = FALSE) { method <- match.arg(method) + # Optional reproducibility. The default sampling path draws the synthetic + # population stochastically (sample / runif / rbinom / apportion_lr), so two + # calls with identical arguments otherwise differ at the Monte Carlo noise + # level. Setting `seed` makes a run reproducible. The data.frame form of + # target_pop bypasses attribute sampling and is already deterministic. + if (!is.null(seed)) set.seed(seed) + # Ensures that ARTnetData is installed if (system.file(package = "ARTnetData") == "") stop(missing_data_msg) @@ -466,12 +546,15 @@ build_netstats <- function(epistats, netparams, attr_race <- apportion_lr(num, seq_along(flattened_race_level), race_numbers / num, shuffled = TRUE) } - attr_deg.casl <- df$deg.casl - attr_deg.main <- df$deg.main - attr_deg.tot <- if (!is.null(df$deg.tot)) df$deg.tot else + attr_deg.casl <- .clamp_support(df$deg.casl, 0:3, "deg.casl") + attr_deg.main <- .clamp_support(df$deg.main, 0:2, "deg.main") + attr_deg.tot <- if (!is.null(df$deg.tot)) { + .clamp_support(df$deg.tot, 0:3, "deg.tot") + } else { pmin(attr_deg.main + attr_deg.casl, 3L) - attr_risk.grp <- df$risk.grp - attr_role.class <- df$role.class + } + attr_risk.grp <- .clamp_support(df$risk.grp, seq_len(nquants), "risk.grp") + attr_role.class <- .clamp_support(df$role.class, 0:2, "role.class") } else { # ---- Existing sampling path (default + list-form marginal overrides) ---- if (!is.null(age.pyramid)) { @@ -700,14 +783,16 @@ build_netstats <- function(epistats, netparams, joint_nm_race_model = netparams$main$joint_nm_race_model, synth_data = synth, n_age_grps = n_age_grps_main, - sex_cess_extra_row = sex.cess.mod + sex_cess_extra_row = sex.cess.mod, + fallback_mean_dur_adj = netparams$main$durs.main.byage$mean.dur.adj ) synth_dur_casl_byage <- .aggregate_synth_byage_durations( joint_dur_model = netparams$casl$joint_duration_model, joint_nm_race_model = netparams$casl$joint_nm_race_model, synth_data = synth, n_age_grps = n_age_grps_casl, - sex_cess_extra_row = sex.cess.mod + sex_cess_extra_row = sex.cess.mod, + fallback_mean_dur_adj = netparams$casl$durs.casl.byage$mean.dur.adj ) } @@ -723,10 +808,12 @@ build_netstats <- function(epistats, netparams, if (edges.avg == FALSE) { out$main$edges <- (netparams$main$md.main * num) / 2 } else { - out$main$edges <- sum(unname(table(out$attr$race)) * netparams$main$nf.race) / 2 + out$main$edges <- sum(.tab_levels(out$attr$race, seq_along(netparams$main$nf.race)) * + netparams$main$nf.race) / 2 } # nodefactor("race") --- - nodefactor_race <- table(out$attr$race) * netparams$main$nf.race + nodefactor_race <- .tab_levels(out$attr$race, seq_along(netparams$main$nf.race)) * + netparams$main$nf.race out$main$nodefactor_race <- unname(nodefactor_race) # nodematch("race") --- @@ -741,7 +828,8 @@ build_netstats <- function(epistats, netparams, } # nodefactor("age.grp") --- - nodefactor_age.grp <- table(out$attr$age.grp) * netparams$main$nf.age.grp + nodefactor_age.grp <- .tab_levels(out$attr$age.grp, seq_along(netparams$main$nf.age.grp)) * + netparams$main$nf.age.grp if (sex.cess.mod == TRUE) { nodefactor_age.grp[length(nodefactor_age.grp)] <- 0 } @@ -766,7 +854,8 @@ build_netstats <- function(epistats, netparams, out$main$concurrent <- num * netparams$main$concurrent # nodefactor("diag.status") --- - nodefactor_diag.status <- table(out$attr$diag.status) * netparams$main$nf.diag.status + nodefactor_diag.status <- .tab_levels(out$attr$diag.status, 0:1) * + netparams$main$nf.diag.status out$main$nodefactor_diag.status <- unname(nodefactor_diag.status) } else { # method == "joint" @@ -847,10 +936,12 @@ build_netstats <- function(epistats, netparams, if (edges.avg == FALSE) { out$casl$edges <- (netparams$casl$md.casl * num) / 2 } else { - out$casl$edges <- sum(unname(table(out$attr$race)) * netparams$casl$nf.race) / 2 + out$casl$edges <- sum(.tab_levels(out$attr$race, seq_along(netparams$casl$nf.race)) * + netparams$casl$nf.race) / 2 } # nodefactor("race") --- - nodefactor_race <- table(out$attr$race) * netparams$casl$nf.race + nodefactor_race <- .tab_levels(out$attr$race, seq_along(netparams$casl$nf.race)) * + netparams$casl$nf.race out$casl$nodefactor_race <- unname(nodefactor_race) # nodematch("race") --- @@ -865,7 +956,8 @@ build_netstats <- function(epistats, netparams, } # nodefactor("age.grp") --- - nodefactor_age.grp <- table(out$attr$age.grp) * netparams$casl$nf.age.grp + nodefactor_age.grp <- .tab_levels(out$attr$age.grp, seq_along(netparams$casl$nf.age.grp)) * + netparams$casl$nf.age.grp if (sex.cess.mod == TRUE) { nodefactor_age.grp[length(nodefactor_age.grp)] <- 0 } @@ -890,7 +982,8 @@ build_netstats <- function(epistats, netparams, out$casl$concurrent <- num * netparams$casl$concurrent # nodefactor("diag.status") --- - nodefactor_diag.status <- table(out$attr$diag.status) * netparams$casl$nf.diag.status + nodefactor_diag.status <- .tab_levels(out$attr$diag.status, 0:1) * + netparams$casl$nf.diag.status out$casl$nodefactor_diag.status <- unname(nodefactor_diag.status) } else { # method == "joint" @@ -954,10 +1047,12 @@ build_netstats <- function(epistats, netparams, if (edges.avg == FALSE) { out$inst$edges <- (netparams$inst$md.inst * num) / 2 } else { - out$inst$edges <- sum(unname(table(out$attr$race)) * netparams$inst$nf.race) / 2 + out$inst$edges <- sum(.tab_levels(out$attr$race, seq_along(netparams$inst$nf.race)) * + netparams$inst$nf.race) / 2 } # nodefactor("race") --- - nodefactor_race <- table(out$attr$race) * netparams$inst$nf.race + nodefactor_race <- .tab_levels(out$attr$race, seq_along(netparams$inst$nf.race)) * + netparams$inst$nf.race out$inst$nodefactor_race <- unname(nodefactor_race) # nodematch("race") --- @@ -972,7 +1067,8 @@ build_netstats <- function(epistats, netparams, } # nodefactor("age.grp") --- - nodefactor_age.grp <- table(out$attr$age.grp) * netparams$inst$nf.age.grp + nodefactor_age.grp <- .tab_levels(out$attr$age.grp, seq_along(netparams$inst$nf.age.grp)) * + netparams$inst$nf.age.grp if (sex.cess.mod == TRUE) { nodefactor_age.grp[length(nodefactor_age.grp)] <- 0 } @@ -991,15 +1087,17 @@ build_netstats <- function(epistats, netparams, out$inst$absdiff_sqrt.age <- absdiff_sqrt.age # nodefactor("risk.grp") --- - nodefactor_risk.grp <- table(out$attr$risk.grp) * netparams$inst$nf.risk.grp + nodefactor_risk.grp <- .tab_levels(out$attr$risk.grp, seq_len(nquants)) * + netparams$inst$nf.risk.grp out$inst$nodefactor_risk.grp <- unname(nodefactor_risk.grp) # nodefactor("deg.tot") --- - nodefactor_deg.tot <- table(out$attr$deg.tot) * netparams$inst$nf.deg.tot + nodefactor_deg.tot <- .tab_levels(out$attr$deg.tot, 0:3) * netparams$inst$nf.deg.tot out$inst$nodefactor_deg.tot <- unname(nodefactor_deg.tot) # nodefactor("diag.status") --- - nodefactor_diag.status <- table(out$attr$diag.status) * netparams$inst$nf.diag.status + nodefactor_diag.status <- .tab_levels(out$attr$diag.status, 0:1) * + netparams$inst$nf.diag.status out$inst$nodefactor_diag.status <- unname(nodefactor_diag.status) } else { # method == "joint" @@ -1030,7 +1128,8 @@ build_netstats <- function(epistats, netparams, out$inst$absdiff_sqrt.age <- sum(edge_wt_inst * dyad_inst$pred_ad_sqrtage) / 2 # nodefactor("risk.grp") -- quantile-scheme preserved from univariate - nodefactor_risk.grp <- table(out$attr$risk.grp) * netparams$inst$nf.risk.grp + nodefactor_risk.grp <- .tab_levels(out$attr$risk.grp, seq_len(nquants)) * + netparams$inst$nf.risk.grp out$inst$nodefactor_risk.grp <- unname(nodefactor_risk.grp) # nodefactor("deg.tot") -- truncated at 3 in build_netparams diff --git a/inst/validation/forward_projection_bridge_decision.md b/inst/validation/forward_projection_bridge_decision.md new file mode 100644 index 0000000..e4a6ef2 --- /dev/null +++ b/inst/validation/forward_projection_bridge_decision.md @@ -0,0 +1,136 @@ +# Forward-projection bridge: the estimand decision (ARTnetPredict to ARTnet) + +Status: decision memo for the PI. Written 2026-06-08 during the deep-dive review of the +joint g-computation refactor and its use as the bridge from ARTnetPredict's forward-projected +populations to updated `netstats`. + +This memo exists because one question on the ARTnetPredict path is a modeling decision, not a +bug, and it determines the architecture of everything downstream. The package-side bugs found +during the review have been fixed (see the last section); this is the part that needs a call. + +## The question in one sentence + +When we push ARTnetPredict's forward-projected 2022-24 population into ARTnet to get updated +network targets, should the temporal change enter through the population's **attribute +distribution** (Option A) or through ARTnetPredict's **projected degrees** (Option B)? They +give different answers and only one of them can ride through `build_netstats(target_pop=)`. + +## What the refactor gives us, and the catch + +`build_netstats(method = "joint", target_pop = df)` takes a one-row-per-node population and +runs g-computation (direct standardization): it predicts each node's expected degree from the +joint Poisson GLMs **fit on ARTnet 2017-18**, then aggregates `edges = sum(pred_deg)/2`, +`nodefactor[level] = sum(pred_deg | level)`, internally consistent by construction. + +The catch, verified live against current HEAD: the `deg.main` / `deg.casl` columns you supply +are used **only as covariates**; the edge counts are re-predicted, not taken from the supplied +degrees. + +``` +supplied mean deg.casl = 1.50 -> implied casl edges 3753 ; g-comp casl edges 1182 +supplied mean deg.main = 1.01 -> implied main edges 2528 ; g-comp main edges 680 +``` + +There is no fixed point. Forcing everyone to a given casual degree changes the predicted main +degree but never reproduces the input: + +``` +supplied deg.casl = 0 (all) -> predicted mean main degree 0.459 +supplied deg.casl = 3 (all) -> predicted mean main degree 0.222 +``` + +This is correct behavior for g-computation. It is also the crux of the problem. + +## Why it matters for ARTnetPredict + +ARTnetPredict's headline result is a **conditional behavioral** change: after standardizing to +the 2017-18 age distribution, casual degree still grows +20 to +70% (Step 8b), and total +partners/year is +31% pooled 2022-24. That change lives in the conditional behavior, not in the +demographic composition. + +`target_pop` holds conditional behavior fixed at ARTnet 2017-18 and only swaps the attribute +distribution. So feeding ARTnetPredict's projected population through `target_pop` would +transport the demographic shift (age, race, HIV mix) but reproduce roughly 2017-18 partner-count +behavior on top of it. It cannot, on its own, carry the +31% finding. The HIV mix does move +edges (`diag.status` is a degree predictor: all-negative gives main degree 0.346, all-positive +0.442, about a 28% swing), so composition is not nothing, but it is not the behavioral signal. + +## The three options + +**Option A: attribute standardization (what target_pop does today).** +Supply the projected attribute distribution; let ARTnet's frozen 2017-18 GLMs predict degrees. +- Answers: "what would 2017-18 ARTnet network structure look like on a 2022-24 demographic + population?" +- Loses: the projected behavioral change. The +31% does not appear. +- Cost: near zero. Already built, validated, snapshot-backed. + +**Option B: transport ARTnetPredict's projected degrees.** +Make the projected degrees drive the targets. +- Answers: "what are the 2022-24 network targets given projected behavior?" +- Requirement: `target_pop` is the wrong hook (it discards supplied degrees). Needs either a + custom netstats assembler that uses ARTnetPredict's degrees directly for the degree-based + targets, or re-fitting ARTnet's joint GLMs on projected node-level (attribute, degree) data. +- Cost: real engineering; this is essentially the original Step 9 Part B plan. + +**Option C: refit on projected data, reuse ARTnet's machinery (recommended).** +Re-fit the joint Poisson/binomial GLMs on ARTnetPredict's projected per-node (attribute, +degree) pairs, then reuse ARTnet's g-computation aggregation for internal consistency and +borrow ARTnet's joint dyad / mixing / duration models only where AMIS lacks the data +(partner attributes, role.class). This gets both the projected behavior and the +internal-consistency guarantee, without a second from-scratch g-comp implementation. +- This reframes the earlier "Step 9 Part B is ~80-90% subsumed by the refactor" read: the + refactor subsumes the **aggregation and post-stratification machinery**, but **not** the + fit-on-projected-data step, which is exactly what carries the behavioral change. + +## The go/no-go experiment (do this before committing to an architecture) + +Build the realized 2022-24 node data.frame, run it through Option A, and compare the resulting +`edges` / `nodefactor` to ARTnetPredict's own projected `md.*` / `nf.*`. If Option A does not +reproduce the +31% casual shift (expected, since attributes alone should not carry it), that is +the definitive evidence that Option A is insufficient and the project needs Option B/C. Cheap, +and it settles the fork with data rather than argument. + +## Secondary decisions that ride along + +- **HIV prevalence for the projected population.** `diag.status` is a degree predictor (about + 28% swing), so the chosen prevalence scales edges, not just `nodefactor_diag.status`. AMIS/SL + do not project it. Decide: supply `df$diag.status` from a chosen 2022-24 prevalence, or fall + back to `epistats$init.hiv.prev`. +- **role.class.** AMIS has no sex-role variable; carry it from the ARTnet canonical + distribution. Impact is confined to node composition (`nodematch_role.class` is structurally + c(0,0)), so carrying it forward is defensible. +- **Durations.** ARTnetPredict projects scalar mean durations (D_main -36%, D_casl +38%); + ARTnet aggregates `joint_lm` log-duration predictions into the `diss.byage` offset object the + template consumes, targeting a different estimand (cross-sectional extant-tie age, not full + duration). Decide whether to trust ARTnet's `diss.byage` over the projected population + (internally consistent) or splice the projected scalars (reintroduces the inconsistency the + refactor fixed). +- **Population size N.** Independent of the numerics (counts scale linearly with N; rates and + mixing are N-invariant), but the data.frame form forces `network.size = nrow(df)`, so N is + chosen at df-assembly time. Can proceed in parallel with the popsize2026 work. + +## Dependency notes for whoever wires this + +- ARTnetPredict's `renv.lock` pins ARTnet to the pre-refactor SHA `a19ffe98`; the joint API + does not exist in the pinned dependency. Bump the pin to a post-refactor commit and + `renv::restore` before any of this is callable in the project environment. +- ARTnetPredict currently emits aggregate targets plus a per-node table of soft predictions + (`resp_preds` / `slot_preds`), not realized integer node attributes. A realization adapter + (sample/round the slot decomposition into integer `deg.main`/`deg.casl`/`deg.tot`, carry + `role.class`/`diag.status`, integer-code race to ARTnet's level order, clamp to fitted + support, drop/recode age>65) is the one genuinely non-subsumed engineering piece. + +## Package fixes already made (so this memo stands alone) + +These were unambiguous bugs found during the review and are fixed in `R/NetStats.R`, with +backward compatibility verified byte-identical (snapshot 3/3) and regression tests added in +`tests/testthat/test-target-pop-robustness.R`: + +1. `method="joint"` + `duration.method="joint_lm"` + a `target_pop` with an empty age stratum + (e.g. a population restricted below 55) crashed in `dissolution_coefs`. Now falls back to the + within-ARTnet byage durations per empty stratum. +2. Degree / role / risk values outside the fitted support are clamped with a warning so the + `sum(nodefactor) = 2 * edges` invariant holds. +3. Nodefactor tabulations now span the full level set, so an unobserved level + (e.g. an absent risk quintile) no longer silently recycles a per-level vector. +4. Optional `seed` argument to `build_netstats` for reproducibility of the sampling path. diff --git a/inst/validation/method_comparison.R b/inst/validation/method_comparison.R index da90760..a99c55f 100644 --- a/inst/validation/method_comparison.R +++ b/inst/validation/method_comparison.R @@ -214,12 +214,17 @@ render_comparison_report <- function(comparison, out() out("## Scenarios") out() + out("Derived from the scenarios actually present in the comparison object, ", + "so this table always matches the data below.") + out() out("| Scenario | Description |") out("|---|---|") - out("| `atlanta_default` | Baseline EpiModelHIV-Template config (Atlanta, race = TRUE) |") - out("| `national_no_geog` | No geographic stratification (sanity check) |") - out("| `atlanta_nhbs_shifted` | Atlanta config with `race.prop = c(0.35, 0.25, 0.40)` (NHBS-MSM-like) |") - out("| `atlanta_no_race` | `race = FALSE` path (sanity check) |") + for (s in unique(comparison$scenario)) { + city <- gsub("_default$", "", s) + city <- paste0(toupper(substring(city, 1, 1)), substring(city, 2)) + out("| `", s, "` | City config (", city, + ", race = TRUE, init.hiv.prev = c(0.33, 0.137, 0.084)) |") + } out() out("## High-level summary") out() diff --git a/inst/validation/method_comparison.md b/inst/validation/method_comparison.md index f312f6f..305c719 100644 --- a/inst/validation/method_comparison.md +++ b/inst/validation/method_comparison.md @@ -1,17 +1,19 @@ # Method comparison: marginal vs joint g-computation -Generated by `inst/validation/method_comparison.R` on 2026-04-25. Phase 1.5 of the joint g-comp refactor; closes part of issue #65. +Generated by `inst/validation/method_comparison.R` on 2026-06-08. Phase 1.5 of the joint g-comp refactor; closes part of issue #65. ARTnet version: 2.9.0. Seed: 20260420. Network size: 5000. ## Scenarios +Derived from the scenarios actually present in the comparison object, so this table always matches the data below. + | Scenario | Description | |---|---| -| `atlanta_default` | Baseline EpiModelHIV-Template config (Atlanta, race = TRUE) | -| `national_no_geog` | No geographic stratification (sanity check) | -| `atlanta_nhbs_shifted` | Atlanta config with `race.prop = c(0.35, 0.25, 0.40)` (NHBS-MSM-like) | -| `atlanta_no_race` | `race = FALSE` path (sanity check) | +| `atlanta_default` | City config (Atlanta, race = TRUE, init.hiv.prev = c(0.33, 0.137, 0.084)) | +| `boston_default` | City config (Boston, race = TRUE, init.hiv.prev = c(0.33, 0.137, 0.084)) | +| `chicago_default` | City config (Chicago, race = TRUE, init.hiv.prev = c(0.33, 0.137, 0.084)) | +| `seattle_default` | City config (Seattle, race = TRUE, init.hiv.prev = c(0.33, 0.137, 0.084)) | ## High-level summary diff --git a/man/ARTnet-package.Rd b/man/ARTnet-package.Rd index 1039f06..9e8bd14 100644 --- a/man/ARTnet-package.Rd +++ b/man/ARTnet-package.Rd @@ -31,5 +31,10 @@ Useful links: \author{ \strong{Maintainer}: Samuel Jenness \email{samuel.m.jenness@emory.edu} +Authors: +\itemize{ + \item Samuel Jenness \email{samuel.m.jenness@emory.edu} +} + } \keyword{package} diff --git a/man/build_netstats.Rd b/man/build_netstats.Rd index 061f5b9..f6c1e61 100644 --- a/man/build_netstats.Rd +++ b/man/build_netstats.Rd @@ -15,6 +15,7 @@ build_netstats( young.prop = NULL, method = c("existing", "joint"), target_pop = NULL, + seed = NULL, browser = FALSE ) } @@ -25,7 +26,7 @@ build_netstats( \item{network.size}{Size of the starting network.} -\item{expect.mort}{Expected average mortality level to pass into \code{\link[EpiModel:dissolution_coefs]{EpiModel::dissolution_coefs}} function.} +\item{expect.mort}{Expected average mortality level to pass into \code{\link[EpiModel:dissolution_coefs]{dissolution_coefs}} function.} \item{age.pyramid}{Numerical vector of length equal to the length of the age range specified in \code{\link{build_epistats}}, containing probability distribution of each year of age, summing to @@ -84,7 +85,15 @@ Optional columns: \code{sqrt.age}, \code{age.grp}, \code{active.sex}, \code{deg. (derived from required columns if absent). When supplied, attribute sampling is bypassed entirely and \code{network.size} is set to \code{nrow(target_pop)}. This form is for users with a fully-specified joint synthetic population (e.g., post-stratified -to NHBS or AMIS demographics). +to NHBS or AMIS demographics). Note (\code{method = "joint"}): the supplied \code{deg.main} +and \code{deg.casl} columns are used \strong{only as covariates} in the cross-layer degree +GLMs; the layer edge counts are re-predicted by g-computation, not taken from the +supplied degrees. Likewise \code{diag.status} enters the degree GLMs as a predictor, so +the HIV composition of the target population affects edges and not just +\code{nodefactor_diag.status}. Degree/role/risk columns outside the fitted support +(\code{deg.main} 0-2, \code{deg.casl}/\code{deg.tot} 0-3, \code{role.class} 0-2, \code{risk.grp} +1-\code{nquants}) are clamped with a warning to preserve the +\code{sum(nodefactor) = 2 * edges} invariant. \item A \strong{character string} naming a built-in reference population. Currently raises an informative error. The planned set is geography-specific general male population demographics (NCHS age pyramid + \code{ARTnetData::race.dist} by @@ -92,6 +101,12 @@ geography) — bundles like \code{"atlanta"} or \code{"us_msm_male"} packaged fr already in the package, no NHBS or other restricted data required. }} +\item{seed}{Optional integer. If supplied, \code{set.seed(seed)} is called at the start of the +function for reproducibility of the stochastic sampling path (default \code{target_pop} and +list forms). Defaults to \code{NULL} (no seeding), which preserves prior behavior. The +data.frame form of \code{target_pop} bypasses attribute sampling and is deterministic +regardless of this argument.} + \item{browser}{If \code{TRUE}, run \code{build_netparams} in interactive browser mode.} } \description{ @@ -101,7 +116,7 @@ use in the EpiModelHIV workflow. } \details{ This function takes output from \code{\link{build_epistats}} and \code{\link{build_netparams}} to build the relevant -population-level network statistics that will be used in network estimation using the \link[EpiModel:EpiModel-package]{EpiModel::EpiModel} +population-level network statistics that will be used in network estimation using the \link[EpiModel:EpiModel]{EpiModel} package. The parameter \code{edge.avg} allows a user specify how the network edges statistics as estimated from diff --git a/tests/testthat/test-target-pop-robustness.R b/tests/testthat/test-target-pop-robustness.R new file mode 100644 index 0000000..f615914 --- /dev/null +++ b/tests/testthat/test-target-pop-robustness.R @@ -0,0 +1,144 @@ +# Robustness tests for build_netstats() with target_pop, covering bug fixes +# surfaced during the ARTnetPredict forward-projection bridge review: +# 1. joint_lm + target_pop must not crash on empty/sparse age strata +# (per-stratum fallback to netparams byage durations). +# 2. Out-of-support degree/role/risk values are clamped (with a warning) so +# the sum(nodefactor) == 2 * edges invariant is preserved. +# 3. nodefactor over a fixed level set never recycles when a level is +# unobserved in the target population. +# 4. The `seed` argument makes the stochastic sampling path reproducible. + +skip_without_artnetdata <- function() { + testthat::skip_if(system.file(package = "ARTnetData") == "", + "ARTnetData not installed") +} + +have_data <- system.file(package = "ARTnetData") != "" +if (have_data) { + set.seed(20260420L) + .ep <- build_epistats(geog.lvl = "city", geog.cat = "Atlanta", + init.hiv.prev = c(0.33, 0.137, 0.084), + race = TRUE, time.unit = 7) + set.seed(20260420L) + .np_joint <- build_netparams(.ep, smooth.main.dur = TRUE, + method = "joint", duration.method = "joint_lm") +} + +mk_df <- function(n, age_lo, age_hi, risk_levels = 1:5) { + data.frame( + age = stats::runif(n, age_lo, age_hi), + race = sample(1:3, n, replace = TRUE, prob = c(0.5, 0.05, 0.45)), + deg.casl = sample(0:3, n, replace = TRUE), + deg.main = sample(0:2, n, replace = TRUE), + role.class = sample(0:2, n, replace = TRUE), + risk.grp = sample(risk_levels, n, replace = TRUE) + ) +} + + +# ---- 1. joint_lm + empty age stratum must not crash ------------------------ + +test_that("joint_lm + target_pop with an empty top age stratum does not crash", { + skip_without_artnetdata() + set.seed(11L) + # age in [20, 54] leaves the 55-64 age group (group 5) empty under the + # default breaks 15/25/35/45/55/65. Before the fix this aborted in + # dissolution_coefs with "missing value where TRUE/FALSE needed". + df <- mk_df(3000, 20, 54) + expect_true(all(df$age < 55)) + ns <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df) + expect_true(is.finite(ns$main$edges)) + expect_true(all(is.finite(ns$main$diss.byage$coef.diss))) + expect_true(all(is.finite(ns$casl$diss.byage$coef.diss))) +}) + +test_that("joint_lm empty-stratum durations fall back to netparams byage", { + skip_without_artnetdata() + set.seed(12L) + df <- mk_df(3000, 20, 54) + ns <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df) + # diss.byage offset must be fully populated (one edges coef + one per + # age-group nodematch offset), all finite, no NA leaking through. + expect_false(anyNA(ns$main$diss.byage$coef.diss)) + expect_false(anyNA(ns$casl$diss.byage$coef.diss)) +}) + + +# ---- 2. out-of-support degrees are clamped, invariant preserved ------------ + +test_that("out-of-support deg.casl is clamped with a warning", { + skip_without_artnetdata() + set.seed(21L) + df <- mk_df(2000, 15, 64) + df$deg.casl[1:50] <- 4L # 4 is outside the fitted support 0:3 + expect_warning( + build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df), + regexp = "deg.casl.*outside the fitted support" + ) +}) + +test_that("clamping preserves the deg.casl nodefactor invariant under joint", { + skip_without_artnetdata() + set.seed(22L) + df <- mk_df(2000, 15, 64) + df$deg.casl[1:50] <- 4L + ns <- suppressWarnings( + build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df)) + # After clamping, sum over the deg.casl-keyed nodefactor must equal 2*edges. + expect_equal(sum(ns$main$nodefactor_deg.casl), 2 * ns$main$edges, + tolerance = 1e-8) +}) + +test_that("in-support degrees produce no clamp warning", { + skip_without_artnetdata() + set.seed(23L) + df <- mk_df(1000, 15, 64) + expect_no_warning( + build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df) + ) +}) + + +# ---- 3. nodefactor over a fixed level set does not recycle ----------------- + +test_that("nodefactor_risk.grp spans all quintiles when a level is absent", { + skip_without_artnetdata() + set.seed(31L) + nquants <- length(.np_joint$inst$nf.risk.grp) + # supply risk.grp only in {1, 2}; levels 3..nquants are unobserved + df <- mk_df(1500, 15, 64, risk_levels = 1:2) + ns <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + method = "joint", target_pop = df) + expect_length(ns$inst$nodefactor_risk.grp, nquants) + # unobserved quintiles contribute exactly zero (no recycling) + expect_equal(as.numeric(ns$inst$nodefactor_risk.grp[3:nquants]), + rep(0, nquants - 2)) +}) + + +# ---- 4. seed reproducibility for the sampling path ------------------------- + +test_that("seed makes the sampling path reproducible under method = 'joint'", { + skip_without_artnetdata() + ns1 <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + network.size = 2000, method = "joint", seed = 123L) + ns2 <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + network.size = 2000, method = "joint", seed = 123L) + expect_equal(ns1$attr$age, ns2$attr$age) + expect_equal(ns1$main$edges, ns2$main$edges) + expect_equal(ns1$casl$edges, ns2$casl$edges) +}) + +test_that("different seeds give different draws", { + skip_without_artnetdata() + ns1 <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + network.size = 2000, method = "joint", seed = 123L) + ns3 <- build_netstats(.ep, .np_joint, expect.mort = 0.000478213, + network.size = 2000, method = "joint", seed = 456L) + expect_false(isTRUE(all.equal(ns1$main$edges, ns3$main$edges))) +})