Files
GraphonSimulation/R/graphon_distribution.R
2026-01-20 15:13:06 +01:00

322 lines
12 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Load the general quantile function, the loading can be improved later
# using the here package.
source("R/qinf.R")
# 1. Distribution Function -----------------------------------------------------
#' Empirical Distribution function for the graphon model.
#'
#' Computes the empirical expectation
#' \deqn{E[F_v(y - a^\top X_i)]}
#' using observed covariate samples \eqn{X_i}. This is done by evaluating the
#' supplied CDF \code{Fv} at the shifted values \eqn{y - a^\top X_i} and taking
#' their empirical average. See also Remark 2.1
#'
#' @param y Numeric scalar or vector of evaluation points.
#' @param a Numeric vector. Coefficient vector used to compute the inner products
#' \eqn{a^\top X_i}. Its length must match the number of columns in
#' \code{X_matrix}.
#' @param Fv Function. Cumulative distribution function of the noise variable
#' \eqn{v} (e.g., \code{pnorm}, \code{pexp}, etc.). Must accept a numeric vector
#' and return a numeric vector of the same length.
#' @param X_matrix Numeric matrix of dimension \eqn{n \times p}, where each row
#' corresponds to an observed covariate vector \eqn{X_i}.
#'
#' @return A numeric scalar giving the empirical expectation
#' \eqn{(1/n) \sum_{i=1}^n F_v(y - a^\top X_i)}.
#'
#' @examples
#' set.seed(1)
#' X <- matrix(rnorm(100), ncol = 2)
#' a <- c(1, -0.5)
#' y <- 0
#' pgraphon(y, a, pnorm, X)
#'
#' @export
pgraphon <- function(
y, # scalar
a, # vector (coefficient vector, can be one-dimensional)
Fv, # CDF function of the v's (e.g., pnorm, pexp, etc.)
X_matrix # n x p matrix: each row is X_i
) {
## 1.1 Check inputs ==========================================================
if (ncol(X_matrix) != length(a)) {
stop("Number of columns in X_matrix must match length of a")
}
if (!is.numeric(y)) {
stop("'y' must be numeric (scalar or vector)")
}
if (!is.numeric(a) || !is.vector(a)) {
stop("'a' must be a numeric vector")
}
if (!is.matrix(X_matrix) || !is.numeric(X_matrix)) {
stop("'X_matrix' must be a numeric matrix")
}
if (!is.function(Fv)) {
stop("'Fv' must be a function (a CDF)")
}
## 1.2 Compute the expected value ============================================
# Compute a^T X_i as in the paper. Here we switched to the R convention that each
# observation is a row. The paper assumes that each observation is a vector
inner_products <- as.vector(X_matrix %*% a) # vector of length n
# outer(y, inner_prod, "-") creates an |y| × n matrix where element (j,i)
# equals y[j] - inner_prod[i].
dev_mat <- outer(y, inner_products, "-")
# Apply CDF Fv to each deviation
cdf_vals <- Fv(dev_mat)
# Row means give the empirical expectation for each y_j
out <- rowMeans(cdf_vals)
out
}
# 2. Density Function ----------------------------------------------------------
#' Empirical Graphon Density Estimate
#'
#' Computes the empirical expectation \eqn{E[f_v(y - a^\top X_i)]} using observed
#' covariate samples \eqn{X_i}. This serves as an empirical approximation of the
#' graphon density evaluated at \eqn{y}, where \eqn{f_v} is the density of the
#' latent variable \eqn{v}. See also Remark 2.1
#'
#' @param y Numeric scalar or vector of points at which the density should be
#' evaluated.
#' @param a Numeric vector. Coefficient vector used to compute the linear
#' projection \eqn{a^\top X_i}. Its length must match the number of columns in
#' \code{X_matrix}.
#' @param fv Function. Density function of the latent variable \eqn{v} (e.g.,
#' \code{dnorm}, \code{dexp}). Must accept a numeric vector and return a
#' numeric vector of the same length.
#' @param X_matrix Numeric matrix of size \eqn{n \times p}. Each row represents
#' an observed covariate vector \eqn{X_i}. The number of columns must match the
#' length of \code{a}.
#'
#' @return A numeric scalar giving the empirical average
#' \eqn{\frac{1}{n} \sum_{i=1}^n f_v(y - a^\top X_i)}.
#'
#' @examples
#' set.seed(1)
#' X <- matrix(rnorm(50), ncol = 2)
#' a <- c(1, -0.5)
#' dgraphon(y = 0.2, a = a, fv = dnorm, X_matrix = X)
#'
#' @export
dgraphon <- function(
y, # scalar
a, # vector (coefficient vector, can be )
fv, # density function of the v's (e.g., dnorm, dexp, etc.)
X_matrix # n x p matrix: each row is X_i
) {
## 2.1 Check inputs ----------------------------------------------------------
if (!is.numeric(y)) {
stop("'y' must be numeric (scalar or vector)")
}
if (!is.numeric(a) || !is.vector(a)) {
stop("'a' must be a numeric vector")
}
if (!is.matrix(X_matrix) || !is.numeric(X_matrix)) {
stop("'X_matrix' must be a numeric matrix")
}
if (ncol(X_matrix) != length(a)) {
stop("Number of columns in X_matrix must match length of a")
}
if (!is.function(fv)) {
stop("'fv' must be a function (a density)")
}
## 1.2 Compute the expected value ============================================
# Compute a^T X_i as in the paper. Here we switched to the R convention that each
# observation is a row. The paper assumes that each observation is a vector
inner_products <- as.vector(X_matrix %*% a) # vector of length n
# outer(y, inner_prod, "-") creates an |y| × n matrix where element (j,i)
# equals y[j] - inner_prod[i].
dev_mat <- outer(y, inner_products, "-")
# Apply the density function to each y[j]
dens_vals <- fv(dev_mat)
# Row means give the empirical expectation for each y_j
out <- rowMeans(dens_vals)
out
}
# 3. Quantile Function ---------------------------------------------------------
#' Quantile of the empirical graphon distribution
#'
#' This is a thin wrapper around the generic infimumtype quantile routine
#' \code{qinf()}. It builds the CDF of the graphon,
#' \eqn{p_{\text{graphon}}(x) = \frac{1}{n}\sum_{i=1}^{n}F_v\bigl(x-a^\top X_i\bigr)},
#' and then finds the quantile(s) for the supplied probability(ies) \code{p}.
#'
#' @param p Numeric vector of probabilities in \eqn{[0,1]}. Values outside this
#' interval are rejected.
#' @param a Numeric coefficient vector. Its length must equal the number of
#' columns of \code{X_matrix}.
#' @param Fv Function. The CDF of the latent variable $v$. It must be
#' vectorised (i.e. accept a numeric vector and return a numeric vector of
#' the same length). Typical examples are \code{pnorm}, \code{pexp}, etc.
#' @param X_matrix Numeric matrix of dimension \eqn{n\times p}. Each row
#' corresponds to an observation $X_i$.
#' @param lower,upper Numeric scalars giving the search interval for the root
#' finder. By default they are set to \code{-Inf} and \code{Inf}.
#' @param tol Numeric tolerance for the bisection algorithm used inside
#' \code{qinf()}. The default is the squareroot of machine epsilon.
#' @param max.iter Maximum number of bisection iterations (safety guard).
#' @param ... Additional arguments that are passed **directly** to the
#' internal CDF \code{pgraphon}. This makes the wrapper flexible you do not
#' have to list every argument (e.g. \code{a}, \code{X_matrix}, \code{Fv})
#' explicitly.
#'
#' @return A numeric vector of the same length as \code{p} containing the
#' quantiles. The vector is named by the probabilities.
#'
#' @examples
#' ## ---- simple normal example ------------------------------------------------
#' set.seed(123)
#' X <- matrix(rnorm(200), ncol = 2) # n = 100, p = 2
#' a <- c(0.7, -0.3)
#' qgraphon(p = c(0.25, 0.5, 0.75),
#' a = a,
#' Fv = pnorm,
#' X_matrix = X)
#'
#' ## ---- mixture example ------------------------------------------------------
#' mix_cdf <- function(x, w = 0.4, mu = 0, sigma = 1) {
#' w * (x >= 0) + (1 - w) * pnorm(x, mean = mu, sd = sigma)
#' }
#' qgraphon(p = seq(0, 1, 0.2),
#' a = a,
#' Fv = mix_cdf,
#' X_matrix = X)
#'
#' @export
qgraphon <- function(
p, # probabilities
a, # coefficient vector
Fv, # CDF of the v's
X_matrix, # X_i samples, matrix
lower = -Inf, # lower bound roots
upper = Inf, # upper bound roots
tol = .Machine$double.eps^0.5, # parameter root finding
max.iter = 100) {
## 3.1 Check inputs ----------------------------------------------------------
if (!is.numeric(p) || any(is.na(p))) {
stop("'p' must be a numeric vector without NA values")
}
if (any(p < 0 | p > 1)) {
stop("All probabilities in 'p' must lie in [0, 1]")
}
if (!is.numeric(a) || !is.vector(a)) {
stop("'a' must be a numeric vector")
}
if (!is.matrix(X_matrix) || !is.numeric(X_matrix)) {
stop("'X_matrix' must be a numeric matrix")
}
if (ncol(X_matrix) != length(a)) {
stop("Number of columns of 'X_matrix' (", ncol(X_matrix),
") must equal length of 'a' (", length(a), ")")
}
if (!is.function(Fv)) {
stop("'Fv' must be a function (the CDF of the latent variable)")
}
## 3.2 Call the generic quantile function ------------------------------------
out <- qinf(F = pgraphon,
p = p,
lower = lower,
upper = upper,
tol = tol,
max.iter = max.iter,
a = a,
X_matrix = X_matrix,
Fv = Fv)
# Name the result by the probabilities this mirrors the behaviour of
# baseR quantile functions (e.g. qnorm, qbeta).
names(out) <- as.character(p)
out
}
# 4. Create conditional density ------------------------------------------------
#' Create a Conditional Density Function for the Graphon Model.
#'
#' This function constructs and returns another function that computes the
#' conditional density of the graphon model at specified values.
#' The returned function takes inputs \code{u} (probabilities) and \code{x}
#' (a scalar covariate value) and evaluates the graphon-based conditional
#' density using the supplied coefficient vector \code{a}, the density
#' \code{fv}, the distribution function \code{Fv}, and the empirical sample
#' \code{X_matrix}.
#'
#' @param a Numeric vector. Coefficient vector used in the graphon model; must
#' have length equal to the number of columns in \code{X_matrix}.
#' @param fv Function. Density function of the latent variable \eqn{v}
#' (e.g., \code{dnorm}, \code{dexp}).
#' @param Fv Function. Distribution function (CDF) corresponding to \code{fv}
#' (e.g., \code{pnorm}, \code{pexp}).
#' @param X_matrix Numeric matrix. An \eqn{n \times p} matrix of covariates
#' \eqn{X_i}, where each row corresponds to an observation.
#'
#' @return A function of two arguments \code{u} and \code{x}.
#' The returned function is vectorized in \code{u} and returns the conditional
#' density:
#' \deqn{ f_{U \mid X}(u \mid x) = \frac{f_v(q(u) - a^\top x)}{
#' \mathbb{E}[f_v(q(u) - a^\top X_i)] },
#' }
#' where \eqn{q(u)} is computed via \code{qgraphon()}.
#'
#' @details
#' Internally, the function constructs a scalar evaluator
#' \code{scalar_conditional_density()} that computes the conditional density for a
#' single value of \code{u}.
#' A wrapper function \code{conditional_density()} then applies this scalar
#' function to each element of the input vector \code{u}.
#'
#' Note: the function relies on the existence of \code{qgraphon()},
#' \code{pgraphon()}, and \code{dgraphon()} in the user's environment.
#'
#' @examples
#' \dontrun{
#' a <- c(1, -0.5)
#' fv <- dnorm
#' Fv <- pnorm
#' X_matrix <- matrix(rnorm(200), ncol = 2)
#'
#' cond_dens <- create_cond_density(a, fv, Fv, X_matrix)
#' cond_dens(c(0.2, 0.5, 0.8), x = 1.0)
#' }
#'
#' @export
create_cond_density <- function(
a,
fv,
Fv,
X_matrix){
scalar_conditional_density <-function(u, x) {
q_a <- qgraphon(u, a, Fv, X_matrix)
num <- fv(q_a - x %*% a)
den <- dgraphon(q_a, a, fv, X_matrix)
num / den
}
conditional_density <- function(u, x) {
vapply(u, scalar_conditional_density, numeric(1), x = x)
}
return(conditional_density)
}