## ----setup, include = FALSE---------------------------------------------------
# knitr chunk defaults for the rendered vignette: interleave output with code
# ("collapse"), prefix printed results with "#>", and hide warnings/messages.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE,
  message = FALSE,
  echo = TRUE,
  eval = TRUE
)

## -----------------------------------------------------------------------------
library(fastglm)
# Fixed seed: all simulated data below (and hence the printed results) are
# reproducible, and the statement order of the rnorm/rbinom calls matters.
set.seed(1)

## -----------------------------------------------------------------------------
library(Matrix)

n <- 5000

## a 50-level factor that interacts with a continuous covariate.
g <- factor(sample.int(50, n, replace = TRUE))
xnum <- rnorm(n)
Xd <- model.matrix(~ g * xnum)  # 5000 x 100, mostly zeros
# Same design matrix in compressed sparse column form (modern Matrix
# coercion; replaces the deprecated as(., "dgCMatrix") idiom).
Xs <- as(Xd, "CsparseMatrix")

# Compare the memory footprint of the dense vs sparse representations.
object.size(Xd)
object.size(Xs)

beta_true <- rnorm(ncol(Xd)) / 4
y <- rbinom(n, 1, plogis(Xd %*% beta_true))

## fit using the dense and sparse representations
fit_dense <- fastglm(Xd, y, family = binomial(), method = 2)
fit_sparse <- fastglm(Xs, y, family = binomial(), method = 2)

# The two fits should agree to numerical precision.
max(abs(coef(fit_sparse) - coef(fit_dense)))

## ----error = TRUE-------------------------------------------------------------
# Demonstrates that method = 0 is rejected for sparse input; try() lets the
# vignette show the error without stopping.
try({
  fastglm(Xs, y, family = binomial(), method = 0)
})

## -----------------------------------------------------------------------------
library(bigmemory)

n <- 20000
p <- 20
X <- matrix(rnorm(n * p), n, p)
Xb <- as.big.matrix(X)  # in-memory big.matrix; could equally be filebacked
y <- rbinom(n, 1, plogis(X %*% rnorm(p) * 0.05))

fit_dense <- fastglm(X, y, family = binomial(), method = 2)
fit_big <- fastglm(Xb, y, family = binomial(), method = 2)
# big.matrix and plain-matrix fits should agree to numerical precision.
max(abs(coef(fit_big) - coef(fit_dense)))

## -----------------------------------------------------------------------------
# Presumably FASTGLM_CHUNK_ROWS controls how many rows fastglm processes per
# block when reading from a big.matrix -- TODO confirm against the package
# docs.  The env var is unset again afterwards so later chunks use defaults.
Sys.setenv(FASTGLM_CHUNK_ROWS = "2000")
fit_big_small <- fastglm(Xb, y, family = binomial(), method = 2)
Sys.unsetenv("FASTGLM_CHUNK_ROWS")
# Changing the chunk size must not change the estimates.
max(abs(coef(fit_big_small) - coef(fit_big)))

## ----error = TRUE-------------------------------------------------------------
# As with sparse input, method = 0 is expected to error for big.matrix input.
try({
  fastglm(Xb, y, family = binomial(), method = 0)
})
## -----------------------------------------------------------------------------
n <- 10000
p <- 5
X <- cbind(1, matrix(rnorm(n * (p - 1)), n, p - 1))  # intercept + 4 covariates
y <- rbinom(n, 1, plogis(X %*% c(0.2, 0.5, -0.3, 0.4, -0.2)))

## simulate a "data source" that yields the design matrix in 5 row-blocks
chunk_size <- 2000
chunks <- function(k) {
  # Row indices of the k-th contiguous block (assumes n divides evenly by
  # chunk_size, which holds here: 5 * 2000 = 10000).
  idx <- ((k - 1) * chunk_size + 1):(k * chunk_size)
  # drop = FALSE keeps X a matrix even if a block ever had a single row.
  list(X = X[idx, , drop = FALSE], y = y[idx])
}

fit_stream <- fastglm_streaming(chunks, n_chunks = 5, family = binomial())
fit_full <- fastglm(X, y, family = binomial(), method = 2)
# Streaming and in-memory fits should agree to numerical precision.
max(abs(coef(fit_stream) - coef(fit_full)))

## -----------------------------------------------------------------------------
# Poisson example with an offset and prior weights, again split into blocks.
n <- 8000
X <- cbind(1, matrix(rnorm(n * 3), n, 3))
ofs <- runif(n, -0.1, 0.1)   # known offset on the log scale
pw <- runif(n, 0.5, 1.5)     # prior observation weights
y <- rpois(n, exp(X %*% c(0.2, 0.4, -0.2, 0.3) + ofs))

chunk_size <- 1000
chunks <- function(k) {
  idx <- ((k - 1) * chunk_size + 1):(k * chunk_size)
  # Each block carries its own slice of the offset and weights alongside
  # the design matrix and response.
  list(X = X[idx, , drop = FALSE],
       y = y[idx],
       offset = ofs[idx],
       weights = pw[idx])
}

fit_stream <- fastglm_streaming(chunks, n_chunks = 8, family = poisson())
fit_full <- fastglm(X, y, family = poisson(), offset = ofs, weights = pw,
                    method = 2)
max(abs(coef(fit_stream) - coef(fit_full)))

## ----eval = FALSE-------------------------------------------------------------
# Sketch (not run): stream record batches from a Parquet file via arrow and
# build each design-matrix block with model.matrix().
# library(arrow)
# ds <- open_dataset("path/to/data.parquet")
# batches <- as.list(Scanner$create(ds)$ScanBatches())
#
# chunks <- function(k) {
#   tbl <- as.data.frame(batches[[k]])
#   list(X = model.matrix(~ x1 + x2 + x3, data = tbl),
#        y = tbl$y)
# }
#
# fit <- fastglm_streaming(chunks,
#                          n_chunks = length(batches),
#                          family = binomial())