This article measures two different costs:
The comparison target is the callme
package, which builds ordinary .Call() entry points with
R CMD SHLIB. That means it goes through the platform
compiler toolchain (gcc/clang on the usual
Unix-like targets), so we should expect stronger optimization than
TinyCC for steady-state machine code. That does not make the comparison
useless, but it does mean the runtime results combine two effects:
.Call() entry points and direct R C API
allocation in callme

The point is not that the two packages expose identical APIs. They do not. Instead, the comparison asks a narrower question:
We use three small workloads:
- noop(): takes nothing, returns nothing
- fill_rand(out, n): fills a caller-provided numeric
  buffer in place
- rand_unif(n): generates n random
  doubles

The fill_rand() case is the fairer array-oriented
comparison:
- Rtinycc receives a numeric_array, so the
  wrapper borrows the backing REAL() storage of the R vector
  directly
- callme takes an R numeric vector and writes into
  REAL(vec) directly

The rand_unif() case intentionally stresses the extra
copy path:
- callme allocates the final R vector directly with the R
  C API
- Rtinycc returns a heap-allocated double*,
  and the generated wrapper copies that buffer into a fresh R numeric
  vector before freeing the original C allocation

#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
#include <stdlib.h>
/* Does nothing and returns nothing; exists so the benchmark can time a bare call. */
void noop(void) {}
/*
 * Fill the first n elements of `out` with values from unif_rand().
 *
 * out: destination buffer, written in place.
 * n:   number of elements to write; a negative value raises an R error.
 *
 * NOTE(review): this function receives only a raw pointer, so it cannot
 * check that `out` really holds at least n doubles; the caller must
 * guarantee that.
 */
void fill_rand(double* out, int n) {
if (n < 0) {
Rf_error("n must be non-negative");
}
/* Bracket the unif_rand() calls with GetRNGstate()/PutRNGstate(). */
GetRNGstate();
for (int i = 0; i < n; ++i) {
out[i] = unif_rand();
}
PutRNGstate();
}
/*
 * Return a malloc()-allocated buffer holding n values from unif_rand().
 *
 * n: number of values to generate; a negative value raises an R error.
 *
 * The buffer is not freed here. The binding used in this article declares
 * the return as a numeric_array with free = TRUE, and the generated wrapper
 * shown later copies the buffer into an R vector and then free()s it.
 */
double* rand_unif(int n) {
if (n < 0) {
Rf_error("n must be non-negative");
}
if (n == 0) {
/* Allocates room for one double rather than zero bytes; presumably so a
 * zero-length request still yields a non-NULL pointer -- TODO confirm.
 * NOTE(review): unlike the branch below, this result is not NULL-checked. */
return (double*) malloc(sizeof(double));
}
double *out = (double*) malloc(sizeof(double) * (size_t) n);
if (!out) {
Rf_error("malloc failed");
}
/* Bracket the unif_rand() calls with GetRNGstate()/PutRNGstate(). */
GetRNGstate();
for (int i = 0; i < n; ++i) {
out[i] = unif_rand();
}
PutRNGstate();
return out;
}
# C source handed to Rtinycc via tcc_source(): plain C functions (noop,
# fill_rand, rand_unif) that take and return native C types, not SEXPs.
rtinycc_code <- "#include <R.h>\n#include <Rinternals.h>\n#include <Rmath.h>\n#include <stdlib.h>\n\nvoid noop(void) {}\n\nvoid fill_rand(double* out, int n) {\n if (n < 0) {\n Rf_error(\"n must be non-negative\");\n }\n\n GetRNGstate();\n for (int i = 0; i < n; ++i) {\n out[i] = unif_rand();\n }\n PutRNGstate();\n}\n\ndouble* rand_unif(int n) {\n if (n < 0) {\n Rf_error(\"n must be non-negative\");\n }\n if (n == 0) {\n return (double*) malloc(sizeof(double));\n }\n\n double *out = (double*) malloc(sizeof(double) * (size_t) n);\n if (!out) {\n Rf_error(\"malloc failed\");\n }\n\n GetRNGstate();\n for (int i = 0; i < n; ++i) {\n out[i] = unif_rand();\n }\n PutRNGstate();\n return out;\n}"
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
/* .Call() entry point that does no work and returns R's NULL. */
SEXP noop(void) {
return R_NilValue;
}
/*
 * .Call() entry point: overwrite the first n elements of the numeric
 * vector out_ with values from unif_rand() and return out_ itself.
 *
 * out_: must be a REALSXP with at least n elements.
 * n_:   converted with asInteger(); a negative result raises an R error.
 *
 * Raises an R error when out_ is not a REALSXP or is shorter than n.
 */
SEXP fill_rand(SEXP out_, SEXP n_) {
int n = asInteger(n_);
if (n < 0) {
Rf_error("n must be non-negative");
}
if (TYPEOF(out_) != REALSXP) {
Rf_error("out must be a numeric vector");
}
if (XLENGTH(out_) < n) {
Rf_error("out is shorter than n");
}
/* Write straight into the vector's own storage; no copy is made. */
double *out = REAL(out_);
/* Bracket the unif_rand() calls with GetRNGstate()/PutRNGstate(). */
GetRNGstate();
for (int i = 0; i < n; ++i) {
out[i] = unif_rand();
}
PutRNGstate();
return out_;
}
/*
 * .Call() entry point: return a new numeric vector of length n filled
 * with values from unif_rand().
 *
 * n_: converted with asInteger(); a negative result raises an R error.
 */
SEXP rand_unif(SEXP n_) {
int n = asInteger(n_);
if (n < 0) {
Rf_error("n must be non-negative");
}
/* The result vector is allocated up front and stays protected while it
 * is being filled; values are written directly into it. */
SEXP out = PROTECT(allocVector(REALSXP, n));
double *ptr = REAL(out);
/* Bracket the unif_rand() calls with GetRNGstate()/PutRNGstate(). */
GetRNGstate();
for (int i = 0; i < n; ++i) {
ptr[i] = unif_rand();
}
PutRNGstate();
UNPROTECT(1);
return out;
}
callme_code <- "#include <R.h>\n#include <Rinternals.h>\n#include <Rmath.h>\n\nSEXP noop(void) {\n return R_NilValue;\n}\n\nSEXP fill_rand(SEXP out_, SEXP n_) {\n int n = asInteger(n_);\n if (n < 0) {\n Rf_error(\"n must be non-negative\");\n }\n\n if (TYPEOF(out_) != REALSXP) {\n Rf_error(\"out must be a numeric vector\");\n }\n\n if (XLENGTH(out_) < n) {\n Rf_error(\"out is shorter than n\");\n }\n\n double *out = REAL(out_);\n GetRNGstate();\n for (int i = 0; i < n; ++i) {\n out[i] = unif_rand();\n }\n PutRNGstate();\n\n return out_;\n}\n\nSEXP rand_unif(SEXP n_) {\n int n = asInteger(n_);\n if (n < 0) {\n Rf_error(\"n must be non-negative\");\n }\n\n SEXP out = PROTECT(allocVector(REALSXP, n));\n double *ptr = REAL(out);\n\n GetRNGstate();\n for (int i = 0; i < n; ++i) {\n ptr[i] = unif_rand();\n }\n PutRNGstate();\n\n UNPROTECT(1);\n return out;\n}"

# Compile `rtinycc_code` with Rtinycc and bind its three functions.
#
# Bindings:
#   noop      - no arguments, void return
#   fill_rand - (numeric_array, i32), void return
#   rand_unif - (i32), returns a numeric_array whose length is taken from
#               argument 1 and whose native buffer is freed (free = TRUE)
#
# Returns whatever tcc_compile() returns for the bound recipe.
build_rtinycc_module <- function() {
  noop_spec <- list(args = list(), returns = "void")
  fill_rand_spec <- list(args = list("numeric_array", "i32"), returns = "void")
  rand_unif_spec <- list(
    args = list("i32"),
    returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
  )

  recipe <- tcc_source(tcc_ffi(), rtinycc_code)
  recipe <- tcc_bind(
    recipe,
    noop = noop_spec,
    fill_rand = fill_rand_spec,
    rand_unif = rand_unif_spec
  )
  tcc_compile(recipe)
}
# Compile `callme_code` with callme and remember which DLLs that loaded.
#
# The loaded-DLL names are snapshotted before compiling. Any DLL that appears
# afterwards and whose name starts with "callme_" has its path stored in the
# "dll_paths" attribute of the result, so it can be unloaded later.
#
# Returns the value of callme::compile() carrying the "dll_paths" attribute
# (an unnamed character vector, possibly empty).
build_callme_module <- function() {
  loaded_before <- names(getLoadedDLLs())
  module <- callme::compile(callme_code, env = NULL, verbosity = 0)

  loaded_after <- getLoadedDLLs()
  fresh <- setdiff(names(loaded_after), loaded_before)
  fresh <- fresh[startsWith(fresh, "callme_")]

  fresh_paths <- vapply(
    loaded_after[fresh],
    function(dll) dll[["path"]],
    character(1)
  )
  attr(module, "dll_paths") <- unname(fresh_paths)
  module
}
# Best-effort unload of the shared libraries named in `dll_paths`.
#
# Paths are de-duplicated and processed in reverse of the given order. An
# entry is only passed to dyn.unload() when it is a non-empty character
# string naming an existing file; dyn.unload() failures are deliberately
# swallowed. Always returns NULL invisibly.
unload_callme_dlls <- function(dll_paths) {
  dll_paths <- rev(unique(dll_paths))
  if (is.null(dll_paths) || length(dll_paths) == 0L) {
    return(invisible(NULL))
  }

  is_unloadable <- function(path) {
    is.character(path) && nzchar(path) && file.exists(path)
  }

  for (path in dll_paths) {
    if (!is_unloadable(path)) {
      next
    }
    try(dyn.unload(path), silent = TRUE)
  }
  invisible(NULL)
}
# Build the callme module once and tear it down again straight away.
#
# Used where only the build itself matters (probing and compile timing).
# Returns NULL invisibly.
build_and_dispose_callme_module <- function() {
  module <- build_callme_module()
  loaded_paths <- attr(module, "dll_paths", exact = TRUE)

  # Drop the module and collect before unloading; presumably so nothing still
  # refers to the DLL when it is unloaded -- TODO confirm.
  rm(list = "module")
  gc()

  unload_callme_dlls(loaded_paths)
  invisible(NULL)
}
# Decide whether the executable comparisons can run, and record why or why not.
#
# Sets:
#   can_run_callme        - TRUE only when callme is installed, the platform is
#                           not Windows, and a trial build succeeds
#   can_run_benchmarks    - can_run_callme and bench is installed
#   callme_runtime_reason - human-readable status message
callme_runtime_reason <- NULL
can_run_callme <- FALSE

if (!has_callme) {
  callme_runtime_reason <- "`callme` is not installed."
} else if (.Platform$OS.type == "windows") {
  callme_runtime_reason <- paste(
    "`callme` comparisons are skipped on Windows during vignette builds",
    "because the helper DLL compilation step is not reliable in CI."
  )
} else {
  # Trial build: NULL on success, the condition object on failure.
  callme_probe <- tryCatch(
    {
      build_and_dispose_callme_module()
      NULL
    },
    error = function(err) err
  )
  can_run_callme <- !inherits(callme_probe, "error")
  if (!can_run_callme) {
    callme_runtime_reason <- paste(
      "`callme` comparisons were skipped because runtime compilation failed:",
      conditionMessage(callme_probe)
    )
  }
}

can_run_benchmarks <- can_run_callme && has_bench

# A callme-related reason, when present, takes precedence over the bench one.
if (is.null(callme_runtime_reason)) {
  callme_runtime_reason <- if (has_bench) {
    "Executable comparisons are enabled."
  } else {
    "`bench` is not installed."
  }
}
# Build both modules, call `fun(rt_mod, cm_mod)` with them, and clean up.
#
# fun: a function of two arguments (the Rtinycc module, the callme module).
#
# On exit, including on error inside `fun`, the local module references are
# removed, gc() is run, and the callme DLLs recorded at build time are
# unloaded. Returns the value of `fun`.
with_benchmark_modules <- function(fun) {
  tinycc_module <- build_rtinycc_module()
  callme_module <- build_callme_module()
  callme_dlls <- attr(callme_module, "dll_paths", exact = TRUE)

  on.exit(
    {
      rm(list = c("tinycc_module", "callme_module"))
      gc()
      unload_callme_dlls(callme_dlls)
    },
    add = TRUE
  )

  fun(tinycc_module, callme_module)
}
# Median elapsed time, in seconds, of evaluating `expr` `times` times.
#
# expr:  an expression; captured unevaluated and re-evaluated in the caller's
#        environment on every repetition.
# times: number of repetitions.
#
# gc() runs before each repetition and outside the timed region.
median_elapsed <- function(expr, times = 3L) {
  captured <- substitute(expr)
  caller <- parent.frame()

  time_once <- function() {
    gc()
    started <- proc.time()[["elapsed"]]
    eval(captured, envir = caller)
    proc.time()[["elapsed"]] - started
  }

  stats::median(replicate(times, time_once()))
}
# Call `fun()` with no arguments `n` times, discarding every result.
# Returns NULL invisibly.
run_noop <- function(fun, n) {
  for (iteration in seq_len(n)) fun()
  invisible(NULL)
}
# Call `fun(n)` `reps` times, discarding every result.
# Returns NULL invisibly.
run_rand <- function(fun, n, reps) {
  for (repetition in seq_len(reps)) {
    invisible(fun(n))
  }
  invisible(NULL)
}
# Call `fun(buffer, n)` `reps` times, each time with a freshly allocated
# numeric vector of length `n`; results are discarded.
# Returns NULL invisibly.
run_fill <- function(fun, n, reps) {
  for (repetition in seq_len(reps)) {
    buffer <- numeric(n)
    invisible(fun(buffer, n))
  }
  invisible(NULL)
}
# The same source and bindings as build_rtinycc_module(), stopped before
# tcc_compile() so the uncompiled recipe can be inspected.
rtinycc_recipe <- tcc_bind(
  tcc_source(tcc_ffi(), rtinycc_code),
  noop = list(args = list(), returns = "void"),
  fill_rand = list(args = list("numeric_array", "i32"), returns = "void"),
  rand_unif = list(
    args = list("i32"),
    returns = list(type = "numeric_array", length_arg = 1, free = TRUE)
  )
)
# Ask Rtinycc's code generator for the wrapper source belonging to
# `rtinycc_recipe`, passing each recipe field through unchanged.
# Presumably the result is C source text; it is shown later in the article.
# NOTE(review): generate_ffi_code() is reached via `:::`, i.e. it is not part
# of Rtinycc's exported API, so its argument list may change between versions.
generated_code <- Rtinycc:::generate_ffi_code(
symbols = rtinycc_recipe$symbols,
headers = rtinycc_recipe$headers,
c_code = rtinycc_recipe$c_code,
# FALSE here; presumably marks the code as compiled from source rather than
# bound from an external library -- TODO confirm.
is_external = FALSE,
structs = rtinycc_recipe$structs,
unions = rtinycc_recipe$unions,
enums = rtinycc_recipe$enums,
globals = rtinycc_recipe$globals,
container_of = rtinycc_recipe$container_of,
field_addr = rtinycc_recipe$field_addr,
struct_raw_access = rtinycc_recipe$struct_raw_access,
introspect = rtinycc_recipe$introspect
)
If callme or bench is unavailable, or if
the current build environment cannot compile the temporary
callme helper DLL, the executable comparisons below are
skipped.
Current comparison status:
This measures module build time, not call time.
# Median build time of each implementation over three builds, reported in
# seconds and in milliseconds rounded to one decimal. The callme build is
# disposed of after every repetition.
compile_times <- data.frame(
  implementation = c("Rtinycc", "callme"),
  seconds = c(
    median_elapsed(build_rtinycc_module(), times = 3L),
    median_elapsed(build_and_dispose_callme_module(), times = 3L)
  )
)
compile_times[["milliseconds"]] <- round(1000 * compile_times[["seconds"]], 1)
compile_times
#> implementation seconds milliseconds
#> 1 Rtinycc 0.011 11
#> 2 callme 0.207 207

The expected pattern is:

- Rtinycc wins clearly on tiny compile latency because it
  stays in-process and does not shell out to R CMD SHLIB
- callme pays the ordinary shared-library toolchain
  cost

The generated code makes the extra return-path work explicit. In
particular, the rand_unif() wrapper allocates an R vector,
memcpy()s the native double* buffer into it,
then free()s the original buffer. In contrast,
fill_rand() uses the borrowed numeric_array
input path.
/* TinyCC workaround: _Complex not supported */
#define _Complex
#include <R.h>
#include <Rinternals.h>
#ifndef STRING_PTR_RO
#define STRING_PTR_RO STRING_PTR
#endif
void RC_free_finalizer(SEXP ext);
void RC_owned_native_finalizer(SEXP ext);
SEXP RC_make_borrowed_view(void *ptr, SEXP tag, SEXP owner);
SEXP RC_make_unowned_ptr(void *ptr, SEXP tag);
SEXP RC_make_owned_ptr(void *ptr, SEXP tag);
SEXP RC_make_owned_composite_ptr(void *ptr, SEXP tag);
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <limits.h>
#include <math.h>
#include <string.h>
/* User code */
#include <R.h>
#include <Rinternals.h>
#include <Rmath.h>
#include <stdlib.h>
void noop(void) {}
void fill_rand(double* out, int n) {
if (n < 0) {
Rf_error("n must be non-negative");
}
GetRNGstate();
for (int i = 0; i < n; ++i) {
out[i] = unif_rand();
}
PutRNGstate();
}
double* rand_unif(int n) {
if (n < 0) {
Rf_error("n must be non-negative");
}
if (n == 0) {
return (double*) malloc(sizeof(double));
}
double *out = (double*) malloc(sizeof(double) * (size_t) n);
if (!out) {
Rf_error("malloc failed");
}
GetRNGstate();
for (int i = 0; i < n; ++i) {
out[i] = unif_rand();
}
PutRNGstate();
return out;
}
/* R callable wrappers for bound symbols */
SEXP R_wrap_noop(void) {
// No arguments
// Call and return
noop();
return R_NilValue;
}
SEXP R_wrap_fill_rand(SEXP arg1_, SEXP arg2_) {
if (TYPEOF(arg1_) != REALSXP) Rf_error("expected numeric vector for argument 'arg1'");
double* arg1 = REAL(arg1_);
int _arg2 = asInteger(arg2_);
if (_arg2 == NA_INTEGER) Rf_error("integer value is NA");
if (_arg2 < INT32_MIN || _arg2 > INT32_MAX) Rf_error("i32 out of range");
int32_t arg2 = (int32_t)_arg2;
// Call and return
fill_rand(arg1, arg2);
return R_NilValue;
}
SEXP R_wrap_rand_unif(SEXP arg1_) {
int _arg1 = asInteger(arg1_);
if (_arg1 == NA_INTEGER) Rf_error("integer value is NA");
if (_arg1 < INT32_MIN || _arg1 > INT32_MAX) Rf_error("i32 out of range");
int32_t arg1 = (int32_t)_arg1;
// Call and return
double* __rtinycc_ret = rand_unif(arg1);
if (!__rtinycc_ret) return R_NilValue;
SEXP out = PROTECT(allocVector(REALSXP, arg1));
if (arg1 > 0) memcpy(REAL(out), __rtinycc_ret, sizeof(double) * arg1);
if (__rtinycc_ret) free(__rtinycc_ret);
UNPROTECT(1);
return out;
}

## noop() Call Overhead

This is the smallest useful call path. It approximates the lower
bound on call overhead above a plain .Call() entry
point.
# Time 1000 back-to-back noop() calls per iteration for each implementation.
# check = TRUE is safe because both expressions return NULL.
noop_bench <- with_benchmark_modules(function(tinycc_module, callme_module) {
  calls_per_iteration <- 1000L
  bench::mark(
    Rtinycc = run_noop(tinycc_module$noop, calls_per_iteration),
    callme = run_noop(callme_module$noop, calls_per_iteration),
    iterations = 20,
    check = TRUE,
    memory = TRUE,
    filter_gc = FALSE
  )
})
noop_bench
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 Rtinycc 638µs 680µs 1471. 13KB 0
#> 2 callme 251µs 255µs 3811. 0B 0

Interpretation:

- The callme path is close to the cost of a conventional
  .Call() wrapper
- The Rtinycc path adds the generated wrapper layer and
  external-pointer call target
- check = TRUE is appropriate here because both
  expressions always return NULL
- bench also exposes allocation and GC differences
  directly, which is useful for understanding the cost of boxing and
  copying

## fill_rand(out, n) And Zero-Copy Arrays

This is the fairer vector comparison because both implementations fill an existing R numeric vector instead of returning a newly allocated result.
# Per iteration: 100 fills of a fresh 4096-element numeric vector. Result
# checking is off because the filled values are random.
fill_bench_n4096 <- with_benchmark_modules(function(tinycc_module, callme_module) {
  vector_length <- 4096L
  fills_per_iteration <- 100L
  bench::mark(
    Rtinycc = run_fill(tinycc_module$fill_rand, vector_length, fills_per_iteration),
    callme = run_fill(callme_module$fill_rand, vector_length, fills_per_iteration),
    iterations = 20,
    check = FALSE,
    memory = TRUE,
    filter_gc = FALSE
  )
})
fill_bench_n4096
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 Rtinycc 1.72ms 2.66ms 363. 3.15MB 18.2
#> 2 callme 1.39ms 1.44ms 615. 3.13MB 30.7

Interpretation:

## rand_unif(n) And Copy Cost

Here the implementation work is still small, but the return path differs:

- callme fills the final R vector directly
- Rtinycc fills a native buffer, then the wrapper copies
  into a fresh R vector

We time both a tiny and a larger return size.
# Benchmark rand_unif() at a tiny (n = 1, 1000 calls per iteration) and a
# larger (n = 4096, 100 calls per iteration) return size, using one pair of
# modules for both runs. Result checking is off because outputs are random.
rand_results <- with_benchmark_modules(function(tinycc_module, callme_module) {
  # One bench::mark() run for a given return size and call count.
  mark_rand <- function(n, reps) {
    bench::mark(
      Rtinycc = run_rand(tinycc_module$rand_unif, n, reps),
      callme = run_rand(callme_module$rand_unif, n, reps),
      iterations = 20,
      check = FALSE,
      memory = TRUE,
      filter_gc = FALSE
    )
  }

  small <- mark_rand(1L, 1000L)
  large <- mark_rand(4096L, 100L)
  list(rand_bench_n1 = small, rand_bench_n4096 = large)
})
rand_results$rand_bench_n1
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 Rtinycc 883µs 974µs 800. 15.4KB 40.0
#> 2 callme 479µs 486µs 2027. 0B 0
rand_results$rand_bench_n4096
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 Rtinycc 1.68ms 2.42ms 373. 3.13MB 18.7
#> 2 callme 1.16ms 1.95ms 480. 3.13MB 24.0

The usual pattern is:

- For fill_rand(), the comparison is much closer to
  Rtinycc’s intended array-oriented usage
- For n = 1, wrapper overhead and return-path mechanics
  dominate
- For larger n, the copy still matters, but more of the
  time is spent in the actual loop and RNG generation

The benchmark gives a reasonable mental model:
- Rtinycc is optimized for low compilation latency and
  direct interactive use
- A conventional .Call()
  entry point has lower overhead
- When Rtinycc must copy returned buffers into R vectors,
  that copy is real and measurable
- callme is using the system compiler, while
  Rtinycc is using TinyCC

So the package is usually strongest when:

It is less ideal when:

- you want a .Call() entry point that writes its
  final result straight into R-managed objects