We'll use the benchmark function from Luke's UseR! talk in 2010. The function is
p1 = function(x) { for(i in seq_along(x)) x[i] <- x[i] + 1 x }
We want to "compile" this using LLVM.
We start by initializing the LLVM environment and creating a module.
library(Rllvm) InitializeNativeTarget()
mod = Module("x plus 1")
Our native routine will take a double * and the length of the vector. It will return nothing, mutating the elements of the array in place.
ptrDouble = pointerType(DoubleType) fun = Function("plus1", VoidType, c(x = ptrDouble, len = Int32Type), mod)
We get the parameter objects so we can refer to them.
params = getParameters(fun)
Now onto the body of the function. We'll need 4 blocks again (as in cumsum.Rdb) since we have a loop.
entry = Block(fun, "entry") cond = Block(fun, "loopCond") body = Block(fun, "loopBody") ret = Block(fun, "return")
We'll start with the entry block and create and initialize some local variables. We create an IR builder first
ir = IRBuilder(entry)
So now the local variables.
iv = ir$createLocalVariable(Int32Type, "i") xref = ir$createLocalVariable(ptrDouble, "x_addr") len.ref = ir$createLocalVariable(Int32Type, "len_addr")
ir$createStore(0L, iv) ir$createStore(params$x, xref) ir$createStore(params$len, len.ref)
Now we jump/branch into the condition of the loop.
ir$createBr(cond)
So now onto the loop.
ir$setInsertPoint(cond)
We load i and the value of len and perform a comparison. We use this logical value to then branch to the body of the loop or the return block. This is the same as in the cumsum example.
a = ir$createLoad(iv) b = ir$createLoad(len.ref) ok = ir$createICmp(ICMP_SLT, a, b) ir$createCondBr(ok, body, ret)
The return is simple as we just return without any value (i.e. void)
ir$setInsertPoint(ret) ir$createRetVoid()
So now let's do the body of the loop
ir$setInsertPoint(body)
Again this is very similar to what we did in cumsum but simpler. We have to load x[i]
a = ir$createLoad(xref) i = ir$createLoad(iv) idx = ir$createSExt(i, 64L) xi = ir$createLoad(ir$createGEP(a, idx))
Then add 1 to this:
tmp = ir$binOp(FAdd, 1.0, xi)
Next we store the result back in x[i].
x = ir$createLoad(xref) i = ir$createLoad(iv) i = ir$createSExt(i, 64L) x = ir$createGEP(x, i) ir$createStore(tmp, x)
That is the heart of the loop. We just need to increment i.
i = ir$createLoad(iv) inc = ir$binOp(Add, i, 1L) ir$createStore(inc, iv)
And loop back to the condition
ir$createBr(cond)
We'll now optimize this.
ee = ExecutionEngine(mod) Optimize(mod, ee)
And now we will call this function
n = 1e7
ll = system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x})
x = rep(1, n) interp = replicate(5, system.time({p1(x)})) interp = apply(interp, 1, median)
library(compiler) g = cmpfun(p1) x = rep(1, n) cmp = replicate(5, system.time({g(x)})) cmp = apply(cmp, 1, median)
ll = replicate(10, system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x})) ll = apply(ll, 1, median)
x = rep(1, n) vec = replicate(10, system.time(x + 1)) vec = apply(vec, 1, median)
tmp = c(interp[1], cmp[1], ll[1], vec[1]) matrix(c(tmp, interp[1]/tmp), length(tmp), , dimnames = list(c("Interpeted", "Byte Compiled", "Rllvm", "Vectorized"), c("Time", "Speedup")))
For n = 1e7, here are the timings for the different implementations:
On OS X with R compiled with -O3 optimization (and no -g for debugging)