Duncan Temple Lang

University of California at Davis

Department of Statistics


We'll use the benchmark function from Luke's UseR! talk in 2010. The function is

p1 = 
function(x) {
  for(i in seq_along(x))
     x[i] <- x[i] + 1

  x
}


We want to "compile" this using LLVM.

We start by initializing the LLVM environment and creating a module.

+
library(Rllvm)

InitializeNativeTarget()

+
mod = Module("x plus 1")

Our native routine will take a double * and the length of the vector. It will return nothing, mutating the elements of the array in place.

+
ptrDouble = pointerType(DoubleType)
fun = Function("plus1", VoidType, c(x = ptrDouble, len = Int32Type), mod)

We get the parameter objects so we can refer to them.

+
params = getParameters(fun)

Now onto the body of the function. We'll need 4 blocks again (as in cumsum.Rdb) since we have a loop.

+
entry = Block(fun, "entry")
cond = Block(fun, "loopCond")
body = Block(fun, "loopBody")
ret = Block(fun, "return")

We'll start with the entry block and create and initialize some local variables. We create an IR builder first

+
ir = IRBuilder(entry)

So now the local variables.

+
iv = ir$createLocalVariable(Int32Type, "i")
xref = ir$createLocalVariable(ptrDouble, "x_addr")
len.ref = ir$createLocalVariable(Int32Type, "len_addr")

+
ir$createStore(0L, iv)
ir$createStore(params$x, xref)
ir$createStore(params$len, len.ref)

Now we jump/branch into the condition of the loop.

+
ir$createBr(cond)

So now onto the loop.

+
ir$setInsertPoint(cond)

We load i and the value of len and perform a comparison. We use this logical value to then branch to the body of the loop or the return block. This is the same as in the cumsum example.

+
a = ir$createLoad(iv)
b = ir$createLoad(len.ref)
ok = ir$createICmp(ICMP_SLT, a, b)
ir$createCondBr(ok, body, ret)

The return is simple as we just return without any value (i.e. void)

+
ir$setInsertPoint(ret)
ir$createRetVoid()

So now let's do the body of the loop

+
ir$setInsertPoint(body)

Again this is very similar to what we did in cumsum but simpler. We have to load x[i]

+
a = ir$createLoad(xref)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createLoad(ir$createGEP(a, idx))

Then add 1 to this:

+
tmp = ir$binOp(FAdd, 1.0, xi)

Next we store the result back in x[i].

+
x = ir$createLoad(xref)
i = ir$createLoad(iv)
i = ir$createSExt(i, 64L)
x = ir$createGEP(x, i)
ir$createStore(tmp, x)

That is the heart of the loop. We just need to increment i.

+
i = ir$createLoad(iv)
inc = ir$binOp(Add, i, 1L)
ir$createStore(inc, iv)

And loop back to the condition

+
ir$createBr(cond)

We'll now optimize this.

+
ee = ExecutionEngine(mod)
Optimize(mod, ee)

And now we will call this function

+
n = 1e7

+
ll = system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x})

+
x = rep(1, n)
interp = replicate(5, system.time({p1(x)}))
interp = apply(interp, 1, median)

+
library(compiler)
g = cmpfun(p1)
x = rep(1, n)
cmp = replicate(5, system.time({g(x)}))
cmp = apply(cmp, 1, median)

+
ll = replicate(10, system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x}))
ll = apply(ll, 1, median)

+
x = rep(1, n)
vec = replicate(10, system.time(x + 1))
vec = apply(vec, 1, median)

+
tmp = c(interp[1], cmp[1], ll[1], vec[1])
matrix(c(tmp, interp[1]/tmp), length(tmp), ,
        dimnames = list(c("Interpeted", "Byte Compiled", "Rllvm", "Vectorized"), c("Time", "Speedup")))

For n = 1e7, here are the timings for the different implementations:

Table 1. 

MethodTimesSpeedup
Interpreted29.351
Byte compiled7.653.83
Rllvm.4170.3
Vectorized.039793.4



On OS X with R compiled with -O3 optimization (and no -g for debugging)

Table 2. 

 TimeSpeedup
Interpeted26.11
Byte Compiled7.53.48
Rllvm0.41662.6
Vectorized0.0365715