Duncan Temple Lang

University of California at Davis

Department of Statistics

We'll use the benchmark function from Luke's UseR! talk in 2010. The function is

p1 = 
function(x) {
  for(i in seq_along(x))
     x[i] <- x[i] + 1

  x
}

We want to "compile" this using LLVM.

We start by initializing the LLVM environment and creating a module.

library(Rllvm)

InitializeNativeTarget()

mod = Module("x plus 1")

Our native routine will take a double * and the length of the vector. It will return nothing, mutating the elements of the array in place.

ptrDouble = pointerType(DoubleType)
fun = Function("plus1", VoidType, c(x = ptrDouble, len = Int32Type), mod)

We get the parameter objects so we can refer to them.

params = getParameters(fun)

Now onto the body of the function. We'll need 4 blocks again (as in cumsum.Rdb) since we have a loop.

entry = Block(fun, "entry")
cond = Block(fun, "loopCond")
body = Block(fun, "loopBody")
ret = Block(fun, "return")

We'll start with the entry block and create and initialize some local variables. We create an IR builder first

ir = IRBuilder(entry)

So now the local variables.

iv = ir$createLocalVariable(Int32Type, "i")
xref = ir$createLocalVariable(ptrDouble, "x_addr")
len.ref = ir$createLocalVariable(Int32Type, "len_addr")

ir$createStore(0L, iv)
ir$createStore(params$x, xref)
ir$createStore(params$len, len.ref)

Now we jump/branch into the condition of the loop.

ir$createBr(cond)

So now onto the loop.

ir$setInsertPoint(cond)

We load i and the value of len and perform a comparison. We use this logical value to then branch to the body of the loop or the return block. This is the same as in the cumsum example.

a = ir$createLoad(iv)
b = ir$createLoad(len.ref)
ok = ir$createICmp(ICMP_SLT, a, b)
ir$createCondBr(ok, body, ret)

The return is simple as we just return without any value (i.e. void)

ir$setInsertPoint(ret)
ir$createRetVoid()

So now let's do the body of the loop

ir$setInsertPoint(body)

Again this is very similar to what we did in cumsum but simpler. We have to load x[i]

a = ir$createLoad(xref)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createLoad(ir$createGEP(a, idx))

Then add 1 to this:

tmp = ir$binOp(FAdd, 1.0, xi)

Next we store the result back in x[i].

x = ir$createLoad(xref)
i = ir$createLoad(iv)
i = ir$createSExt(i, 64L)
x = ir$createGEP(x, i)
ir$createStore(tmp, x)

That is the heart of the loop. We just need to increment i.

i = ir$createLoad(iv)
inc = ir$binOp(Add, i, 1L)
ir$createStore(inc, iv)

And loop back to the condition

ir$createBr(cond)

We'll now optimize this.

ee = ExecutionEngine(mod)
Optimize(mod, ee)

And now we will call this function

n = 1e7

ll = system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x})

x = rep(1, n)
interp = replicate(5, system.time({p1(x)}))
interp = apply(interp, 1, median)

library(compiler)
g = cmpfun(p1)
x = rep(1, n)
cmp = replicate(5, system.time({g(x)}))
cmp = apply(cmp, 1, median)

ll = replicate(10, system.time({x = rep(1, n); run(fun, x = x, n = length(x), .all = TRUE, .ee = ee)$x}))
ll = apply(ll, 1, median)

x = rep(1, n)
vec = replicate(10, system.time(x + 1))
vec = apply(vec, 1, median)

tmp = c(interp[1], cmp[1], ll[1], vec[1])
matrix(c(tmp, interp[1]/tmp), length(tmp), ,
        dimnames = list(c("Interpeted", "Byte Compiled", "Rllvm", "Vectorized"), c("Time", "Speedup")))

For n = 1e7, here are the timings for the different implementations:

Table�1.�

Method	Times	Speedup
Interpreted	29.35	1
Byte compiled	7.65	3.83
Rllvm	.41	70.3
Vectorized	.039	793.4

On OS X with R compiled with -O3 optimization (and no -g for debugging)

Table�2.�

�	Time	Speedup
Interpeted	26.1	1
Byte Compiled	7.5	3.48
Rllvm	0.416	62.6
Vectorized	0.0365	715