Here we implement the initial version of Ross Ihaka's 2D random walk example of how to improve the performance of R code. His example starts with the naive and obvious implementation of the 2D walk. At the end, he vectorizes this in R code and gets a speedup of a factor of about 200.
The naive implementation is
rw2d1 =
function(n = 100)
{
  xpos = ypos = numeric(n)
  for(i in 2:n) {
      # Decide whether we are moving horizontally or vertically.
    delta = if(runif(1) > .5) 1 else -1
    if (runif(1) > .5) {
      xpos[i] = xpos[i-1] + delta
      ypos[i] = ypos[i-1]
    } else {
      xpos[i] = xpos[i-1]
      ypos[i] = ypos[i-1] + delta
    }
  }
  list(x = xpos, y = ypos)
}
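As a quick sanity check (not part of the timings below), we can run a short walk and plot the path:

w = rw2d1(1000)
plot(w$x, w$y, type = "l", xlab = "x", ylab = "y")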
We have seen how to write loops, access elements via pointers and do conditional branching in cumsum.Rdb, so "compiling" this function is very similar.
We are going to implement a slightly different version that receives the int * pointers for x and y and just fills them in. We can write a .Call() wrapper for this so that we can compare apples to apples.
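To make the target concrete, here is an R-level sketch of what the compiled routine will compute (the name rw2d_fill is ours, purely for illustration). Keep in mind that R copies its arguments, so this sketch has to return x and y, whereas the compiled routine will write directly into the memory it is given.

  # Illustrative R analogue of the routine we generate below (hypothetical name).
rw2d_fill =
function(x, y, len)
{
  for(i in 2:len) {
    delta = if(runif(1) > .5) 1L else -1L
    if(runif(1) > .5) {        # horizontal move
      x[i] = x[i-1] + delta
      y[i] = y[i-1]
    } else {                   # vertical move
      x[i] = x[i-1]
      y[i] = y[i-1] + delta
    }
  }
  list(x = x, y = y)
}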
library(Rllvm)

InitializeNativeTarget()

mod = Module("2Drw")
ptrInt = pointerType(Int32Type)
fun = Function("rw2d", VoidType, c(x = ptrInt, y = ptrInt, len = Int32Type), mod)
entry = Block(fun, "entry")
cond = Block(fun, "loopCond")
body = Block(fun, "loopBody")
ret = Block(fun, "return")
h = Block(fun, "Horizontal")
v = Block(fun, "Vertical")
increment = Block(fun, "increment")
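Before wiring these blocks together, it helps to keep in mind the control flow we are aiming for (a summary added here for orientation):

  #   entry      -> loopCond
  #   loopCond   -> loopBody   (while i < len), otherwise -> return
  #   loopBody   -> Horizontal or Vertical  (random choice)
  #   Horizontal -> increment
  #   Vertical   -> increment
  #   increment  -> loopCond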
one = createIntegerConstant(1L)
minusOne = createIntegerConstant(-1L)
ir = IRBuilder(entry)

  # local variables for the loop counter, the length and the two pointers.
iv = ir$createLocalVariable(Int32Type, "i")
lena = ir$createLocalVariable(Int32Type, "lenp")
xa = ir$createLocalVariable(ptrInt, "xp")
ya = ir$createLocalVariable(ptrInt, "yp")
delta = ir$createLocalVariable(Int32Type, "delta")

  # copy the incoming arguments into the locals, set i to 1
  # and jump to the loop condition.
ir$createStore(fun$x, xa)
ir$createStore(fun$y, ya)
ir$createStore(fun$len, lena)
ir$createStore(one, iv)
ir$createBr(cond)
ir$setInsertPoint(cond)

  # loop condition: continue while i < len.
a = ir$createLoad(iv)
b = ir$createLoad(lena)
ok = ir$createICmp(ICMP_SLT, a, b)
ir$createCondBr(ok, body, ret)
ir$setInsertPoint(ret)
ir$createRetVoid()
ir$setInsertPoint(body)

  # declare runif which takes a number but ignores it.
runif = Function("runif", DoubleType, c(n = Int32Type), mod)

  # compute delta
u = ir$createCall(runif, one)
gt = ir$createFCmp(FCMP_UGE, u, createDoubleConstant(.5))
ir$createStore(ir$createSelect(gt, minusOne, one), delta)

  # now determine whether to go horizontally or vertically.
u = ir$createCall(runif, one)
gt = ir$createFCmp(FCMP_UGE, u, createDoubleConstant(.5))
ir$createCondBr(gt, h, v)
Next we fill in the horizontal move. We load x[i-1], add delta to it and then store the result in x[i].
ir$setInsertPoint(h)

  # x[i-1] + delta
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(xa)
idx = ir$createSExt(b, 64L)
x.prev = ir$createLoad(ir$createGEP(r, idx))
nw = ir$binOp(Add, x.prev, ir$createLoad(delta))

  # store the result into x[i]
a = ir$createLoad(xa)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createGEP(a, idx)
ir$createStore(nw, xi)
Next we copy y[i-1] to y[i].
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(ya)
idx = ir$createSExt(b, 64L)
y.prev = ir$createLoad(ir$createGEP(r, idx))

a = ir$createLoad(ya)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
yi = ir$createGEP(a, idx)
ir$createStore(y.prev, yi)
Finally, we jump to the loop increment and then to the condition.
ir$createBr(increment)
This is the increment block that updates i and then jumps to the loop condition.
ir$setInsertPoint(increment)
i = ir$createLoad(iv)
inc = ir$binOp(Add, i, 1L)
ir$createStore(inc, iv)
ir$createBr(cond)
The vertical move is analogous: we add delta to y[i-1] and store the result in y[i], copy x[i-1] to x[i], and then jump to the increment block.

ir$setInsertPoint(v)

  # y[i] = y[i-1] + delta
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(ya)
idx = ir$createSExt(b, 64L)
y.prev = ir$createLoad(ir$createGEP(r, idx))
nw = ir$binOp(Add, y.prev, ir$createLoad(delta))
a = ir$createLoad(ya)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
yi = ir$createGEP(a, idx)
ir$createStore(nw, yi)

  # x[i] = x[i-1]
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(xa)
idx = ir$createSExt(b, 64L)
x.prev = ir$createLoad(ir$createGEP(r, idx))
a = ir$createLoad(xa)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createGEP(a, idx)
ir$createStore(x.prev, xi)

ir$createBr(increment)
We now run our compiled code to see how fast it is relative to the other implementations. Our compiled code references the runif routine and so needs to be able to find that. We do this before running the code as it only needs to be done once.
ee = ExecutionEngine(mod)
Optimize(mod, ee)
llvmAddSymbol(runif = getNativeSymbolInfo("runif")$address)
We'll set the number of steps to something quite large so that we measure sufficiently lengthy computations:
n = 1e7
We run the interpreted version 5 times and take the medians of the three different measurements (user, system and elapsed):
#interp = system.time({rw2d1(n)})
interp = replicate(5, system.time({rw2d1(n)}))
interp = apply(interp, 1, median)
Now we run the compiled code
doit =
function(n = 1000000)
{
  x = integer(n)
  y = integer(n)
  run(fun, x = x, y = y, n, .ee = ee, .all = TRUE)[c("x", "y")]
}

tt = system.time({ doit(n) })
tt = system.time({ doit(n) })   # the second run is faster
Since this runs much faster than the interpreted version, we run more repetitions and again compute the median:
tt = replicate(10, system.time({ doit(n) }))
tt = apply(tt, 1, median)
We'll also see how much byte-compiling our interpreted function improves the performance:
library(compiler)
g = cmpfun(rw2d1)
cmp = replicate(3, system.time(g(n)))
cmp = apply(cmp, 1, median)
interp/cmp
Finally, we'll run the manually vectorized version of the random walk that Ross Ihaka wrote. This is pure R, but it heavily leverages C-level computations within the interpreter.
ross = "rw1.R"
source(ross)

fastInterp = replicate(10, system.time({rw2d5(n)}))
fastInterp = apply(fastInterp, 1, median)

fastInterp/interp
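The vectorized implementation itself is not shown here, but the general idea is to draw all of the random choices up front and accumulate the steps with cumsum(). A rough sketch of that idea (not necessarily Ihaka's exact code; the name rw2d_vec_sketch is ours) is:

rw2d_vec_sketch =
function(n = 100)
{
  steps = ifelse(runif(n - 1) > .5, 1L, -1L)   # step size for each move
  horiz = runif(n - 1) > .5                    # horizontal or vertical move?
  list(x = c(0, cumsum(ifelse(horiz, steps, 0L))),
       y = c(0, cumsum(ifelse(horiz, 0L, steps))))
}

This avoids the interpreted loop entirely, which is where the factor-of-200 speedup mentioned at the outset comes from.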
  # speed of the vectorized R version relative to our compiled routine
  # (both have already been reduced to their medians above)
fastInterp/tt
tmp = c(interp[1], cmp[1], fastInterp[1], tt[1])
m = matrix(c(tmp, interp[1]/tmp), length(tmp), 2,
           dimnames = list(c("Interpreted", "Byte Compiled", "Vectorized", "Rllvm"),
                           c("Time", "Speedup")))
With an optimized R (i.e., compiled with -O3):