Here we implement the initial version of Ross Ihaka's 2D random walk example of how to improve the performance of R code. His example starts with the naive and obvious implementation of the 2D walk. At the end, he vectorizes this in R code and gets a speedup of a factor of about 200.
The naive implementation is
rw2d1 =
function(n = 100)
{
  xpos = ypos = numeric(n)
  for(i in 2:n) {
      # Decide whether we are moving horizontally or vertically.
    delta = if(runif(1) > .5) 1 else -1
    if (runif(1) > .5) {
      xpos[i] = xpos[i-1] + delta
      ypos[i] = ypos[i-1]
    } else {
      xpos[i] = xpos[i-1]
      ypos[i] = ypos[i-1] + delta
    }
  }
  list(x = xpos, y = ypos)
}
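As a quick sanity check (not part of the timings below), we can run a short walk and plot the path:

w = rw2d1(1000)
plot(w$x, w$y, type = "l", xlab = "x", ylab = "y")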
We have seen how to write loops, access elements via pointers and do conditional branching in cumsum.Rdb, so "compiling" this function is very similar.
We are going to implement a slightly different version that receives the int * pointers for x and y and just fills them in. We can write a .Call() wrapper for this so that we can compare apples to apples.
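To make the target concrete, here is an R-level sketch of what the compiled routine will compute (the name rw2d_fill is ours, purely for illustration). Keep in mind that R copies its arguments, so this sketch has to return x and y, whereas the compiled routine will write directly into the memory it is given.

  # Illustrative R analogue of the routine we generate below (hypothetical name).
rw2d_fill =
function(x, y, len)
{
  for(i in 2:len) {
    delta = if(runif(1) > .5) 1L else -1L
    if(runif(1) > .5) {        # horizontal move
      x[i] = x[i-1] + delta
      y[i] = y[i-1]
    } else {                   # vertical move
      x[i] = x[i-1]
      y[i] = y[i-1] + delta
    }
  }
  list(x = x, y = y)
}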
library(Rllvm)

InitializeNativeTarget()

mod = Module("2Drw")
ptrInt = pointerType(Int32Type)
fun = Function("rw2d", VoidType, c(x = ptrInt, y = ptrInt, len = Int32Type), mod)
entry = Block(fun, "entry")
cond = Block(fun, "loopCond")
body = Block(fun, "loopBody")
ret = Block(fun, "return")
h = Block(fun, "Horizontal")
v = Block(fun, "Vertical")
increment = Block(fun, "increment")
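Before wiring these blocks together, it helps to keep in mind the control flow we are aiming for (a summary added here for orientation):

  #   entry      -> loopCond
  #   loopCond   -> loopBody   (while i < len), otherwise -> return
  #   loopBody   -> Horizontal or Vertical  (random choice)
  #   Horizontal -> increment
  #   Vertical   -> increment
  #   increment  -> loopCond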
one = createIntegerConstant(1L)
minusOne = createIntegerConstant(-1L)
ir = IRBuilder(entry)

  # local variables for the loop counter, the length and the two pointers.
iv = ir$createLocalVariable(Int32Type, "i")
lena = ir$createLocalVariable(Int32Type, "lenp")
xa = ir$createLocalVariable(ptrInt, "xp")
ya = ir$createLocalVariable(ptrInt, "yp")
delta = ir$createLocalVariable(Int32Type, "delta")

  # copy the incoming arguments into the locals, set i to 1
  # and jump to the loop condition.
ir$createStore(fun$x, xa)
ir$createStore(fun$y, ya)
ir$createStore(fun$len, lena)
ir$createStore(one, iv)
ir$createBr(cond)
ir$setInsertPoint(cond)

  # loop condition: continue while i < len.
a = ir$createLoad(iv)
b = ir$createLoad(lena)
ok = ir$createICmp(ICMP_SLT, a, b)
ir$createCondBr(ok, body, ret)
ir$setInsertPoint(ret)
ir$createRetVoid()
ir$setInsertPoint(body)

  # declare runif which takes a number but ignores it.
runif = Function("runif", DoubleType, c(n = Int32Type), mod)

  # compute delta
u = ir$createCall(runif, one)
gt = ir$createFCmp(FCMP_UGE, u, createDoubleConstant(.5))
ir$createStore(ir$createSelect(gt, minusOne, one), delta)

  # now determine whether to go horizontally or vertically.
u = ir$createCall(runif, one)
gt = ir$createFCmp(FCMP_UGE, u, createDoubleConstant(.5))
ir$createCondBr(gt, h, v)
Next we fill in the horizontal move. We load x[i-1], add delta to it and then store the result in x[i].
ir$setInsertPoint(h)

  # x[i-1] + delta
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(xa)
idx = ir$createSExt(b, 64L)
x.prev = ir$createLoad(ir$createGEP(r, idx))
nw = ir$binOp(Add, x.prev, ir$createLoad(delta))

  # store the result into x[i]
a = ir$createLoad(xa)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createGEP(a, idx)
ir$createStore(nw, xi)
Next we copy y[i-1] to y[i].
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(ya)
idx = ir$createSExt(b, 64L)
y.prev = ir$createLoad(ir$createGEP(r, idx))

a = ir$createLoad(ya)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
yi = ir$createGEP(a, idx)
ir$createStore(y.prev, yi)
Finally, we jump to the loop increment and then to the condition.
ir$createBr(increment)
This is the increment block that updates i and then jumps to the loop condition.
ir$setInsertPoint(increment)
i = ir$createLoad(iv)
inc = ir$binOp(Add, i, 1L)
ir$createStore(inc, iv)
ir$createBr(cond)
The vertical move is analogous: we add delta to y[i-1] and store the result in y[i], copy x[i-1] to x[i], and then jump to the increment block.

ir$setInsertPoint(v)

  # y[i] = y[i-1] + delta
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(ya)
idx = ir$createSExt(b, 64L)
y.prev = ir$createLoad(ir$createGEP(r, idx))
nw = ir$binOp(Add, y.prev, ir$createLoad(delta))
a = ir$createLoad(ya)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
yi = ir$createGEP(a, idx)
ir$createStore(nw, yi)

  # x[i] = x[i-1]
a = ir$createLoad(iv)
b = ir$binOp(Sub, a, one)
r = ir$createLoad(xa)
idx = ir$createSExt(b, 64L)
x.prev = ir$createLoad(ir$createGEP(r, idx))
a = ir$createLoad(xa)
i = ir$createLoad(iv)
idx = ir$createSExt(i, 64L)
xi = ir$createGEP(a, idx)
ir$createStore(x.prev, xi)

ir$createBr(increment)
We now run our compiled code to see how fast it is relative to the other implementations. Our compiled code references the runif routine and so needs to be able to find that. We do this before running the code as it only needs to be done once.
ee = ExecutionEngine(mod)
Optimize(mod, ee)
llvmAddSymbol(runif = getNativeSymbolInfo("runif")$address)
We'll set the number of steps to something quite large so that we measure sufficiently lengthy computations:
n = 1e7
We run the interpreted version 5 times and take the medians of the three different measurements (user, system and elapsed):
#interp = system.time({rw2d1(n)})
interp = replicate(5, system.time({rw2d1(n)}))
interp = apply(interp, 1, median)
Now we run the compiled code
doit =
function(n = 1000000)
{
  x = integer(n)
  y = integer(n)
  run(fun, x = x, y = y, n, .ee = ee, .all = TRUE)[c("x", "y")]
}

tt = system.time({ doit(n) })
tt = system.time({ doit(n) })   # the second run is faster
Since this runs much faster than the interpreted version, we run more repetitions and again compute the median:
tt = replicate(10, system.time({ doit(n) }))
tt = apply(tt, 1, median)
We'll also see how much byte-compiling our interpreted function improves the performance:
library(compiler)
g = cmpfun(rw2d1)
cmp = replicate(3, system.time(g(n)))
cmp = apply(cmp, 1, median)
interp/cmp
Finally, we'll run the manually vectorized version of the random walk that Ross Ihaka wrote. This is pure R, but it heavily leverages C-level computations within the interpreter.
ross = "rw1.R"
source(ross)

fastInterp = replicate(10, system.time({rw2d5(n)}))
fastInterp = apply(fastInterp, 1, median)

fastInterp/interp
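The vectorized implementation itself is not shown here, but the general idea is to draw all of the random choices up front and accumulate the steps with cumsum(). A rough sketch of that idea (not necessarily Ihaka's exact code; the name rw2d_vec_sketch is ours) is:

rw2d_vec_sketch =
function(n = 100)
{
  steps = ifelse(runif(n - 1) > .5, 1L, -1L)   # step size for each move
  horiz = runif(n - 1) > .5                    # horizontal or vertical move?
  list(x = c(0, cumsum(ifelse(horiz, steps, 0L))),
       y = c(0, cumsum(ifelse(horiz, 0L, steps))))
}

This avoids the interpreted loop entirely, which is where the factor-of-200 speedup mentioned at the outset comes from.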
  # speed of the vectorized R version relative to our compiled routine
  # (both have already been reduced to their medians above)
fastInterp/tt
tmp = c(interp[1], cmp[1], fastInterp[1], tt[1])
m = matrix(c(tmp, interp[1]/tmp), length(tmp), 2,
           dimnames = list(c("Interpreted", "Byte Compiled", "Vectorized", "Rllvm"),
                           c("Time", "Speedup")))
With an optimized R (i.e., compiled with -O3):