## Saturday, September 27, 2008

### Tutorial 10: Generic programming using quotations

The code for this tutorial comes in several files.
First: GenericPhysics.fs

#light

open System
open Microsoft.FSharp.Linq.QuotationEvaluation

#nowarn "57"

type ps<'Vec> = class
val pos:'Vec
val speed:'Vec
new(pos, speed) = { pos = pos; speed = speed }
end

type sa<'Vec> = class
val speed:'Vec
val accel:'Vec
new(speed, accel) = { speed = speed; accel = accel }
end

let mk_gravity scale_func (up: 'Vec): 'Vec =
let q =
<@
let (*) = %scale_func in -9.81 * up
@>
q.Eval()

let mk_spring scale_func: float -> 'Vec -> 'Vec =
let q =
<@
let (*) = %scale_func in fun k v -> -k * v
@>
q.Eval()

let mk_drag scale_func: float -> 'Vec -> 'Vec =
let q =
<@
let (*) = %scale_func in fun k v -> -k * v
@>
q.Eval()

let mk_euler_implicit plus_func scale_func =
let q =
<@
let (+) = %plus_func in
let (*) = %scale_func in
fun accel_func pos speed delta ->
let speed2 = speed + delta * (accel_func pos speed)
let pos2 = pos + delta * speed2
ps<_>(pos2, speed2)
@>
q.Eval()

let inline adapt (f: 'Vec -> 'Vec -> 'Vec) =
fun pos speed ->
let accel = f pos speed in
sa<_>(speed, accel)

let mk_rk4_q plus_func scale_func =
<@
fun deriv_func pos speed delta ->
let pf = %plus_func
let sf = %scale_func

let extrapolate x delta dx =
pf x (sf delta dx)

let half_delta = 0.5 * delta

let sa1: sa<_> = deriv_func pos speed
let sa2 = deriv_func (extrapolate pos half_delta sa1.speed) (extrapolate speed half_delta sa1.accel)
let sa3 = deriv_func (extrapolate pos half_delta sa2.speed) (extrapolate speed half_delta sa2.accel)
let sa4 = deriv_func (extrapolate pos delta sa3.speed) (extrapolate speed delta sa3.accel)

let sixth_delta = delta / 6.0

let update old v1 v2 v3 v4 =
pf old (sf sixth_delta
(pf v1
(pf v2
(pf v2
(pf v3
(pf v3
v4))))))

let pos1 = update pos sa1.speed sa2.speed sa3.speed sa4.speed
let speed1 = update speed sa1.accel sa2.accel sa3.accel sa4.accel

ps<'Vec>(pos1, speed1)
@>

let mk_rk4 plus_func scale_func =
let q = mk_rk4_q plus_func scale_func
q.Eval()

let simulate intg_func (t0: float) t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps: ps<_> = intg_func pos0 speed0 delta
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0

Second file: FloatPhysics.fs

#light

let gravity = GenericPhysics.mk_gravity <@ fun x y -> x * y @> 1.0
let drag = GenericPhysics.mk_drag <@ fun x y -> x * y @>
let spring = GenericPhysics.mk_spring <@ fun x y -> x * y @>
let rk4 = GenericPhysics.mk_rk4 <@ fun x y -> x + y @> <@ fun x y -> x * y @>
let euler = GenericPhysics.mk_euler_implicit <@ fun x (y:float) -> x + y @> <@ fun x y -> x * y @>

Third file: Util.fs

#light

open System

let run_time_func (func: unit -> 'a) =
let watch = new System.Diagnostics.Stopwatch()
watch.Start()
let res = func ()
watch.Stop()
let diff0 = float watch.ElapsedMilliseconds / 1000.0
(diff0, res)

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

Last file: Main.fs
#light

open System
open FloatPhysics
open Util

let inline accel pos speed =
drag 1.0 speed + spring 100.0 pos + gravity

let ef = (FloatPhysics.euler accel)
let rk4f = (rk4 (GenericPhysics.adapt accel))

let run euler_func s0 =
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.005

let run_timed map_func =
let n_runs = 2
let times =
[
for i in 1..n_runs ->
let (run_time, res) =
run_time_func (fun () ->
s0 |> map_func(fun (x, y) ->
GenericPhysics.simulate euler_func t0 t_fin delta x y))
run_time
]

let max = Seq.max times
let min = Seq.min times
let avg = (Seq.fold (fun x y -> x+y) 0.0 times) / float n_runs
(min, max, avg)

let single_min, single_max, single_avg = run_timed List.map
let multi_min, multi_max, multi_avg = run_timed map_parallel

printfn "Single (min, max, avg): %f %f %f" single_min single_max single_avg
printfn "Multi (min, max, avg): %f %f %f" multi_min multi_max multi_avg

let n_bodies = 2

let s0_float = [ for s in 1..n_bodies -> (float s, 0.0) ]
run ef s0_float
run rk4f s0_float

If you want to compile these sources using Visual Studio, you need to know that the order in which the files appear in the project is important. If a file B uses types or values from a file A, then A must be placed higher in the list of files of the project.

You will also need to add references to the following DLLs: FSharp.PowerPack and FSharp.PowerPack.Linq.

Let us now take a deeper look at the new features of F# used in the code.

open Microsoft.FSharp.Linq.QuotationEvaluation

#nowarn "57"

The main item of this Tutorial is quotations. These represent pieces of code which can be constructed at compile time, changed dynamically at run time, and recompiled and executed dynamically. This last part is still experimental, and requires the use of a specific module.

let mk_gravity scale_func (up: 'Vec): 'Vec =
let q =
<@
let (*) = %scale_func in -9.81 * up
@>
q.Eval()

The interesting piece here is the part between <@ and @>. The entire thing is called a quotation. It must contain a valid segment of F# code. The F# compiler will parse the code in this quotation, and build an object representing this code. More precisely, the compiler will generate an abstract syntax tree (AST). Note that this object does not in itself constitute executable code.
You may be wondering what "%scale_func" means in this context. The "%" operator specifies that another AST should be inserted there. "scale_func" is the variable which should contain the AST to insert.
To clarify this, look at the call to mk_gravity in FloatPhysics.fs:
let gravity = GenericPhysics.mk_gravity <@ fun x y -> x * y @> 1.0

The resulting quotation would therefore look as follows:

<@
let (*) = fun x y -> x * y in -9.81 * up
@>

which is has the same effect as:

<@
-9.81 * up
@>

An AST generated from a quotation can be compiled using method "Compile", or compiled and executed using method "Eval".
Had I written "q.Eval()" instead, "mk_gravity" would have returned a value of type "unit -> 'Vec", i.e. a function taking no argument and returning a vector when called. We may just as well returned the vector directly, which we can do by using method "Eval" instead.

If we take a step back and look at what we have just achieved, we will see that we have constructed a rather complicated way of perform the product of a vector by a scalar.

Let us look at the next function for a more interesting example.
let mk_spring scale_func: float -> 'Vec -> 'Vec =
let q =
<@
let (*) = %scale_func in fun k v -> -k * v
@>
q.Eval()

While I wrote this code I repeatedly found myself wondering "What did I just write?". Happily, the compiler can answer that question.This function takes an AST which should represent a function taking a float and a vector (in curried form) and returning a vector. The "mk_spring" function will then return a function taking a float and a vector and returning a vector. In other words, it will "unquote" the quotation, which is simply done by evaluating the AST. Before doing so, it inserts the quoted form of the function performing the product of a vector by a scalar into q, which in the present case has the same effect as:
<@
let (*) = fun x y -> x * y in fun k v -> -k * v
@>

Or in simpler terms:
<@
-k * v
@>

We have found yet another way to perform the product of a vector (which is actually a float in this simple example) by a scalar (another float, but not to be mistaken with the vector). The "mk_spring" function differs from "mk_gravity" in the way that its parameters are expected to vary. Gravity is assumed to be constant, and so is the upward direction, denoted by the "up" parameter of "mk_gravity". The stiffness coefficient and the position of the body, on the other hand, change across calls.

Using quotations to achieve genericity can be seen as some kind of hybrid between the functional approach described in Tutorial 8 and the approach based on static constraints of Tutorial 9. It shares with Tutorial 8 its use of function parameters to specify the implementation of operations on vectors. It share with Tutorial 9 its way of emitting specific code for each concrete type of vector.

I had hoped this would result in more efficient code, but that is unfortunately not currently the case. The quotation evaluation functionality is still experimental, and it may be possible that future versions of F# might produce faster code.

Update
I've come across an entry in Jomo Fisher's blog about Linq which makes me wonder if my code is really right. Jomo has used Linq in a setting where performance is truly impressive. Why I am getting such bad results here? Further investigation needed...
End of Update

The point of the approach described here is not so much what it achieves, but what possibilities it opens. For instance, it is possible to implement aggressive optimization techniques that the F# compiler will probably never support, such as rearranging code to avoid creating objects for intermediate values, or turning pure computations into computations with local side effects. Obviously, it would not be realistic to support all features of F#, but a subset dedicated to computations would probably suffice. Using Quotations.Expr from F# and Reflection.Emit from .NET, it seems very feasible to write a specialized optimizing compiler for a subset of F#.
Other interesting potential uses of quotations for games include programs to run on the GPU (shaders), or on any other kind of peripheral, for that matter (game controllers, head-on displays...).

## Wednesday, September 24, 2008

### Updated C++ version

This is the C++ version of the integrator, updated to use vectors.

UPDATE: Bug fixes. Funny how non-explicit single-argument constructors can hide errors in your code...

// RK++NET.cpp : main project file.

#include "stdafx.h"
#include <windows.h>

#include <map>
#include <vector>
#include <iostream>

const double gravity = -9.81;

template<typename T>
inline
T spring(const T& pos)
{
return -100.0 * pos;
}

template<typename T>
inline
T drag(double k, const T& pos)
{
return -k * pos;
}

template<typename A>
class EulerImplicit
{
public:
EulerImplicit(const A& accel):
accel(accel)
{
}

template<typename T>
inline
void operator()(T& pos, T& speed, const double delta) const
{
speed += delta * accel(pos, speed);
pos += delta * speed;
}

private:
const A& accel;
};

template<typename A>
class RK4
{
public:
RK4(const A& accel):
accel(accel)
{
}

template<typename T>
inline
void operator()(T& pos, T& speed, const double delta) const
{
const double half_delta = 0.5 * delta;

std::pair<T, T> k1 = accel(pos, speed);
std::pair<T, T> k2 = accel(pos + half_delta * k1.first, speed + half_delta * k1.second);
std::pair<T, T> k3 = accel(pos + half_delta * k2.first, speed + half_delta * k2.second);
std::pair<T, T> k4 = accel(pos + delta * k3.first, speed + delta * k3.second);

pos += delta / 6.0 * (k1.first + 2.0 * (k2.first + k3.first) + k4.first);
speed += delta / 6.0 * (k1.second + 2.0 * (k2.second + k3.second) + k4.second);
}

private:
const A& accel;
};

template<typename T>
class Force
{
public:
explicit Force(const T& up) : m_drag(1.0), m_gravity(gravity*up)
{
}

T operator()(const T& pos, const T& speed) const
{
return drag(m_drag, speed) + spring(pos) + m_gravity;
}

private:

const double m_drag;
T m_gravity;
};

template<typename A>
{
public:
accel(accel)
{
}

template<typename T>
inline std::pair<T, T> operator()(const T& pos, const T& speed) const
{
std::pair<T, T> ret;
ret.first = speed;
ret.second = accel(pos, speed);
return ret;
}

private:
const A& accel;
};

template<typename T, typename I>
inline
void simulate(const I& intg_func, T& pos, T& speed, double t0, const double t_fin, const double delta)
{
while(t0 < t_fin)
{
t0 += delta;
intg_func(pos, speed, delta);
}
}

template<int DIM>
class VecD
{
public:
VecD() {
for (int i=0; i<DIM; ++i)
vals[i] = 0.0;
}

explicit VecD(double v0, double v1=0.0, double v2=0.0)
{
if (DIM >= 1)
vals[0] = v0;

if (DIM >= 2)
vals[1] = v1;

if (DIM >= 3)
vals[2] = v2;

for (int i=3; i<DIM; ++i)
vals[i] = 0.0;
}

inline VecD<DIM> operator+=(const VecD<DIM>& other)
{
for (int i=0; i<DIM; ++i)
vals[i] += other.vals[i];
return *this;
}

inline VecD<DIM> operator+(const VecD<DIM>& other) const
{
VecD<DIM> tmp(*this);
return tmp += other;
}

public:
double vals[DIM];
};

template<int DIM>
inline
VecD<DIM>
operator*(double k, const VecD<DIM>& other)
{
VecD<DIM> ret(other);
for (int i=0; i<DIM; ++i)
ret.vals[i] *= k;
return ret;
}

typedef VecD<1> Vec1D;
typedef VecD<2> Vec2D;
typedef VecD<3> Vec3D;

void make_pos(int i, double& out) { out = (double)i; }
void make_pos(int i, Vec1D& out) { out = Vec1D((double) i); }
void make_pos(int i, Vec2D& out) { out = Vec2D(1.0, (double) i); }
void make_pos(int i, Vec3D& out) { out = Vec3D(1.0, (double) i, -1.0); }

void make_up(double &out) { out = 1.0; }
void make_up(Vec1D &out) { out = Vec1D(1.0); }
void make_up(Vec2D &out) { out = Vec2D(0.0, 1.0); }
void make_up(Vec3D &out) { out = Vec3D(0.0, 1.0, 0.0); }

template<typename Vec>
void run_sim(const char *descr)
{
typedef std::vector< std::pair<Vec, Vec> > Tuples;

std::cout << descr << std::endl;

const double t0 = 0.0;
const double t_fin = 1000.0;
const double delta = 0.005;
const int N_BODIES = 100;

Tuples initial_states;
initial_states.reserve(N_BODIES);
for (int i=0; i<N_BODIES; ++i)
{
Vec pos;
make_pos(i, pos);
initial_states.push_back( std::make_pair(pos, Vec(0.0)) );
}

double el = 0.0;
const int NRUNS=3;
for (int i=0; i<NRUNS; ++i)
{
Tuples states(initial_states);

Vec up;
make_up(up);
Force<Vec> accel(up);
EulerImplicit< Force<Vec> > euler(accel);
long int start0 = GetTickCount();
for (Tuples::iterator it = states.begin();
it != states.end();
++it)
{
simulate(euler, it->first, it->second, t0, t_fin, delta);
}
el += (GetTickCount() - start0) / 1000.0;
}
std::cout << "Time: " << el / NRUNS << "\n";
}

int main(int argc, char **argv)
{
run_sim<double>("double");
run_sim<Vec1D>("class 1D");
run_sim<Vec2D>("class 2D");
run_sim<Vec3D>("class 3D");

char c;
std::cin >> c;
return 0;
}

### Tutorial 9: Generic programming using type constraints

This Tutorial 9 shows a generic implementation of the integrator, based on type parameters.

#light

open System

let run_time_func (func: unit -> 'a) =
let watch = new System.Diagnostics.Stopwatch()
watch.Start()
let res = func ()
watch.Stop()
let diff0 = float watch.ElapsedMilliseconds / 1000.0
(diff0, res)

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

type ps<'num> = struct
val pos:'num
val speed:'num
new(pos, speed) = { pos = pos; speed = speed }
end

type VecN =
One of float
| Two of float*float
| Three of float*float*float
with
static member inline plus (a: VecN) (b: VecN) =
match (a, b) with
| (One(v), One(w)) -> One(v+w)
| (Two(v0, v1), Two(w0, w1)) -> Two(v0 + w0, v1 + w1)
| (Three(v0, v1, v2), Three(w0, w1, w2)) -> Three(v0 + w0, v1 + w1, v2 + w2)
| _ -> failwith "Size mismatch"

static member inline scale (k: float) (a: VecN) =
match a with
| One(v) -> One(k * v)
| Two(v0, v1) -> Two(k * v0, k * v1)
| Three(v0, v1, v2) -> Three(k * v0, k * v1, k * v2)

type Vec1d = struct
val x: float

new (x) = { x = x }

static member inline plus (a: Vec1d) (b: Vec1d) = Vec1d(a.x + b.x)
static member inline scale (k: float) (a: Vec1d) = Vec1d(k * a.x)
end

type Vec2d = struct
val x: float
val y: float

new (x, y) = { x = x; y = y }

static member inline plus (a: Vec2d) (b: Vec2d) = Vec2d(a.x + b.x, a.y + b.y)
static member inline scale (k: float) (a: Vec2d) = Vec2d(k * a.x, k * a.y)
end

type Vec3d = struct
val x: float
val y: float
val z: float

new (x, y, z) = { x = x; y = y; z = z }

static member inline plus (a: Vec3d) (b: Vec3d) = Vec3d(a.x + b.x, a.y + b.y, a.z + b.z)
static member inline scale (k: float) (a: Vec3d) = Vec3d(k * a.x, k * a.y, k * a.z)
end

type CVec1d (x: float) = class
member v.X = x

static member inline plus (a: CVec1d) (b: CVec1d) = CVec1d(a.X + b.X)
static member inline scale (k: float) (a: CVec1d) = CVec1d(k * a.X)
end

type CVec2d (x: float, y:float) = class
member v.X = x
member v.Y = y

static member inline plus (a: CVec2d) (b: CVec2d) = CVec2d(a.X + b.X, a.Y + b.Y)
static member inline scale (k: float) (a: CVec2d) = CVec2d(k * a.X, k * a.Y)
end

type CVec3d (x: float, y: float, z: float) = class
member v.X = x
member v.Y = y
member v.Z = z

static member inline plus (a: CVec3d) (b: CVec3d) = CVec3d(a.X + b.X, a.Y + b.Y, a.Z + b.Z)
static member inline scale (k: float) (a: CVec3d) = CVec3d(k * a.X, k * a.Y, k * a.Z)
end

let gravity =
-9.81

let inline spring (pos: ^vec) =
let (*) k x = (^vec: (static member inline scale: float -> ^vec -> ^vec) k) x
let k = 100.0
-k * pos

let inline drag k (pos: ^speed) (speed: ^speed) =
let (*) k x = (^vec: (static member inline scale: float -> ^vec -> ^vec) k) x
-k * speed

let inline euler_implicit accel_func (pos: ^vec) (speed: ^vec) delta =
let (+) x y = (^vec: (static member inline plus: ^vec -> ^vec -> ^vec) x) y
let (*) k x = (^vec: (static member inline scale: float -> ^vec -> ^vec) k) x

let speed2 = speed + (delta * (accel_func pos speed))
let pos2 = pos + (delta * speed2)
ps<_>(pos2, speed2)

let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps: ps<_> = oc.Invoke(pos0, speed0, delta)
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0

let inline accel (up: ^vec) (pos: ^vec) (speed: ^vec) =
let (+) x y = (^vec: (static member inline plus: ^vec -> ^vec -> ^vec) x) y
let (*) k x = (^vec: (static member inline scale: float -> ^vec -> ^vec) k) x
(drag 1.0 pos speed) + (spring pos) + (gravity * up)

let run euler_func s0 =
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.005

let run_timed map_func =
let n_runs = 3
let times =
[
for i in 1..n_runs ->
let (run_time, res) = run_time_func (fun () -> s0 |> map_func(fun (x, y) -> simulate euler_func t0 t_fin delta x y))
run_time
]

let max = Seq.max times
let min = Seq.min times
let avg = (Seq.fold (fun x y -> x+y) 0.0 times) / float n_runs
(min, max, avg)

let single_min, single_max, single_avg = run_timed List.map
let multi_min, multi_max, multi_avg = run_timed map_parallel

printfn "Single (min, max, avg): %f %f %f" single_min single_max single_avg
printfn "Multi (min, max, avg): %f %f %f" multi_min multi_max multi_avg

let n_bodies = 10

let s0_Vec1d = [ for s in 1..n_bodies -> (Vec1d(float s), Vec1d(0.0)) ]
let ef_Vec1d = euler_implicit (accel (Vec1d(1.0)))

let s0_Vec2d = [ for s in 1..n_bodies -> (Vec2d(float s, 1.0), Vec2d(0.0, 0.0)) ]
let ef_Vec2d = euler_implicit (accel (Vec2d(0.0, 1.0)))

let s0_Vec3d = [ for s in 1..n_bodies -> (Vec3d(float s, 1.0, -1.0), Vec3d(0.0, 0.0, 0.0)) ]
let ef_Vec3d = euler_implicit (accel (Vec3d(0.0, 1.0, 0.0)))

let s0_CVec1d = [ for s in 1..n_bodies -> (CVec1d(float s), CVec1d(0.0)) ]
let ef_CVec1d = euler_implicit (accel (CVec1d(1.0)))

let s0_CVec2d = [ for s in 1..n_bodies -> (CVec2d(float s, 1.0), CVec2d(0.0, 0.0)) ]
let ef_CVec2d = euler_implicit (accel (CVec2d(0.0, 1.0)))

let s0_CVec3d = [ for s in 1..n_bodies -> (CVec3d(float s, 1.0, -1.0), CVec3d(0.0, 0.0, 0.0)) ]
let ef_CVec3d = euler_implicit (accel (CVec3d(0.0, 1.0, 0.0)))

let s0_One = [ for s in 1..n_bodies -> (One(float s), One(0.0)) ]
let ef_One = euler_implicit (accel (One(1.0)))

let s0_Two = [ for s in 1..n_bodies -> (Two(float s, 1.0), Two(0.0, 0.0)) ]
let ef_Two = euler_implicit (accel (Two(0.0, 1.0)))

let s0_Three = [ for s in 1..n_bodies -> (Three(float s, 1.0, -1.0), Three(0.0, 0.0, 0.0)) ]
let ef_Three = euler_implicit (accel (Three(0.0, 1.0, 0.0)))

printfn "1D (struct)"
run ef_Vec1d s0_Vec1d

printfn "2D (struct)"
run ef_Vec2d s0_Vec2d

printfn "3D (struct)"
run ef_Vec3d s0_Vec3d

printfn "1D (class)"
run ef_CVec1d s0_CVec1d

printfn "2D (class)"
run ef_CVec2d s0_CVec2d

printfn "3D (class)"
run ef_CVec3d s0_CVec3d

printfn "1D (union)"
run ef_One s0_One

printfn "2D (union)"
run ef_Two s0_Two

printfn "3D (union)"
run ef_Three s0_Three

Here come the detailed explanations:
type VecN =
One of float
[...]
type CVec3d (x: float, y: float, z: float) = class
member v.X = x
member v.Y = y
member v.Z = z

static member inline plus (a: CVec3d) (b: CVec3d) = CVec3d(a.X + b.X, a.Y + b.Y, a.Z + b.Z)
static member inline scale (k: float) (a: CVec3d) = CVec3d(k * a.X, k * a.Y, k * a.Z)
end

These definitions show different ways to implement vector types. The first method, which uses a union, is a bit uncommon. It is inelegant because it does not take advantage of the type system to capture the fact that adding vectors of different dimensions is not allowed. It is also the slowest implementation of vectors of all.
Follow definitions of one, two and three dimensional vectors using structs, which we already saw in the previous tutorial. The last group is almost identical, except for the fact that classes are used. I won't go into the differences between structs and classes in this post, but it's interesting to see that structs are faster in this program. This is however not generally true, even when considering only the special case of 3d vectors.

The interesting part starts below.
let inline spring (pos: ^vec) =

This defines a generic function "spring" taking a parameter of type "^vec". Note that "vec" here is not an actual type, but a type variable. We have already used the notation " 'vec", which plays a similar role. The difference resides in who creates the concrete function. In the case of " 'vec", this is handled by the generics mechanics in the .NET runtime. In the present case, using "^vec", it is handled by the F# compiler.
(QUESTION: is that correct? I tried replacing "^vec" with "'vec" in this line, and that worked too.)
This requires the function to be declared as inline. The compiler will generate a new copy of the code for each concrete type with which it is used.
For further details about type constraints, see Statically typed duck typing in F#.

let (*) k x = (^vec: (static member scale: float -> ^vec -> ^vec) k) x

I will start with the part of the line after the "=" symbol. It constitutes a type constraint, and means that "^vec" must have a static member named "scale", accepting a float and returning a function from "^vec" to "^vec" (in other words: a function taking a float and a "^vec" and returning a "^vec").
The rest of the line defines a function "*", thus overloading the multiplication operator, which allows us to write expressions in a natural way. Note that this method could also have been used in Tutorial 8, its availability is not subject to the use of type constraints.

let k = 100.0
-k * pos

Finally something we recognize...

You may have noticed that unlike Tutorial 8, this tutorial includes the optimizations from Tutorial 7 and 7b (aggressive inlining and OptimizedClosures). They do seem to make a difference here.

Update:
This Tutorial used to include a section comparing the performance of the F# and the C++ versions of Euler using doubles and structs (for 1-, 2- and 3-dimensional vectors). I noticed the timing results depended very much on whether the programs were started from Visual Studio or directly from Windows. C++ programs using .NET are faster when started from Visual studio, whereas F# programs run faster when started outside of Visual Studio. The differences were large enough to render comparisons meaningless, and I have therefore decided to remove the benchmark results until I have cleared that up.

I have kept the most original method to write generic code for the next Tutorial. Stay tuned!

## Monday, September 22, 2008

### Tutorial 8: Generic programming, functional approach

I have decided to continue this series by extending the integrator to handle 2d and 3d. One of the annoyances an amateur game developer encounters is the multiplication of 3d vector classes. They all use the same data layout, all offer the same methods, all offer the same performance, yet there is at least one such class per library. For instance, in my go-kart racing game, I have to deal with 3d vectors in OpenAL, 3d vectors in CGAL (which I used for DeLaunay triangulation), and should I start using ODE (a physics library), I'll have to deal with yet another type of 3d vectors. To make things worse, I am myself guilty of having implemented my own 3d vector math library.

The type inference feature in F# makes it easy to avoid introducing yet another 3d vector class. When the obligation to explicitly annotate all your variables with types disappears, writing polymorphic code becomes easy.

What approaches do we have to write generic code?
- the object-oriented approach based on interfaces, classes and method overriding,
- what I would call the functional approach,
- the template approach (in C++ terminology) also known as the generics approach (Java, .Net and friends terminology), and
- an approach which I have never seen yet (in compiled languages), which I will call the meta-programming approach.

I will skip over the first method, mostly because it has already abundantly been described and explained.
The second method is the subject of this Tutorial, and is also described in chapter 5 of Expert F#
The third method will be explored in the next Tutorial, and I am keeping the fourth method, the most exciting, for the last of the tutorials dedicated to generic programming.
Normally I would have started by showing the code, but in this case I thought I needed to justify why I should change the code at all. Especially if you come from C++, you might think that running the integration in 2d or 3d would just be a matter of changing the kind of vectors passed to the "simulate" and "accel" functions. As long as the new vector types support "operator+" and "operator*" we should be fine.
You would be wrong. Consider:
let x = 0.5 * a

I am not sure of the reasons behind this design decision, but in F#, this means that "a" and "x" are inferred to be the same type as 0.5, which is float. Moreover, the multiplication operator is now constrained to be applicable only on floats. That's right, you are no longer allowed to write "2 * 3". Like it or not, that's the way it is.

In practice, this means that I am probably best off avoiding operators + and *. Instead, the caller of my functions will have to tell them what functions to use to perform arithmetic operations.

OK, enough talk, here comes the code.

#light

open System

let run_time_func (func: unit -> 'a) =
let watch = new System.Diagnostics.Stopwatch()
watch.Start()
let res = func ()
watch.Stop()
let diff0 = float watch.ElapsedMilliseconds / 1000.0
(diff0, res)

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

type Vec1d = struct
val x: float

new (x) = { x = x }

static member plus (a: Vec1d) (b: Vec1d) = Vec1d(a.x + b.x)
static member scale (k: float) (a: Vec1d) = Vec1d(k * a.x)
end

type Vec2d = struct
val x: float
val y: float

new (x, y) = { x = x; y = y }

static member plus (a: Vec2d) (b: Vec2d) = Vec2d(a.x + b.x, a.y + b.y)
static member scale (k: float) (a: Vec2d) = Vec2d(k * a.x, k * a.y)
end

type Vec3d = struct
val x: float
val y: float
val z: float

new (x, y, z) = { x = x; y = y; z = z }

static member plus (a: Vec3d) (b: Vec3d) = Vec3d(a.x + b.x, a.y + b.y, a.z + b.z)
static member scale (k: float) (a: Vec3d) = Vec3d(k * a.x, k * a.y, k * a.z)
end

type VecOps<'vector, 'scalar> = class
val up: 'vector
val add: 'vector -> 'vector -> 'vector
val scale: 'scalar -> 'vector -> 'vector

end

let gravity (vec_ops: VecOps<'a, 'b>) =
vec_ops.scale -9.81 vec_ops.up

let spring (vec_ops: VecOps<'a, 'b>) stiff pos =
vec_ops.scale stiff pos

let drag (vec_ops: VecOps<'a, 'b>) k speed =
vec_ops.scale (-k) speed

let euler_implicit (vec_ops: VecOps<'a, 'b>) accel_func pos speed delta =
let speed2 = vec_ops.add speed (vec_ops.scale delta (accel_func pos speed))
vec_ops.add pos (vec_ops.scale delta speed2), speed2

let adapt (f: 'num -> 'num -> 'num) pos speed =
let accel = f pos speed
speed, accel

let runge_kutta_4 (vec_ops: VecOps<'a, 'b>) deriv_func pos speed delta =
let extrapolate x delta dx =

let half_delta = 0.5 * delta

let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (extrapolate pos half_delta s1) (extrapolate speed half_delta a1)
let s3, a3 = deriv_func (extrapolate pos half_delta s2) (extrapolate speed half_delta a2)
let s4, a4 = deriv_func (extrapolate pos delta s3) (extrapolate speed delta a3)

let sixth_delta = delta / 6.0

let update old v1 v2 v3 v4 =
(vec_ops.scale sixth_delta
v4))))))

let pos1 = update pos s1 s2 s3 s4
let speed1 = update speed a1 a2 a3 a4
pos1, speed1

let simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta
repeat t1 pos1 speed1
repeat t0 pos0 speed0

let run euler_func s0 =
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.005

let run_timed map_func =
let n_runs = 1
let times =
[
for i in 1..n_runs ->
let (run_time, res) = run_time_func (fun () -> s0 |> map_func(fun (x, y) -> simulate euler_func t0 t_fin delta x y))
run_time
]

let max = Seq.max times
let min = Seq.min times
let avg = (Seq.fold (fun x y -> x+y) 0.0 times) / float n_runs
(min, max, avg)

let single_min, single_max, single_avg = run_timed List.map
let multi_min, multi_max, multi_avg = run_timed map_parallel

printfn "Single (min, max, avg): %f %f %f" single_min single_max single_avg
printfn "Multi (min, max, avg): %f %f %f" multi_min multi_max multi_avg

let Vec1D_ops = VecOps<Vec1d, float>(Vec1d(1.0), Vec1d.plus, Vec1d.scale)
let Vec2D_ops = VecOps<Vec2d, float>(Vec2d(0.0, 1.0), Vec2d.plus, Vec2d.scale)
let Vec3D_ops = VecOps<Vec3d, float>(Vec3d(0.0, 1.0, 0.0), Vec3d.plus, Vec3d.scale)

let accel (vec_ops: VecOps<'a, 'b>) (pos: 'a) (speed: 'a) =
(vec_ops.add (drag vec_ops 1.0 speed) (gravity vec_ops))

let n_bodies = 10

let s0_Vec1d = [ for s in 1..n_bodies -> (Vec1d(float s), Vec1d(0.0)) ]
let ef_Vec1d = euler_implicit Vec1D_ops (accel Vec1D_ops)
let rk4_Vec1d = runge_kutta_4 Vec1D_ops (adapt (accel Vec1D_ops))

let s0_Vec2d = [ for s in 1..n_bodies -> (Vec2d(float s, 1.0), Vec2d(0.0, 0.0)) ]
let ef_Vec2d = euler_implicit Vec2D_ops (accel Vec2D_ops)
let rk4_Vec2d = runge_kutta_4 Vec2D_ops (adapt (accel Vec2D_ops))

let s0_Vec3d = [ for s in 1..n_bodies -> (Vec3d(float s, 1.0, -1.0), Vec3d(0.0, 0.0, 0.0)) ]
let ef_Vec3d = euler_implicit Vec3D_ops (accel Vec3D_ops)
let rk4_Vec3d = runge_kutta_4 Vec3D_ops (adapt (accel Vec3D_ops))

printfn "1D (struct)"
run ef_Vec1d s0_Vec1d
run rk4_Vec1d s0_Vec1d

printfn "2D (struct)"
run ef_Vec2d s0_Vec2d
run rk4_Vec2d s0_Vec2d

printfn "3D (struct)"
run ef_Vec3d s0_Vec3d
run rk4_Vec3d s0_Vec3d

The code was somewhat cleaned up from previous tutorials, but no new concepts were added. I will never the less describe in further details my usage of structs.

type Vec2d = struct
val x: float
val y: float

new (x, y) = { x = x; y = y }

static member plus (a: Vec2d) (b: Vec2d) = Vec2d(a.x + b.x, a.y + b.y)
static member scale (k: float) (a: Vec2d) = Vec2d(k * a.x, k * a.y)
end

This defines a two-dimensional vector type supporting addition through the static method plus, and scalar product by a float through the static method scale. Nothing surprising there.

The next struct is more interesting:
type VecOps<'vector, 'scalar> = class
val up: 'vector
val add: 'vector -> 'vector -> 'vector
val scale: 'scalar -> 'vector -> 'vector

end

Passing the addition and product methods to each function would be tedious, so I have defined a class to hold them. The first line defines a type called VecOps which is parametrized by two types, denoting respectively the vector type and the scalar type.
The second line is used to specify what is the upward direction. It's used only by function "gravity". A real integration library would normally not contain functions computing forces, since these are provided by the users. I know it's a bit clumsy to have this data member there.
The third and second lines define the type of the addition and scalar product operations.

It was a bit of a pain to add the extra parameter to each function manipulating vectors, but it's nothing unmanageable. This could have been averted by grouping all these functions into a class and passing the VecOps object to the constructor.
To avoid repeatedly passing VecOps objects to functions in the physics library, users can introduce short-cuts using function closures:
let ef_Vec1d = euler_implicit Vec1D_ops (accel Vec1D_ops)
let rk4_Vec1d = runge_kutta_4 Vec1D_ops (adapt (accel Vec1D_ops))

I'll close this up with a comment on performance. To put things bluntly, it's terrible. I am not quite sure if I have done something wrong, so I won't post scary numbers (yet), but I would say this method is not usable for real time physics simulations. I have un-inlined all functions, because (i) it makes little difference with respect to execution times, and (ii) the decisions to inline or not is best left to the .NET JIT.

There are however legitimate uses of the inline keyword, which will be shown in the next Tutorial.

## Friday, September 19, 2008

### Tutorial 7b: Further optimizations

The code for this tutorial shows the optimized version of Tutorial 7 by Jomo Fisher.
I am merely copy-pasting his changes and comments here, for the sake of completeness of this blog.

#light

open System

let run_time_func (func: unit -> 'a) =
let time0 = System.DateTime.Now
let res = func ()
let diff0 = System.DateTime.Now - time0
(diff0, res)

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

type ps<'num> = struct
val pos:'num
val speed:'num
new(pos, speed) = { pos = pos; speed = speed }
end

type sa<'num> = struct
val speed:'num
val accel:'num
new(speed, accel) = { speed = speed; accel = accel }
end

let gravity =
-9.81

let inline spring pos =
let k = 100.0 in
-k * pos

let inline drag k pos speed = -k * speed

let inline euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
ps<_>(pos + delta * speed2, speed2)

let inline adapt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
sa<_>(speed, accel)

let inline runge_kutta_4 deriv_func pos speed delta =
let half_delta = 0.5 * delta
let sixth_delta = delta / 6.0
let sa1: sa<_> = deriv_func pos speed
let sa2 = deriv_func (pos + half_delta * sa1.speed) (speed + half_delta * sa1.accel)
let sa3 = deriv_func (pos + half_delta * sa2.speed) (speed + half_delta * sa2.accel)
let sa4 = deriv_func (pos + delta * sa3.speed) (speed + delta * sa3.accel)
let pos1 = pos + sixth_delta*(sa1.speed + 2.0 * sa2.speed + 2.0 * sa3.speed + sa4.speed)
let speed1 = speed + sixth_delta*(sa1.accel + 2.0 * sa2.accel + 2.0 * sa3.accel + sa4.accel) in
ps<_>(pos1, speed1)

let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps: ps<_> = oc.Invoke(pos0, speed0, delta)
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0

let initial_states = [ for s in 1..1000 -> (float s * 1.0, 0.0) ]
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.025

let accel = fun pos speed -> (drag 1.0) pos speed + spring pos + gravity

let ef = (euler_implicit accel)
let rk4f = (runge_kutta_4 (adapt accel))

for i in 0..3 do
let (run_time, res) =
run_time_func (fun () ->
initial_states |> List.map(fun (x,y) ->
simulate ef t0 t_fin (0.25*delta) x y))
printfn "Euler single: %f" run_time.TotalSeconds;

let (run_time, res) =
run_time_func (fun () ->
initial_states |> map_parallel(fun (x,y) ->
simulate ef t0 t_fin (0.25*delta) x y))
printfn "Euler multi: %f" run_time.TotalSeconds;

let (run_time, res) =
run_time_func (fun () ->
initial_states |> List.map(fun (x,y) ->
simulate rk4f t0 t_fin delta x y))
printfn "RK4 single: %f" run_time.TotalSeconds;

let (run_time, res) =
run_time_func (fun () ->
initial_states |>
map_parallel(fun (x,y) ->
simulate rk4f t0 t_fin delta x y))
printfn "RK4 multi: %f" run_time.TotalSeconds;
done;

The "simulate" function now uses an optimized closure. Citing Jomo:
Closure optimization can be used when a function will be called many times. It allows F# to do a bit of expensive work up front that will later be amortized across the many calls to the function. Unfortunately, this is not something that you can stick everywhere and just get a benefit (otherwise, the compiler could just inject it for you). You should definitely profile before and after.

let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let ps: ps<_> = oc.Invoke(pos0, speed0, delta)
repeat t1 ps.pos ps.speed
repeat t0 pos0 speed0

My code made a rather extensive use of tuples, used for the returned values of "euler_implicit", "runge_kutta_4" and "adapt". Jomo has changed that to use structs instead. From what I understand, structs in F# are similar to structs in C#, and local variables of this type can be instantiated on the stack, instead of being created on the heap.
I'm still unsure about when to use structs and when not to, but I have noticed in my experiments that they do not always bring a performance boost. In other words, just replacing all your tuples (or other types I'll introduce later) is not always a good strategy.
The definition of the struct types is shown below.

type ps<'num> = struct
val pos:'num
val speed:'num
new(pos, speed) = { pos = pos; speed = speed }
end

type sa<'num> = struct
val speed:'num
val accel:'num
new(speed, accel) = { speed = speed; accel = accel }
end

This defines a struct "ps" parameterized by a type. "<'num>" declares the type parameter. Since the integrator in this tutorial is still limited to a single dimension, we could also have omitted the type parameter:

type ps = struct
val pos: float
val speed: float
new(pos, speed) = { pos = pos; speed = speed }
end

This defines a struct with two data members named pos and speed, and a constructor taking (oh surprise!) a position and a speed. The parts between curly braces means "create a value of type ps where the pos member is set to the parameter pos, and the speed member is set to the speed parameter".

Finally, Jomo also moved the creation of the integration functions out of the loop in the main program (which I obviously should have thought of...):

let ef = (euler_implicit accel)
let rk4f = (runge_kutta_4 (adapt accel))

let (run_time, res) =
run_time_func (fun () ->
initial_states |> List.map(fun (x,y) ->
simulate ef t0 t_fin (0.25*delta) x y))

The performance of the code has now increased to the numbers below.

 Implementation Performance Euler Performance RK4 Managed C++ 172413 326530 F# Tutorial 6 66225 111731 F# Tutorial 7 129032 185185 JF single 153846 160000 JF multi 297619 318471

For the next tutorial, I am considering taking this integrator to the third dimension, or to draw a graph showing the trajectory of a particle. I have yet to decide.

## Wednesday, September 17, 2008

### C++ version of Tutorial 6

Below is the code of the C++ version of the Euler and RK4 integrators.

// RK++NET.cpp : main project file.

#include "stdafx.h"

using namespace System;
#include <map>
#include <vector>
#include <iostream>

const double gravity = -9.81;

template<typename T>
inline
T spring(const T& pos)
{
return -100.0 * pos;
}

template<typename T>
inline
T drag(double k, const T& pos)
{
return -k * pos;
}

template<typename A>
class EulerImplicit
{
public:
EulerImplicit(const A& accel):
accel(accel)
{
}

template<typename T>
inline
void operator()(T& pos, T& speed, const double delta) const
{
speed += delta * accel(pos, speed);
pos += delta * speed;
}

private:
const A& accel;
};

template<typename A>
class RK4
{
public:
RK4(const A& accel):
accel(accel)
{
}

template<typename T>
inline
void operator()(T& pos, T& speed, const double delta) const
{
const double half_delta = 0.5 * delta;

std::pair<T, T> k1 = accel(pos, speed);
std::pair<T, T> k2 = accel(pos + half_delta * k1.first, speed + half_delta * k1.second);
std::pair<T, T> k3 = accel(pos + half_delta * k2.first, speed + half_delta * k2.second);
std::pair<T, T> k4 = accel(pos + delta * k3.first, speed + delta * k3.second);

pos += delta / 6.0 * (k1.first + 2.0 * (k2.first + k3.first) + k4.first);
speed += delta / 6.0 * (k1.second + 2.0 * (k2.second + k3.second) + k4.second);
}

private:
const A& accel;
};

template<typename T>
class Force
{
public:
Force() : m_drag(1.0)
{}

T operator()(const T& pos, const T& speed) const
{
return drag(m_drag, speed) + spring(pos) + gravity;
}

private:
const double m_drag;
};

template<typename A>
{
public:
accel(accel)
{
}

template<typename T>
inline std::pair<T, T> operator()(const T& pos, const T& speed) const
{
std::pair<T, T> ret;
ret.first = speed;
ret.second = accel(pos, speed);
return ret;
}

private:
const A& accel;
};

template<typename T, typename I>
inline
void simulate(const I& intg_func, T& pos, T& speed, double t0, const double t_fin, const double delta)
{
while(t0 < t_fin)
{
t0 += delta;
intg_func(pos, speed, delta);
}
}

typedef std::vector< std::pair<double, double> > Tuples;

int main(array<System::String ^> ^args)
{
const double t0 = 0.0;
const double t_fin = 1000.0;
const double delta = 0.025;

Tuples initial_states;
initial_states.reserve(1000);
for (int i=0; i<1000; ++i)
initial_states.push_back( std::make_pair((double)i, 0.0) );

Tuples states(initial_states);

System::DateTime start0 = System::DateTime::Now;
Force<double> accel;
EulerImplicit< Force<double> > euler(accel);
for (Tuples::iterator it = states.begin();
it != states.end();
++it)
{
simulate(euler, it->first, it->second, t0, t_fin, 0.25 * delta);
}
System::TimeSpan diff0 = System::DateTime::Now - start0;
std::cout << "Time: " << diff0.TotalSeconds;
for (int i = 0; i < 20; ++i)
{
std::cout << " state = (" << states.at(i).first
<< "; " << states.at(i).second << ")" << std::endl;
}

states = initial_states;
start0 = System::DateTime::Now;
for (Tuples::iterator it = states.begin();
it != states.end();
++it)
{
simulate(rk4, it->first, it->second, t0, t_fin, delta);
}
diff0 = System::DateTime::Now - start0;
std::cout << "Time: " << diff0.TotalSeconds;
for (int i = 0; i < 20; ++i)
{
std::cout << " state = (" << states.at(i).first
<< "; " << states.at(i).second << ")" << std::endl;
}

return 0;
}

## Monday, September 15, 2008

### Tutorial 7: Concurrency

It seems that every tutorial on concurrency has to cite The Free Lunch is Over: A Fundamental Turn Toward Concurrency in Software by Herb Sutter, and I won't make an exception. Basically, we are not going to get faster CPUs in the future. What we are going to get is more CPUs. The challenge for programmers is then to write software which can take advantage of the extra CPUs.

The code of this Tutorial shows how to achieve that in F#, and how easy it is.

#light

open System

let run_time_func (func: unit -> 'a) =
let time0 = System.DateTime.Now
let res = func ()
let diff0 = System.DateTime.Now - time0
(diff0, res)

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

let gravity =
-9.81

let inline spring pos =
let k = 100.0 in
-k * pos

let inline drag k pos speed = -k * speed

let inline euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
pos + delta * speed2, speed2

let inline adapt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
speed, accel

let inline runge_kutta_4 deriv_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)
let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)
let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)
let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)
let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in
pos1, speed1

let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta
repeat t1 pos1 speed1
repeat t0 pos0 speed0

let initial_states = [ for s in 1..1000 -> (float s * 1.0, 0.0) ]
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.025

let accel pos speed = (drag 1.0) pos speed + spring pos + gravity in
for i in 0..3 do
let (run_time, res) = run_time_func (fun () -> initial_states |>
List.map(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))
printfn "Euler single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |>
map_parallel(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y))
printfn "Euler multi: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |>
List.map(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))
printfn "RK4 single: %f" run_time.TotalSeconds;

let (run_time, res) = run_time_func (fun () -> initial_states |>
map_parallel(fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y))
printfn "RK4 multi: %f" run_time.TotalSeconds;
done;

There are two changes in this code compared to Tutorial 6. One of them makes it easier to measure execution times. The other is the introduction of asynchronous computations. I will start with the description of the latter.

let map_parallel func items =
seq {
for i in items -> async {
return (func i)
}
}

This defines a function "map_parallel" taking a function and a list of items as parameters. It is similar to "Array.map", which was used in Tutorial 6: it applies the function "func" on each item of the list, and produces a list of the results. What is new here is it does so not in a sequential manner, but in an asynchronous way.

The "map_parallel" function starts by defining a sequence of tasks. The syntax
seq { for i in items -> Expr }
produces a sequence of items whose value is the result of evaluating Expr for each item.

async { return (func i) }
This defines an asynchronous computation, i.e. a computation that can be executed asynchronously. At this point it may seem that actually executing all these computations in parallel and wait for their completion is a complicated thing to do. Happily, there are functions in the standard library of F# that handle all the tricky business of creating threads, starting them, distributing computations on these threads and waiting for their completion.

I don't fully understand what these two functions exactly do, I'm happy enough that they do what I want when composed.

All there is to do is replace the call to Array.map by parallel_map:
map_parallel(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y)

That's it, the performance numbers have now increased, running on a dual-core (although by less than a factor of two). Below is the updated performance table.

 Implementation Performance Euler Performance RK4 C++ 172413 326530 F# Tutorial 6 66225 111731 F# Tutorial 7 129032 185185

The rest is just utility code intended to make it easier to measure execution times.

let run_time_func (func: unit -> 'a) =
let time0 = System.DateTime.Now
let res = func ()
let diff0 = System.DateTime.Now - time0
(diff0, res)
This defines a function "run_time_func" which takes a function of zero argument and returns a value of some type "a". Notice the use of "unit" to denote "no argument", which is similar to the "void" type of C.
"func ()" invokes this function. "func" alone would have denoted the function itself, instead of calling it.

In order to prevent calling the "simulate" function in the main code, I have added "fun () -> ", thus wrapping the call to "simulate" inside a function taking no argument.
run_time_func (
fun () ->
initial_states |>
List.map(fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y)
)

It is this wrapper that is passed to "run_time_func", and its execution results in the call to "simulate".

The "run_time_func" function returns a tuple composed of the execution time and the result of the function call.

## Sunday, September 14, 2008

### Tutorial 6: Inlining

As usual, code first:
#light

open System

let gravity =
-9.81

let inline spring pos =
let k = 100.0 in
-k * pos

let inline drag k pos speed = -k * speed

let inline euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
pos + delta * speed2, speed2

let inline adapt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
speed, accel

let inline runge_kutta_4 deriv_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)
let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)
let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)
let pos1 = pos + delta/6.0 * (s1 + 2.0 * s2 + 2.0 * s3 + s4)
let speed1 = speed + delta/6.0 * (a1 + 2.0 * a2 + 2.0 * a3 + a4) in
pos1, speed1

let inline simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 pos0 speed0 =
if t0 > t_fin then (pos0, speed0)
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta
repeat t1 pos1 speed1
repeat t0 pos0 speed0

let initial_states = [| for s in [1..1000] -> (float s, 0.0) |]
let t0 = 0.0
let t_fin = 1000.0
let delta = 0.025

let accel pos speed = (drag 1.0) pos speed + spring pos + gravity in

let time0 = System.DateTime.Now
let final_states = initial_states |>
Array.map (fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y)
let diff0 = System.DateTime.Now - time0
printfn "Time: %f Result: %A" diff0.TotalSeconds final_states;

let time0 = System.DateTime.Now
let final_states = initial_states |>
Array.map (fun (x,y) -> simulate (runge_kutta_4 (adapt accel)) t0 t_fin delta x y)
let diff0 = System.DateTime.Now - time0
printfn "Time: %f Result: %A" diff0.TotalSeconds final_states



This code differs from the code in Tutorial 5 in the following ways:
- Some functions were declared "inline".
- The adapted function required by the RK4 integrator is constructed at the top level.
- The simulation is run on 1000 bodies instead of one.

The "inline" keyword in C++ instructs the compiler to emit the code of the function in place where it is called, instead of using a jump. I guess the effect is the same in F#. Which functions should be inlined is hard to tell. Small functions are obvious candidates. It also appears that F# automatically inlines some functions, even though they are not marked with "inline". For instance, "accel" was inlined inside the code for "runge_kutta_4". It was however not inlined inside "euler_implicit", which may explain why RK4 ended up faster than Euler (see the table further down).

The adapted function is now built once at the top level, instead of being repeatedly recreated inside "runge_kutta_4". It now seems obvious that this is the right way to do it, but if you look back to Tutorial 4 you may understand how I came to doing it the stupid way. Writing code in an incremental fashion, adding small bits of functionality after another often leads to suboptimal code, but that is hardly avoidable. This underlines how important it is for programming languages to support code refactoring in an easy and safe manner.

I modified the code to run a simulation of 1000 bodies instead of just one. My goal was to introduce concurrency, but I'll leave that for later. Here comes the explanations for the new code:
let initial_states = [| for s in [1..1000] -> (float s, 0.0) |]


If you are familiar with Python you will probably have recognized a comprehension. Comprehensions are also supported by F#. This code declares a variable "initial_states" which is an array of pairs of floats, where the first member denotes the initial position, and the second the initial speed. The new bodies are initially immobile and located one on top of another, separated by 1 meter.
Notice how the integer "s" must be explicitly converted to a float. In F#, integers cannot be multiplied by floats.

let final_states = initial_states |>
Array.map (fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y)

The array of initial states is passed to the Array.map function, which takes a function and applies it to each item in the array, effectively producing a new array.
These lines result in the application of the simulate function to each initial state.

The "|>" operator is just syntactical sugar. This code could have been written:
let final_states =
Array.map (fun (x,y) -> simulate (euler_implicit accel) t0 t_fin (0.25*delta) x y)
initial_states


At the end of Tutorial 5, we had managed to write code that was generic, concise and hopefully correct. Sadly, it was also rather slow. How much faster is it now? See the table below.

 Implementation Performance Euler Performance RK4 C++ 172413 326530 F# Tutorial 5 30156 32425 F# Tutorial 6 66225 111731 OCaml 65789 71428

Numbers in the performance columns are (numbers of seconds simulated) x (number of bodies) / (running time). They are an indication of how many computations per second each implementation achieved.
Note that the simulation based on Euler uses a time step four times smaller than that of RK4. I think that is fair, since Euler evaluates the acceleration function once each step, instead of four times for RK4.

The C++ implementation is generic (using templates, no virtual method was used) and uses side-effects. I also had a non-generic version which was about twice as fast, but I deleted it (stupid me).
The OCaml implementation is almost identical to the code in Tutorial 5, which means it is both generic and pure (no side effects).
The good performance of RK4 in F# is somewhat artificial and probably untypical of a real application. Indeed, the acceleration function used in this example is excessively simple.

The C++ code is 186 lines long (including empty lines), the F# code is 60 lines long. I also find the F# code much more readable. Compare with the code of RK4 in C++
template<typename A>
class RK4
{
public:
RK4(const A& accel):
accel(accel)
{
}

template<typename T>
inline
void operator()(T& pos, T& speed, const double delta) const
{
const double half_delta = 0.5 * delta;

std::pair k1 = accel(pos, speed);
std::pair k2 = accel(pos + half_delta * k1.first, speed + half_delta * k1.second);
std::pair k3 = accel(pos + half_delta * k2.first, speed + half_delta * k2.second);
std::pair k4 = accel(pos + delta * k3.first, speed + delta * k3.second);

pos += delta / 6.0 * (k1.first + 2.0 * (k2.first + k3.first) + k4.first);
speed += delta / 6.0 * (k1.second + 2.0 * (k2.second + k3.second) +     k4.second);
}

private:

const A& accel;
};


What should we conclude? I personally think that the 3x performance penalty of F# is acceptable, considering how uncompromising my F# code is regarding purity and genericity. It would be interesting to see how fast the F# code rewritten to use side effects would run.
This little experiment was enough to convince me to move over to F#. I stuck to C++ because of its support for generic programming (using templates), static type checking and for the large amount of code available. F# supports generic programming using a much cleaner syntax, its IDE performs type checks while the code is being typed in, and it has access to the .NET framework.
A last interesting fact to report: I wrote the generic C++ code after I wrote the F# code, without thinking very much. It was easy to translate the F# code to C++. It would have been considerably harder to write the C++ code from scratch. As a matter of fact, I had no clear idea how to implement RK4 in a generic way before I tackled the problem "thinking" in F#. UPDATE (may 2012): The numbers for C++ were actually for managed C++. After further optimizations (see tutorial 7b), F# went ahead of the managed C++ version. I later wrote a native C++ version which if I remember correctly was about 10 times faster.

## Friday, September 12, 2008

### Interlude: Disassembling F# code

No new code in this tutorial. Instead, I thought I would introduce .NET reflector (if you don't already know it). This tools allows to examine any .NET assembly to see what classes and functions it contains. It can also disassemble code into C#.

To refresh our memory, this was the F# code of "simulate":
let simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 t_fin pos0 speed0 =
if t0 >= t_fin then pos0, speed0
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta in
repeat t1 t_fin pos1 speed1
repeat t0 t_fin pos0 speed0


The disassembled code in C# looks approximately as follows (I have simplified and cleaned up the text a bit):
internal static Tuple
simulate@37(FastFunc intg_func, double t0, double t_fin, double delta, double pos0, double speed0)
{
FastFunc f = new clo@39(intg_func, delta);
return FastFunc.InvokeFast4(f, t0, t_fin, pos0, speed0);
}


"clo@39" must be some sort of wrapper for the "repeat" inner function I defined in "simulate". Notice how the generated code handles "intg_func" and "delta" differently from the other variables "t0", "t_fin", "pos0" and "speed0". The former are passed once during the construction of "clo@39", the latter are passed as arguments when invoking the "repeat" function.
Looking back to the F# code, notice that "intg_func" and "delta" indeed are special. Although they are used by "repeat", they are not parameters of it. This is called a closure. The reason for using a closure will become clear if you continue reading.

Let us look at "clo@39". It has an "Invoke" method which is called when "repeat" is called in the F# code:
public override Tuple Invoke(double t0, double t_fin, double pos0, double speed0)
{
while (t0 < t_fin)
{
double num = t0 + this.delta;
Tuple tuple = FastFunc.InvokeFast3(this.intg_func, pos0, speed0, this.delta);
speed0 = tuple.Item2;
pos0 = tuple.Item1;
t_fin = t_fin;
t0 = num;
}
return new Tuple(pos0, speed0);
}

I was not surprised when I noticed it, but the F# compiler has turned my recursive function "repeat" into a non-recursive loop. This is an important optimization that most functional languages support. All tail-recursive functions can easily be turned into non-recursive functions, which saves a lot of stack space.

The result would be beautiful, if it were not for this strange thing: "t_fin = t_fin;". I can see that the original F# code does indeed copy "t_fin" every iteration, and in that sense the generated code remains faithful to the original code. I wish F# had optimized this unnecessary copying away, but it probably has its reasons for not doing so. Happily, there is a work-around, which is to use a closure. This is precisely what I did for "intg_func" and "delta".
One might even say that this work-around is more than that, it actually is the right way of doing things. Constant parameters should not be passed explicitly as parameters to recursive functions.

Let us continue and look into "runge_kutta_4". First, the F# code from the previous tutorial:
let runge_kutta_4 accel_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
...


The generated code, translated to C#:
internal static Tuple runge_kutta_4@26(FastFunc accel_func, double pos, double speed, double delta)
{
double num = 0.5 * delta;
FastFunc f = new clo@28(accel_func);
Tuple tuple = FastFunc.InvokeFast2(f, pos, speed);
...

Nothing really suprising here, but I just took notice of how the adapted variant of "accel_func" is recreated in each call. That is obviously not very efficient, considering how often this function is called.

In the next tutorial, I will present an optimized version of the code and compare it to implementations in C++ and OCaml.

## Thursday, September 11, 2008

### Tutorial 5: Naive RK4

This tutorial shows an implementation of the Runge-Kutta (order 4). The entire code follows below, followed by explanations on the new features since Tutorial 4.

#light

open System

let gravity =
-9.81

let spring pos =
let k = 10000.0 in
-k * pos

let drag k =
fun pos speed -> -k * speed

let euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
pos + delta * speed2, speed2

let adapt (f: 'num -> 'num -> 'num) =
fun pos speed ->
let accel = f pos speed in
speed, accel

let runge_kutta_4 accel_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)
let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)
let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)
let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)
let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in
pos1, speed1

let simulate intg_func t0 t_fin delta pos0 speed0 =
let rec repeat t0 t_fin pos0 speed0 =
if t0 >= t_fin then pos0, speed0
else
let t1 = t0 + delta
let pos1, speed1 = intg_func pos0 speed0 delta in
repeat t1 t_fin pos1 speed1
repeat t0 t_fin pos0 speed0

let pos = 0.0
let speed = 0.0
let t0 = 0.0
let t_fin = 500000.0
let delta = 0.025

let accel = fun pos speed -> (drag 100.0) pos speed + spring pos + gravity in
let time0 = System.DateTime.Now
let (pos_fin, speed_fin) = simulate (euler_implicit accel) t0 t_fin delta pos speed
let diff0 = System.DateTime.Now - time0
printfn "%f pos = %f speed = %f" diff0.TotalSeconds pos_fin speed_fin;

let time0 = System.DateTime.Now
let pos_fin, speed_fin = simulate (runge_kutta_4 accel) t0 t_fin delta pos speed
let diff0 = System.DateTime.Now - time0
printfn "%f pos = %f speed = %f" diff0.TotalSeconds pos_fin speed_fin



Before diving into the explanations, let us have a look at the definition of Runge-Kutta 4, according to wikipedia:
Let an initial value problem be specified as follows.
$y' = f(t, y), \quad y(t_0) = y_0.$

Then, the RK4 method for this problem is given by the following equations:

\begin{align} y_{n+1} &= y_n + {h \over 6} \left(k_1 + 2k_2 + 2k_3 + k_4 \right) \\ t_{n+1} &= t_n + h \\ \end{align}

where yn + 1 is the RK4 approximation of y(tn + 1), and

\begin{align} k_1 &= f \left( t_n, y_n \right) \\ k_2 &= f \left( t_n + {h \over 2}, y_n + {h \over 2} k_1\right) \\ k_3 &= f \left( t_n + {h \over 2}, y_n + {h \over 2} k_2\right) \\ k_4 &= f \left( t_n + h, y_n + h k_3 \right) \\ \end{align}
In the context of this tutorial, function "f" can be seen as a function computing the speed and the acceleration, depending on the current time "t" and the current position and speed "y". Actually, in this tutorial, the forces do not depend on the current time, and the first parameter of "f" is therefore omitted.
Using a functional language like F# to implement Runge-Kutta 4 (RK4 for short) is particularly fitting, as the function "f" is an input to the algorithm. It seems easy to implement this algorithm using a function taking a function "f", and indeed it is:
let runge_kutta_4 accel_func pos speed delta =
let half_delta = 0.5 * delta
let s1, a1 = deriv_func pos speed
let s2, a2 = deriv_func (pos + half_delta * s1) (speed + half_delta * a1)
let s3, a3 = deriv_func (pos + half_delta * s2) (speed + half_delta * a2)
let s4, a4 = deriv_func (pos + delta * s3) (speed + delta * a3)
let pos1 = pos + delta/6.0*(s1 + 2.0*s2 + 2.0*s3 + s4)
let speed1 = speed + delta/6.0*(a1 + 2.0*a2 + 2.0*a3 + a4) in
pos1, speed1

There is a small catch here: The main program builds an "accel_func" which returns a single number denoting the acceleration, where "runge_kutta_4" expects a function returning a pair denoting the speed and the acceleration. This small problem is solved using the "adapt" function:

let adapt (f: 'num -> 'num -> 'num) =


"adapt" is a function taking a parameter "f". To make things easier to understand for the reader (and for the writer too:), a type annotation is used. The type annotation means "a function taking some type num, and returning a function taking the same type num, which returns a value of type num", or in other words, "a function taking two parameters of type num and returning a value of type num". The single quote before "num" means that num is a type variable, or in C++ terminology, a template parameter. In these tutorials I have used one-dimensional floats, but a typical 3d game would use a 3-dimensional vector. "float -> float -> float" would also have worked, but would be less generic. I could also have left out the type annotation altogether, and let the F# compiler figure out the types.

fun pos speed ->
let accel = f pos speed in
speed, accel

I hope no one is getting headaches yet :) "adapt" returns a function taking a position and a speed and returning the speed and the acceleration which is computed by calling the "f" function. "adapt" is indeed an adapter, but instead of working with values, as one would probably do in non-functional languages, "adapt" manipulates functions directly.

let euler_implicit accel_func pos speed delta =
let speed2 = speed + delta * accel_func pos speed in
pos + delta * speed2, speed2

Not much new here, just notice how the "accel" parameter was renamed to "accel_func". "euler_implicit" has been modified in this installment to have the same signature as "runge_kutta_4".

let (pos_fin, speed_fin) = simulate (euler_implicit accel) t0 t_fin delta pos speed

Here is the beautiful bit: the main program calls simulate, passing the integration method ("euler_implicit"). The simulation function is generic enough to work independently of the integration method.
Notice also how "(euler_implicit accel)" is passed as an argument, which is a function built by "plugging" the "accel" function into "euler_implicit". The "simulate" function never sees the "accel" function.

This tutorial also shows how to time the execution of the program using a function from the .NET framework. I have implemented RK4 in C++ in order to compare the execution times. I was disappointed at first, seeing how the C++ implementation outperformed the F# version by a factor of 10.

This is however not the last episode in this saga. Will F# make a last-minute save and catch up with C++ ?

### Tutorial 4: Manipulating functions

As usual, let me show you the code first.

#light

open System

let gravity =
-9.81

let spring pos =
let k = 10.0 in
-k * pos

let drag k =
fun speed -> -k * speed

let euler_implicit accel pos speed delta =
let speed2 = speed + delta * accel in
pos + delta * speed2, speed2

let rec simulate t0 t_fin delta pos0 speed0 accel_func =
if t0 >= t_fin then pos0, speed0
else
let t1 = t0 + delta
let accel = accel_func pos0 speed0
let pos1, speed1 = euler_implicit accel pos0 speed0 delta in
simulate t1 t_fin delta pos1 speed1 accel_func

let pos = 0.0
let speed = 0.0
let t0 = 0.0
let t_fin = 10.0
let delta = 1e-3

let accel = fun pos speed -> (drag 1.0) speed + spring pos + gravity

let pos_fin, speed_fin = simulate t0 t_fin delta pos speed accel
printfn "pos = %3.3f\nspeed = %3.3f\n" pos_fin speed_fin;



The following portions of text are new in this fourth tutorial.

let spring pos =
let k = 10.0 in
-k * pos

This defines a function "spring" which takes a position and returns an acceleration.
"k" is a constant denoting the stiffness of the spring. Having a constant stiffness is not very practical, but it shall do for now.

let drag k =
fun speed -> -k * speed

The first line defines a function "drag" which takes a single argument "k", denoting the drag coefficient. The second line demonstrates one of the nice features of functional programming languages: it defines an unnamed function taking a speed and returning a speed, all in one line. This function is itself returned by the drag function. In F# functions are no different than other values, and can be returned from functions. "drag" can be seen as a "function factory". You give it a drag coefficient, and it returns a function similar in nature to the "spring" function above.

let rec simulate t0 t_fin delta pos0 speed0 accel_func =

Notice how the last argument was renamed from "accel" in Tutorial 3 to "accel_func".
It is now expected that "accel_func" should be a function. There is nothing in the syntax that tells the F# compiler about that, the "_func" suffix is just a naming convention intended to users of "simulate".

let accel = accel_func pos0 speed0

This line calls accel_func, passing the position and the speed. The value returned is an acceleration which is passed to "euler_implicit".

let accel = fun pos speed -> (drag 1.0) speed + spring pos + gravity

This is just another way of defining a function. I could just as well have written "let accel pos speed = ...".
This function "accel" takes a position and a speed, and returns the sum of three accelerations:
- a drag, with coefficient 1.0,
- a spring (whose hidden, constant stiffness was defined to 10.0), and
- gravity.

let pos_fin, speed_fin = simulate t0 t_fin delta pos speed accel

The "accel" function is then passed to "simulate", which will use it to sample acceleration each time step.

ignore(Console.ReadKey(true))

Notice the "ignore" function, which was not present in the previous tutorial. Since F# main programs are supposed to return nothing (which in F# is of type "unit"), and since Console.ReadKey returns something, "ignore" is used to avoid a warning by the F# compiler.

This concludes this fourth tutorial, in which we saw that functions could be created dynamically, returned from and passed to functions, just as any other value (e.g. floats).
In the next tutorial we will see an implementation of the famous Runge-Kutta 4 method of solving differential equations.

## Saturday, September 6, 2008

### Tutorial 3: Looping

The code for this tutorial is similar to the previous'.

#light

open System

let gravity = -9.81

let euler_implicit accel pos speed delta =
let speed2 = speed + delta * accel in
pos + delta * speed2, speed2

let rec simulate t0 t_fin delta pos0 speed0 accel =
if t0 >= t_fin then pos0, speed0
else
let t1 = t0 + delta in
let pos1, speed1 = euler_implicit accel pos0 speed0 delta in
simulate t1 t_fin delta pos1 speed1 accel

let pos = 0.0 in
let speed = 0.0 in
let t0 = 0.0 in
let t_fin = 10.0 in
let delta = 1e-3 in

let pos_fin, speed_fin = simulate t0 t_fin delta pos speed gravity in
printfn "pos = %3.3f\nspeed = %3.3f\n" pos_fin speed_fin;



Here come the explanations, focused on the new function "simulate":

let rec simulate t0 t_fin delta pos0 speed0 accel =


This defines a recursive function named "simulate". Recursion is used as a substitute for looping. I will not explain recursion here. It is simple in essence, but can be hard to grasp at first for someone used to think in C. Note that "rec" in "let rec simulate" is used to tell the compiler that "simulate" calls itself. Follows a list of parameters:
• t0 is the initial time when the simulation starts,
• t_fin is the final time when the simulation finishes,
• pos0 is the initial position,
• speed0 is the initial speed,
• accel is the constant acceleration.

if t0 >= t_fin then pos0, speed0


If the initial time is after the final time, the function directly returns the initial position and speed.

else
let t1 = t0 + delta in
let pos1, speed1 = euler_implicit accel pos0 speed0 delta in


Otherwise, the integration function is called to compute the position and the speed at t0 + delta...

simulate t1 t_fin delta pos1 speed1 accel

... and "simulate" is called recursively, with the initial time, position and speed set to the values that were just computed.

Here is the output of the program:
pos = -490.647
speed = -98.110


We have seen how to simulate loops using recursion and if-then-else conditions. This is an alternative to using a while loop. Notice that F# has no problem handling the deep recursion. Thanks to the optimizations supported by the F# compiler, recursion can be used heavily by programmers without having to worry about running out of stack space. There is a catch, though: the recursive call must be the last instruction in the function.

In the next tutorial, we will see how to simulate spring and dampers. This will take advantage of the fact that functions in F# are "first-class citizen" and can be used as any other kind of parameter.

### Tutorial 2: Simple functions

Now that we know how to print text, we will see in this second tutorial how to compute things worth printing.
#light

open System

let gravity = -9.81

let euler_explicit accel pos speed delta =
pos + delta * speed, speed + delta * accel

let euler_implicit accel pos speed delta =
let speed2 = speed + delta * accel in
pos + delta * speed2, speed2

let pos, speed = euler_explicit gravity 0.0 0.0 1.0 in
printfn "pos = %f speed = %f\n" pos speed

let pos, speed = euler_implicit gravity 0.0 0.0 1.0 in
printfn "pos = %f speed = %f\n" pos speed



Now come the explanations for each new line in this tutorial.

let gravity = -9.81

Declares a constant representing the acceleration due to gravity.

let euler_explicit accel pos speed delta =
pos + delta * speed, speed + delta * accel

A first version of Euler integration. A C programmer would have written it:
    pos += delta * speed;
speed += delta * accel;

The F# and the C version differ notably in the way they handle the newly computed data. The C code uses variables which are changed during the computation, while the F# version computes new values which are returned, leaving the original values unchanged. Neither F# nor C force you to do it that way, but this is the natural way to do it in these languages.
The functional way demonstrated in the F# code is more flexible, because the old values are not lost. Should we later realize we need them, we will be able to use them without refactoring the code. It also clarifies the role of each variable, since the content of each variable never changes.
The procedural way, shown in the C snipplet, is likely more efficient, at least on current processors. Side-effect-free computing easily takes advantage of concurrency, meaning that the F# way may well become more efficient in practice when (if?) we get processors with a large number of cores.

Back to the F# code:
let euler_implicit pos speed delta =
let speed2 = speed + delta * accel in
pos + delta * speed2, speed2

A variant of Euler integration popular among game developers. The position is updated using the speed at the next time step. Notice the definition of "speed2", which is used as a shorthand for the speed at the next time step. The scope of "speed2" is limited to the expression following "in". Again, the C counterpart would be:
    speed += delta * accel;
pos += delta * speed;


Continuing with the F# code:
let pos, speed = euler_explicit gravity 0.0 0.0 1.0 in
printfn "pos = %f speed = %f\n" pos speed

let pos, speed = euler_implicit gravity 0.0 0.0 1.0 in
printfn "pos = %f speed = %f\n" pos speed

Finally, these lines call each integration method with null initial positions and speeds. The value of the time step is quite large in this example (1 second), we will see in the next tutorial how to incrementally compute the position and speed using a smaller time step.

The output of the program should look like this:
pos = 0.000000 speed = -9.810000

pos = -9.810000 speed = -9.810000


Note that the position is not quite correct, but that was expected, given the large integration step.

We have just learned how to define and use constants and functions. Note that the syntax is the same in both cases, which is not shocking. Indeed, constants can be seen as a functions of zero parameters. In the next tutorial we will look at a few constructs to control the flow of the program.

### Tutorial 1: Hello World

Following the tradition, here is the code for a program printing "Hello world" in F#.
#light
open System
printfn "Hello world\n"


Below follows an explanation of each line.

#light

This tells the compiler we are using the lightweight syntax. Apparently the syntax of F# comes in several flavors. This syntax works well for me.

open System

This includes a library containing the function called at the end to wait for a key press.

printfn "Hello world\n"

Prints Hello world on the console.

Console.ReadKey(true)

Waits for a key press.

That was it for now. We have learned the bare minimum to do some simple text I/O. In the next tutorial we will see how to use functions to implement the Euler integration method.

### Introduction

This blog will follow my steps toward learning F#. I am hoping to use F# and XNA to write small games for the XBox 360. I have no idea yet on the feasibility of that, but if it works, it would open a whole world of fun opportunities.

I would normally not write a tutorial about something I don't know, but seeing the current lack of tutorials for F#, it might be beneficial for others to follow my progress. (UPDATE: Gosh, I can sure sound pretentious sometimes... the tutorials that do exist are pretty good. See for instance F# in 20 minutes. Comments pointing at other tutorials are welcome.)

I will start with a series of tutorials dedicated to implementing a Runge-Kutta integrator.

I will not explain how to install and set up an F# development environment, that information is available on the net. Personally, I use MS Visual Studio 2008 Shell, which is supported by F# out of the box. Let me just say that I am very impressed by this IDE. Errors such as type mismatches are caught as I type my code, and a simple explanation is given when I hover over the faulty section with the mouse.