Skip to content

Commit 846e6b5

Browse files
WIP: Hacky(!) but faster nbody implementations
1 parent 175c335 commit 846e6b5

File tree

2 files changed

+492
-0
lines changed

2 files changed

+492
-0
lines changed

nbody/nbody_unsafe_simd.jl

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
# Based on https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/nbody-rust-7.html
2+
3+
module NBody
4+
5+
using StaticArrays, SIMD, Printf
6+
using Base: llvmcall
7+
8+
# Mass scale: the per-body masses built in `main` are multiples of this value.
const solar_mass = 4π^2
# Scale factor applied to the tabulated initial velocities in `init_bodies!`.
const days_per_year = 365.24
# Number of simulated bodies.
const NBODIES = 5
# Number of unordered body pairs; integer division keeps this an exact `Int`.
const NPAIRS = NBODIES * (NBODIES - 1) ÷ 2
# All index pairs (i, j) with i < j. The generator iterates `i` fastest, so the
# order is (1,2), (1,3), (2,3), (1,4), ... — `advance` relies on this tuple for
# both pair-indexed buffers, so the order must stay stable.
const PAIRS = Tuple((i, j) for i = 1:NBODIES, j = 1:NBODIES if j > i)
13+
14+
# State of the simulated system.
#
# Fields:
#   x - NBODIES×3 mutable matrix; row i holds the three coordinates of body i
#   v - NBODIES×3 mutable matrix; row i holds the three velocity components
#   m - one mass per body
#
# The matrix fields spell out all four `MMatrix` parameters (including the
# total length) so that the field types are concrete; leaving the length out
# makes the field an abstract `UnionAll` and every access type-unstable.
struct Bodies
    x::MMatrix{NBODIES, 3, Float64, NBODIES * 3}
    v::MMatrix{NBODIES, 3, Float64, NBODIES * 3}
    m::NTuple{NBODIES, Float64}
end
19+
20+
# @const_unroll for i = <bounds> ... end
#
# Fully unroll a `for` loop at macro-expansion time. The iteration bounds are
# evaluated while the macro expands, so they must be computable from global
# constants. Each iteration becomes a `let` block that binds the loop
# variable(s) to one concrete value and then runs a copy of the body.
#
# With several iteration specs (`for i = a, j = b`) the combinations are
# produced in `Iterators.product` order, i.e. the FIRST variable varies
# fastest. This differs from an ordinary multi-variable `for` loop, where the
# last variable varies fastest.
macro const_unroll(for_loop)
    (for_loop isa Expr && for_loop.head === :for) ||
        throw(ArgumentError("@const_unroll expects a `for` loop"))
    spec, body = for_loop.args

    # `for i = a, j = b` stores its iteration specs in a :block expression;
    # a single spec is a bare `i = a` expression.
    specs = spec.head === :block ? spec.args : Any[spec]
    syms = [s.args[1] for s in specs]
    # Evaluate the bounds in the caller's module so that constants defined
    # there resolve even when the macro is used from another module.
    bounds = [Core.eval(__module__, s.args[2]) for s in specs]

    unrolled = Expr[]
    for vals in Iterators.product(bounds...)
        # QuoteNode makes the value be spliced in as data, never as code.
        bindings = [Expr(:(=), esc(sym), QuoteNode(val)) for (sym, val) in zip(syms, vals)]
        push!(unrolled, Expr(:let, Expr(:block, bindings...), esc(body)))
    end

    return Expr(:block, unrolled...)
end
38+
39+
# Fill `bodies.x` and `bodies.v` with the initial state of the five bodies
# (rows 1-5: Sun, Jupiter, Saturn, Uranus, Neptune).
#
# The tabulated velocities are multiplied by `days_per_year` when stored.
# Values are written element by element, so no temporary arrays are created.
# Returns `bodies`.
function init_bodies!(bodies)
    x, v = bodies.x, bodies.v

    # One (position, unscaled velocity) entry per body, in row order.
    initial_state = (
        # Sun
        (
            (0.0, 0.0, 0.0),
            (0.0, 0.0, 0.0),
        ),
        # Jupiter
        (
            (4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01),
            (1.66007664274403694e-03, 7.69901118419740425e-03, -6.90460016972063023e-05),
        ),
        # Saturn
        (
            (8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01),
            (-2.76742510726862411e-03, 4.99852801234917238e-03, 2.30417297573763929e-05),
        ),
        # Uranus
        (
            (1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01),
            (2.96460137564761618e-03, 2.37847173959480950e-03, -2.96589568540237556e-05),
        ),
        # Neptune
        (
            (1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01),
            (2.68067772490389322e-03, 1.62824170038242295e-03, -9.51592254519715870e-05),
        ),
    )

    for (i, (pos, vel)) in enumerate(initial_state)
        for d in 1:3
            x[i, d] = pos[d]
            # The Sun's zero velocity is unaffected by the scaling.
            v[i, d] = vel[d] * days_per_year
        end
    end

    return bodies
end
93+
94+
# Tuples of `VecElement`s, used as the argument and return types of the LLVM
# intrinsic wrappers defined in this file.
const __m128 = NTuple{4,VecElement{Float32}}    # four Float32 lanes
const __m128d = NTuple{2,VecElement{Float64}}   # two Float64 lanes
# SIMD.jl vector holding two Float64 lanes; `advance` processes pairs with it.
const v2d = Vec{2,Float64}
97+
98+
# Approximate lane-wise reciprocal square root of a two-lane Float64 vector.
# Unwraps the raw lane tuple, runs it through `rsqrt_ccall`, and rewraps the
# result as a `v2d`.
@inline function rsqrt_pd(v2::v2d)
    lanes = v2.elts
    approx = rsqrt_ccall(lanes)
    return v2d(approx)
end
101+
102+
# Reciprocal square root of `v2`, refined from the approximate `rsqrt_pd`
# result with a single Newton step:
#
#     y_new = 1.5*y - (0.5*v2*y) * (y*y)
#
# Per the original author's note, one step is enough for the accuracy needed
# here.
@inline function rsqrt_pd_newton(v2::v2d)
    y = rsqrt_pd(v2)
    half_v2 = 0.5 * v2
    return y * 1.5 - (half_v2 * y) * (y * y)
end
108+
109+
# Thin wrapper around the LLVM intrinsic `llvm.x86.sse.rsqrt.ps`, applied to a
# four-lane Float32 vector.
function rsqrt(f::__m128)
    return ccall("llvm.x86.sse.rsqrt.ps", llvmcall, __m128, (__m128,), f)
end
110+
# Thin wrapper around the LLVM intrinsic `llvm.x86.sse2.cvtpd2ps`: takes a
# two-lane Float64 vector and returns a four-lane Float32 vector.
function _mm_cvtpd_ps(f::__m128d)
    return ccall("llvm.x86.sse2.cvtpd2ps", llvmcall, __m128, (__m128d,), f)
end
111+
# Widen the two low Float32 lanes of `f` to a two-lane Float64 vector.
#
# The IR is passed as a plain string (function-level IR). The older
# `("", ir)` tuple form meant (declarations, ir) on early Julia 1.x, but on
# Julia >= 1.6 a 2-tuple is read as (module IR, entry-point name), so the
# tuple form no longer works there. With no declarations needed, the plain
# string is equivalent and valid on both.
function _mm_cvtps_pd(f::__m128)
    return llvmcall(
        """
        %2 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
        %3 = fpext <2 x float> %2 to <2 x double>
        ret <2 x double> %3
        """,
        __m128d,
        Tuple{__m128},
        f,
    )
end
117+
# Approximate reciprocal square root of two Float64 lanes, computed by
# narrowing to Float32 (`_mm_cvtpd_ps`), applying `rsqrt`, and widening the
# result back to Float64 (`_mm_cvtps_pd`).
@inline function rsqrt_ccall(f::__m128d)
    singles = _mm_cvtpd_ps(f)
    approx = rsqrt(singles)
    return _mm_cvtps_pd(approx)
end
118+
119+
# Advance the system by one time step `dt`, updating `x` and `v` in place.
#
# Arguments:
#   x, v  - NBODIES×3 position and velocity matrices (mutated)
#   m     - per-body masses
#   dt    - time step
#   dx    - NPAIRS×3 scratch buffer; receives x[i, :] - x[j, :] for each pair
#   dmag  - NPAIRS scratch buffer; receives dt / |dx|^3 for each pair
#
# Steps:
#   1. For every pair in `PAIRS`, two pairs per SIMD vector, compute the
#      separation and dt / distance^3, storing both through raw pointers.
#   2. Apply the pairwise velocity updates.
#   3. Move every body by dt * v.
#
# The pairwise stage handles pairs two at a time, so it assumes NPAIRS is
# even. Returns `nothing`.
@inline function advance(
    x::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
    v::MMatrix{NBODIES, 3, Float64, NBODIES * 3},
    m::NTuple{NBODIES, Float64},
    dt::Float64,
    dx::MMatrix{NPAIRS, 3, Float64, NPAIRS * 3},
    dmag::MVector{NPAIRS, Float64},
)
    # The raw pointers below alias `dx` and `dmag`; GC.@preserve keeps both
    # objects rooted for as long as the pointers are in use.
    GC.@preserve dx dmag begin
        dmag_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dmag))
        dx_v2d_ptr = Base.unsafe_convert(Ptr{v2d}, pointer_from_objref(dx))

        # Calculate distances and store them two pairs at a time.
        @inbounds for k1 = 1:2:length(PAIRS)
            k2 = k1 + 1
            k_v2d = k2 ÷ 2          # index of this pair-of-pairs, in v2d units

            i1, j1 = PAIRS[k1]
            i2, j2 = PAIRS[k2]

            dx1 = v2d((x[i1, 1], x[i2, 1])) - v2d((x[j1, 1], x[j2, 1]))
            dx2 = v2d((x[i1, 2], x[i2, 2])) - v2d((x[j1, 2], x[j2, 2]))
            dx3 = v2d((x[i1, 3], x[i2, 3])) - v2d((x[j1, 3], x[j2, 3]))
            # These offsets treat `dx` as column-major: each of its three
            # columns spans NPAIRS Float64s, i.e. NPAIRS ÷ 2 v2d slots.
            unsafe_store!(dx_v2d_ptr, dx1, k_v2d)
            unsafe_store!(dx_v2d_ptr, dx2, k_v2d + NPAIRS ÷ 2)
            unsafe_store!(dx_v2d_ptr, dx3, k_v2d + NPAIRS)

            dsq = dx1^2 + dx2^2 + dx3^2
            drsqrt = rsqrt_pd_newton(dsq)
            # dt * (1 / sqrt(dsq)) / dsq == dt / distance^3
            mag = dt * drsqrt / dsq
            unsafe_store!(dmag_v2d_ptr, mag, k_v2d)
        end
    end

    # Apply the pairwise velocity updates from the buffers filled above.
    @inbounds for k = 1:length(PAIRS)
        i, j = PAIRS[k]

        dmag_i = dmag[k] * m[i]
        dmag_j = dmag[k] * m[j]
        for d = 1:3
            dx_k = dx[k, d]
            v[i, d] -= dx_k * dmag_j
            v[j, d] += dx_k * dmag_i
        end
    end

    # Move every body; both loops are fully unrolled at expansion time.
    @inbounds begin
        @const_unroll for i = 1:NBODIES
            @const_unroll for d = 1:3
                x[i, d] += dt * v[i, d]
            end
        end
    end

    return nothing
end
172+
173+
# Total energy of the system: the sum over bodies of 0.5 * m * |v|^2, minus
# m_i * m_j / distance for every pair i < j.
function energy(bodies)
    pos, vel, mass = bodies.x, bodies.v, bodies.m
    total = 0.0
    for i in 1:NBODIES
        vel_i = vel[i, :]
        total += 0.5 * mass[i] * sum(vel_i .^ 2)
        for j in (i + 1):NBODIES
            sep = pos[i, :] - pos[j, :]
            dist = sqrt(sum(sep .* sep))
            total -= (mass[i] * mass[j]) / dist
        end
    end
    return total
end
186+
187+
# Set the velocity of body 1 to minus the total momentum of all bodies divided
# by `solar_mass`.
#
# Each component is accumulated in a scalar `Float64`, so the accumulator
# never changes type and no temporary arrays are created. The sum includes
# body 1 itself, exactly as before. Returns `bodies`.
function init_sun!(bodies)
    v, m = bodies.v, bodies.m
    for d in 1:3
        p = 0.0
        for i in 1:NBODIES
            p += v[i, d] * m[i]
        end
        # Component d of row 1 is written only after its own sum is complete,
        # and the other components' sums never read it.
        v[1, d] = -p / solar_mass
    end
    return bodies
end
194+
195+
# Run the simulation for `iterations` steps of size 0.01, printing the total
# energy (9 decimal places) before and after.
#
# Buffer lengths are derived from `NBODIES` / `NPAIRS` rather than written as
# literals, so they stay consistent with the signature of `advance`.
function main(iterations::Int64)
    x = zeros(MMatrix{NBODIES, 3, Float64, NBODIES * 3})
    v = zeros(MMatrix{NBODIES, 3, Float64, NBODIES * 3})
    m = NTuple{NBODIES, Float64}((
        1.0,
        9.54791938424326609e-04,
        2.85885980666130812e-04,
        4.36624404335156298e-05,
        5.15138902046611451e-05,
    ) .* solar_mass)
    # `bodies` holds the same `x` / `v` objects that `advance` mutates below.
    bodies = Bodies(x, v, m)

    init_bodies!(bodies)
    init_sun!(bodies)
    @printf("%.9f\n", energy(bodies))

    # Scratch buffers reused by every step.
    dx = zeros(MMatrix{NPAIRS, 3, Float64, NPAIRS * 3})
    dmag = zeros(MVector{NPAIRS, Float64})
    for _ = 1:iterations
        advance(x, v, m, 0.01, dx, dmag)
    end

    @printf("%.9f\n", energy(bodies))
    return nothing
end
222+
223+
end
224+
225+
# Script entry point: the iteration count is the first command-line argument.
isempty(ARGS) && error("usage: julia nbody_unsafe_simd.jl <iterations>")
# Run twice: the first timing includes JIT compilation, the second does not.
@time NBody.main(parse(Int64, ARGS[1]))
@time NBody.main(parse(Int64, ARGS[1]))
227+
# using StaticArrays, InteractiveUtils
228+
# code_native(nb.advance,
229+
# (MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
230+
# MMatrix{nb.NBODIES, 3, Float64, nb.NBODIES * 3},
231+
# NTuple{nb.NBODIES, Float64},
232+
# Float64,
233+
# MMatrix{nb.NPAIRS, 3, Float64, nb.NPAIRS * 3},
234+
# MVector{nb.NPAIRS, Float64}))

0 commit comments

Comments
 (0)