visit
// sqrt1.cpp
#include <cmath>
// Computes y[i] = sqrt(x[i]) for each i in [0, n).
// NOTE(review): x and y are assumed to point to at least n doubles each and
// not to overlap -- this is not checked. For negative inputs std::sqrt yields
// NaN and, in a standard-compliant build like this one, may also set errno
// (see the -fno-math-errno discussion below).
void compute_sqrt1(const double* x, int n, double* y) noexcept {
for (int i=0; i<n; ++i) {
y[i] = std::sqrt(x[i]);
}
}
g++ -c -O3 -march=native sqrt1.cpp
With -O3, gcc will optimize the code heavily but will still produce code that is standard compliant. The -march=native
option tells gcc to produce code targeting the native architecture's instruction set. The resulting binaries may not be portable even between different x86-64 CPUs.

Now, let's benchmark the function. We'll use Google Benchmark to measure how long it takes to compute the square roots of 1,000,000 numbers:

// benchmark.cpp
#include <random>
#include <memory>
#include <benchmark/benchmark.h>
void compute_sqrt1(const double* x, int n, double* y) noexcept;
// Fills x[0..n) with pseudo-random doubles drawn uniformly from [0, 100).
// The Mersenne Twister engine is seeded with a fixed value (0) so every
// benchmark run operates on exactly the same input data.
static void generate_random_numbers(double* x, int n) {
  std::mt19937 engine{0};
  std::uniform_real_distribution<double> uniform{0, 100};
  for (double* out = x; out != x + n; ++out) {
    *out = uniform(engine);
  }
}
// Benchmarks compute_sqrt1 over state.range(0) elements. The input buffer is
// filled once, outside the timed loop; the output buffer is allocated inside
// the timed region, so allocation cost is part of the measurement.
// DoNotOptimize keeps the compiler from discarding the result as dead code.
static void BM_Sqrt1(benchmark::State& state) {
  const int count = state.range(0);
  std::unique_ptr<double[]> input{new double[count]};
  generate_random_numbers(input.get(), count);
  for (auto _ : state) {
    std::unique_ptr<double[]> output{new double[count]};
    compute_sqrt1(input.get(), count, output.get());
    benchmark::DoNotOptimize(output);
  }
}
// Register BM_Sqrt1 to run with n = 1,000,000 elements, and emit main().
BENCHMARK(BM_Sqrt1)->Arg(1000000);
BENCHMARK_MAIN();
g++ -O3 -march=native -o benchmark benchmark.cpp sqrt1.o
./benchmark
Running ./benchmark
Run on (6 X 2600 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 32 KiB (x6)
L2 Unified 256 KiB (x6)
L3 Unified 9216 KiB (x6)
Load Average: 0.17, 0.07, 0.05
-----------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------
BM_Sqrt1/1000000 4984457 ns 4946631 ns 115
// sqrt2.cpp
#include <cmath>
// Computes y[i] = sqrt(x[i]) for each i in [0, n).
// The source is deliberately identical to compute_sqrt1; the only difference
// is that this translation unit is compiled with -fno-math-errno, which lets
// gcc drop the errno-setting path for std::sqrt and vectorize the loop
// (see the assembly comparison below).
// NOTE(review): x and y are assumed to point to at least n doubles each and
// not to overlap -- this is not checked.
void compute_sqrt2(const double* x, int n, double* y) noexcept {
for (int i=0; i<n; ++i) {
y[i] = std::sqrt(x[i]);
}
}
g++ -c -O3 -march=native -fno-math-errno sqrt2.cpp
The only difference between compute_sqrt1 and compute_sqrt2 is that we added the extra option -fno-math-errno when compiling. I'll explain later what -fno-math-errno does; but for now, I'll only point out that the produced code is no longer standard compliant.

Let's benchmark compute_sqrt2.

// benchmark.cpp
...
// Benchmarks compute_sqrt2 over state.range(0) elements, mirroring BM_Sqrt1:
// the input buffer is filled once outside the timed loop, while the output
// buffer is allocated inside the timed region (allocation cost is measured).
// DoNotOptimize keeps the compiler from discarding the result as dead code.
static void BM_Sqrt2(benchmark::State& state) {
  const int count = state.range(0);
  std::unique_ptr<double[]> input{new double[count]};
  generate_random_numbers(input.get(), count);
  for (auto _ : state) {
    std::unique_ptr<double[]> output{new double[count]};
    compute_sqrt2(input.get(), count, output.get());
    benchmark::DoNotOptimize(output);
  }
}
BENCHMARK(BM_Sqrt2)->Arg(1000000);
...
g++ -O3 -march=native -o benchmark benchmark.cpp sqrt2.o
./benchmark
Running ./benchmark
Run on (6 X 2600 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 32 KiB (x6)
L2 Unified 256 KiB (x6)
L3 Unified 9216 KiB (x6)
Load Average: 0.17, 0.07, 0.05
-----------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------
BM_Sqrt2/1000000 1195070 ns 1192078 ns 553
Yikes! compute_sqrt2 is more than 4 times faster than compute_sqrt1.

What's different? Let's drill down into the assembly to find out. We can produce the assembly for the code by running

g++ -S -c -O3 -march=native sqrt1.cpp
g++ -S -c -O3 -march=native -fno-math-errno sqrt2.cpp
.L3:
vmovsd (%rdi), %xmm0
vucomisd %xmm0, %xmm2
vsqrtsd %xmm0, %xmm1, %xmm1
ja .L12
addq $8, %rdi
vmovsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %r12, %rdi
jne .L3
1: vmovsd (%rdi), %xmm0
# Load a value from memory into the register %xmm0
2: vucomisd %xmm0, %xmm2
# Compare the value of %xmm0 with %xmm2 and set the register
# EFLAGS with the result
3: vsqrtsd %xmm0, %xmm1, %xmm1
# Compute the square root of %xmm0 and store in %xmm1
4: ja .L12
# Inspects EFLAGS and jumps if %xmm2 is above %xmm0
.L6:
addl $1, %r8d
vsqrtpd (%r10,%rax), %ymm0
vextractf128 $0x1, %ymm0, 16(%rcx,%rax)
vmovups %xmm0, (%rcx,%rax)
addq $32, %rax
cmpl %r8d, %r11d
ja .L6
The complete benchmark can be found at .
Previously published at