digitalmars.D - Inlining Code Test
- Craig Black (260/260) Dec 12 2010 The following program illustrates the problems with inlining in the dmd
- Iain Buclaw (56/316) Dec 12 2010 Testing on GDC with a Netbook, results from 3 consecutive runs are:
- Andrej Mitrovic (6/327) Dec 12 2010 My crappy old Pentium 4 has gone totally mad:
- Craig Black (4/360) Dec 12 2010 Try it without -profile. That tends to slow everything down to a halt.
- Andrej Mitrovic (6/371) Dec 12 2010 Ye that's better:
- Craig Black (246/246) Dec 12 2010 If the problem is not inlining, then why does the same code in C++ does ...
- Iain Buclaw (283/529) Dec 13 2010 D can use inline asm in much the same way, but that's deterring from you...
- Craig Black (3/32) Dec 13 2010 Curious benches there. Seems GCC isn't inlining (or other optimization)...
- Iain Buclaw (41/83) Dec 13 2010 OK, found and fixed the problem with GDC-4.4. My results are now:
- Craig Black (18/103) Dec 14 2010 Thanks for your help and interest. Basically you are saying I should us...
- Nick Voronin (40/48) Dec 16 2010 Compiled with dmd v2.050/win32 -g -O -inline -release
- Nick Voronin (17/46) Dec 17 2010 Looks like alignment of local variables is somewhat broken.
- bearophile (23/24) Dec 17 2010 This is a very nice bug. If not already present then please add it to Bu...
- Nick Voronin (20/46) Dec 17 2010 I found this on bugzilla.
- bearophile (4/8) Dec 17 2010 It's borderline a performance bug. You may add that code to the issue 22...
- Don (14/21) Dec 17 2010 Interestingly, since I filed that bug, DMD for MacOSX was created, which...
- Nick Voronin (5/20) Dec 17 2010 Thanks. Filed it as http://d.puremagic.com/issues/show_bug.cgi?id=5355
- Nick Voronin (5/22) Dec 18 2010 On second thought large globals in static segment (as log says) are prob...
- Don (3/13) Dec 20 2010 Good catch! Yes, that makes perfect sense.
The following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7. import std.stdio; import std.date; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } double[] vals; void bench1() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } } void bench2() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong[] times = [0, 0]; for(int i = 0; i < 100; i++) { auto times2 = benchmark!(bench1, bench2)(1); times[0] += times2[0]; times[1] += times2[1]; } writeln("Sorting with Array.opIndex: ", times[0]); writeln("Sorting with pointers: ", times[1]); writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); }
Dec 12 2010
== Quote from Craig Black (craigblack2 cox.net)'s articleThe following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7. import std.stdio; import std.date; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } double[] vals; void bench1() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } } void bench2() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong[] times = [0, 0]; for(int i = 0; i < 100; i++) { auto times2 = benchmark!(bench1, bench2)(1); times[0] += times2[0]; times[1] += times2[1]; } writeln("Sorting with Array.opIndex: ", times[0]); writeln("Sorting with pointers: ", times[1]); writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); }Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
My crappy old Pentium 4 has gone totally mad: Sorting with Array.opIndex: 45080 Sorting with pointers: 45608 4.092e+16 percent faster (??) Compiled with dmd -profile -O -release -inline -noboundscheck On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:== Quote from Craig Black (craigblack2 cox.net)'s articleThe following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7. import std.stdio; import std.date; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } double[] vals; void bench1() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } } void bench2() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong[] times = [0, 0]; for(int i = 0; i < 100; i++) { auto times2 = benchmark!(bench1, bench2)(1); times[0] += times2[0]; times[1] += times2[1]; } writeln("Sorting with Array.opIndex: ", times[0]); writeln("Sorting with pointers: ", times[1]); writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); }Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
Try it without -profile. That tends to slow everything down to a halt. -Craig "Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...My crappy old Pentium 4 has gone totally mad: Sorting with Array.opIndex: 45080 Sorting with pointers: 45608 4.092e+16 percent faster (??) Compiled with dmd -profile -O -release -inline -noboundscheck On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:== Quote from Craig Black (craigblack2 cox.net)'s articleThe following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7. import std.stdio; import std.date; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } double[] vals; void bench1() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } } void bench2() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong[] times = [0, 0]; for(int i = 0; i < 100; i++) { auto times2 = benchmark!(bench1, bench2)(1); times[0] += times2[0]; times[1] += times2[1]; } writeln("Sorting with Array.opIndex: ", times[0]); writeln("Sorting with pointers: ", times[1]); writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); }Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
Ye that's better: Sorting with Array.opIndex: 2859 Sorting with pointers: 1765 38.2651 percent faster And it only took 2 seconds. With profile it takes a full minute. On 12/13/10, Craig Black <craigblack2 cox.net> wrote:Try it without -profile. That tends to slow everything down to a halt. -Craig "Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...My crappy old Pentium 4 has gone totally mad: Sorting with Array.opIndex: 45080 Sorting with pointers: 45608 4.092e+16 percent faster (??) Compiled with dmd -profile -O -release -inline -noboundscheck On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:== Quote from Craig Black (craigblack2 cox.net)'s articleThe following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7. import std.stdio; import std.date; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } double[] vals; void bench1() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } } void bench2() { Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong[] times = [0, 0]; for(int i = 0; i < 100; i++) { auto times2 = benchmark!(bench1, bench2)(1); times[0] += times2[0]; times[1] += times2[1]; } writeln("Sorting with Array.opIndex: ", times[0]); writeln("Sorting with pointers: ", times[1]); writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster"); }Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
If the problem is not inlining, then why does the same code in C++ does not suffer from this performance limitation (when compiled with Visual C++ 2008)? #include <stdio.h> #include <assert.h> #include <stdlib.h> template <class T> T min(T a, T b) { return a < b ? a : b; } template <class T> T max(T a, T b) { return a > b ? a : b; } inline void *operator new(size_t, void *ptr) { return ptr; } inline void operator delete(void *, void *) {} template <class T> class Array { public: Array() {} Array(int length) { resize(length); } Array(const Array<T> &a) { append(a); } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = 0; } base.array = (char*)(new char[capacity*sizeof(T)+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) new(base.data()+i)T; if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == 0; } T &at(int i) { assert(!empty()); assert(i >= 0 && i < length()); return base.data()[i]; } const T &at(int i) const { assert(!empty()); assert(i >= 0 && i < length()); return base.data()[i]; } T &operator [](int i) { return at(i); } const T &operator [](int i) const { return at(i); } void operator = (const Array &array) { copy(array); } void copy(const Array &array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } T &front() { return at(0); } const T &front() const { return at(0); } T &back() { return at(length()-1); } const T &back() const { return at(length()-1); } T &append() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(max(2, sp+sp/2), sz); reserve(sp, sz); } return back(); } void append(const Array<T> &a) { int prevLen = length(); resize(prevLen + a.length()); for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i]; } private: class ArrayBase { public: ArrayBase() { array = 0; } ~ArrayBase() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return (int*)array; } int* capacityPtr() const { return (int*)(array+4); } T* data() const { return (T*)(array+8); } char* array; }; ArrayBase base; void copyData(ArrayBase &array) { int copyLen = min(length(), array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } }; template <class T> struct Less { bool operator() (T a, T b) { return a < b; } }; template <class T> void swap(T &a, T &b) { T temp = a; a = b; b = temp; } template <class T, class L> void insertionSort1(Array<T> &a, int low, int high, L less) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(less(a[j], a[min])) min = j; swap(a[i], a[min]); } } template<class T, class L> int partition1(Array<T> &a, int p, int r, L less) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (less(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } template<class T, class L> void quickSort1(Array<T> &a, int p, int r, L less) { if(p + 7 > r) return insertionSort1(a, p, r, less); if (p < r) { int q = partition1(a, p, r, less); quickSort1(a, p, q - 1, less); quickSort1(a, q + 1, r, less); } } template <class T, class L> void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); } template <class T, class L> void insertionSort2(T *a, int low, int high, L less) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(less(a[j], a[min])) min = j; swap(a[i], a[min]); } } template<class T, class L> int partition2(T *a, int p, int r, L less) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (less(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } template<class T, class L> void quickSort2(T *a, int p, int r, L less) { if(p + 7 > r) return insertionSort2(a, p, r, less); if (p < r) { int q = partition2(a, p, r, less); quickSort2(a, p, q - 1, less); quickSort2(a, q + 1, r, less); } } template <class T, class L> void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); } typedef unsigned long long uint64; uint64 getCycle() { __asm { RDTSC } } Array<double> vals; uint64 bench1() { uint64 startTime = getCycle(); Array<double> v; for(int i = 0; i < 100; i++) { v = vals; sort1<double, Less<double> >(v); } return getCycle()-startTime; } uint64 bench2() { uint64 startTime = getCycle(); Array<double> v; for(int i = 0; i < 100; i++) { v = vals; sort2<double, Less<double> >(v); } return getCycle()-startTime; } void main() { vals.resize(1000); for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0; uint64 time1 = 0; uint64 time2 = 0; for(int i = 0; i < 100; i++) { time1 += bench1(); time2 += bench2(); } printf("Sorting with Array: %f\n", time1/100000.0); printf("Sorting with pointers: %f\n", time2/100000.0); printf("%f percent faster\n", 100.0 * (time1-time2) / time1); }
Dec 12 2010
== Quote from Craig Black (craigblack2 cox.net)'s articleIf the problem is not inlining, then why does the same code in C++ does not suffer from this performance limitation (when compiled with Visual C++ 2008)? #include <stdio.h> #include <assert.h> #include <stdlib.h> template <class T> T min(T a, T b) { return a < b ? a : b; } template <class T> T max(T a, T b) { return a > b ? a : b; } inline void *operator new(size_t, void *ptr) { return ptr; } inline void operator delete(void *, void *) {} template <class T> class Array { public: Array() {} Array(int length) { resize(length); } Array(const Array<T> &a) { append(a); } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = 0; } base.array = (char*)(new char[capacity*sizeof(T)+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) new(base.data()+i)T; if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == 0; } T &at(int i) { assert(!empty()); assert(i >= 0 && i < length()); return base.data()[i]; } const T &at(int i) const { assert(!empty()); assert(i >= 0 && i < length()); return base.data()[i]; } T &operator [](int i) { return at(i); } const T &operator [](int i) const { return at(i); } void operator = (const Array &array) { copy(array); } void copy(const Array &array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } T &front() { return at(0); } const T &front() const { return at(0); } T &back() { return at(length()-1); } const T &back() const { return at(length()-1); } T &append() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(max(2, sp+sp/2), sz); reserve(sp, sz); } return back(); } void append(const Array<T> &a) { int prevLen = length(); resize(prevLen + a.length()); for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i]; } private: class ArrayBase { public: ArrayBase() { array = 0; } ~ArrayBase() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return (int*)array; } int* capacityPtr() const { return (int*)(array+4); } T* data() const { return (T*)(array+8); } char* array; }; ArrayBase base; void copyData(ArrayBase &array) { int copyLen = min(length(), array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } }; template <class T> struct Less { bool operator() (T a, T b) { return a < b; } }; template <class T> void swap(T &a, T &b) { T temp = a; a = b; b = temp; } template <class T, class L> void insertionSort1(Array<T> &a, int low, int high, L less) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(less(a[j], a[min])) min = j; swap(a[i], a[min]); } } template<class T, class L> int partition1(Array<T> &a, int p, int r, L less) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (less(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } template<class T, class L> void quickSort1(Array<T> &a, int p, int r, L less) { if(p + 7 > r) return insertionSort1(a, p, r, less); if (p < r) { int q = partition1(a, p, r, less); quickSort1(a, p, q - 1, less); quickSort1(a, q + 1, r, less); } } template <class T, class L> void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); } template <class T, class L> void insertionSort2(T *a, int low, int high, L less) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(less(a[j], a[min])) min = j; swap(a[i], a[min]); } } template<class T, class L> int partition2(T *a, int p, int r, L less) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (less(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } template<class T, class L> void quickSort2(T *a, int p, int r, L less) { if(p + 7 > r) return insertionSort2(a, p, r, less); if (p < r) { int q = partition2(a, p, r, less); quickSort2(a, p, q - 1, less); quickSort2(a, q + 1, r, less); } } template <class T, class L> void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); } typedef unsigned long long uint64; uint64 getCycle() { __asm { RDTSC } } Array<double> vals; uint64 bench1() { uint64 startTime = getCycle(); Array<double> v; for(int i = 0; i < 100; i++) { v = vals; sort1<double, Less<double> >(v); } return getCycle()-startTime; } uint64 bench2() { uint64 startTime = getCycle(); Array<double> v; for(int i = 0; i < 100; i++) { v = vals; sort2<double, Less<double> >(v); } return getCycle()-startTime; } void main() { vals.resize(1000); for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0; uint64 time1 = 0; uint64 time2 = 0; for(int i = 0; i < 100; i++) { time1 += bench1(); time2 += bench2(); } printf("Sorting with Array: %f\n", time1/100000.0); printf("Sorting with pointers: %f\n", time2/100000.0); printf("%f percent faster\n", 100.0 * (time1-time2) / time1); }D can use inline asm in much the same way, but that's deterring from your point. =) import std.c.stdio; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } ulong getCycle() { asm { rdtsc; } } double[] vals; ulong bench1() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } return getCycle() - startTime; } ulong bench2() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } return getCycle() - startTime; } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong time1; ulong time2; for(int i = 0; i < 100; i++) { time1 += bench1(); time2 += bench2(); } printf("Sorting with Array.opIndex: %f\n", time1/1e5); printf("Sorting with pointers: %f\n", time2/1e5); printf("%f percent faster\n", 100.0 * (time1-time2) / time1); } Testing your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) Regards
Dec 13 2010
Testing your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) RegardsCurious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
Dec 13 2010
== Quote from Craig Black (craigblack2 cox.net)'s articleOK, found and fixed the problem with GDC-4.4. My results are now: g++-4.4 -O3 -march=native -ftree-vectorize ------- Sorting with Array: 45861.599640 Sorting with pointers: 38417.632680 16.231372 percent faster Sorting with Array: 46421.979120 Sorting with pointers: 38474.709840 17.119626 percent faster Sorting with Array: 48622.885200 Sorting with pointers: 39593.385600 18.570473 percent faster ------- gdc-4.4 -frelease -O3 -march=native -ftree-vectorize ------- Sorting with Array.opIndex: 41454.052200 Sorting with pointers: 38331.517560 7.532520 percent faster Sorting with Array.opIndex: 41109.852720 Sorting with pointers: 37836.883320 7.961521 percent faster Sorting with Array.opIndex: 40587.349320 Sorting with pointers: 37390.488120 7.876497 percent faster ------- dmd-2.050 -release -O -inline ------- Sorting with Array.opIndex: 53561.598720 Sorting with pointers: 48348.626760 9.732667 percent faster Sorting with Array.opIndex: 55861.658280 Sorting with pointers: 49909.880760 10.654495 percent faster Sorting with Array.opIndex: 56887.660800 Sorting with pointers: 51470.453400 9.522640 percent faster ------- I noticed that with dmd -inline needs to be explicitly set. Otherwise there appears to be nothing to suggest a wild difference between using opIndex and pointers. :~) RegardsTesting your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) RegardsCurious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
Dec 13 2010
"Iain Buclaw" <ibuclaw ubuntu.com> wrote in message news:ie6ahr$1coc$1 digitalmars.com...== Quote from Craig Black (craigblack2 cox.net)'s articleThanks for your help and interest. Basically you are saying I should use GDC instead? I've heard rumors that GDC is not as stable. Can you comment on this? I am still getting different results than you with dmd, but maybe that's because we have different machines? My results on a core i7 1.86 GHz notebook: dmd-2.050 -O -release -inline Sorting with Array.opIndex: 19160.2 Sorting with pointers: 15006.4 21.6791 percent faster Sorting with Array.opIndex: 20200.8 Sorting with pointers: 15699.1 22.2887 percent faster Sorting with Array.opIndex: 19563.3 Sorting with pointers: 15418.4 21.1872 percent faster -CraigOK, found and fixed the problem with GDC-4.4. My results are now: g++-4.4 -O3 -march=native -ftree-vectorize ------- Sorting with Array: 45861.599640 Sorting with pointers: 38417.632680 16.231372 percent faster Sorting with Array: 46421.979120 Sorting with pointers: 38474.709840 17.119626 percent faster Sorting with Array: 48622.885200 Sorting with pointers: 39593.385600 18.570473 percent faster ------- gdc-4.4 -frelease -O3 -march=native -ftree-vectorize ------- Sorting with Array.opIndex: 41454.052200 Sorting with pointers: 38331.517560 7.532520 percent faster Sorting with Array.opIndex: 41109.852720 Sorting with pointers: 37836.883320 7.961521 percent faster Sorting with Array.opIndex: 40587.349320 Sorting with pointers: 37390.488120 7.876497 percent faster ------- dmd-2.050 -release -O -inline ------- Sorting with Array.opIndex: 53561.598720 Sorting with pointers: 48348.626760 9.732667 percent faster Sorting with Array.opIndex: 55861.658280 Sorting with pointers: 49909.880760 10.654495 percent faster Sorting with Array.opIndex: 56887.660800 Sorting with pointers: 51470.453400 9.522640 percent faster ------- I noticed that with dmd -inline needs to be explicitly set. Otherwise there appears to be nothing to suggest a wild difference between using opIndex and pointers. :~) RegardsTesting your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) RegardsCurious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
Dec 14 2010
On Mon, 13 Dec 2010 01:50:50 +0300, Craig Black <craigblack2 cox.net> wrote:The following program illustrates the problems with inlining in the dmd compiler. Perhaps with some more work I can reduce it to a smaller test case. I was playing around with a simple Array template, and noticed that similar C++ code performs much better. This is due, at least in part, to opIndex not being properly inlined by dmd. There are two sort functions, quickSort1 and quickSort2. quickSort1 indexes an Array data structure. quickSort2 indexes raw pointers. quickSort2 is roughly 20% faster on my core i7.Compiled with dmd v2.050/win32 -g -O -inline -release First, I looked in debugger on actual asm and I must say inlining is done very well. Code for two versions is almost identical with slight overhead in case of Array for there is extra level of indirection in data access, inlining or not. Second, I have anywhere from 3.3 to 6.7% difference in performance, but no more than that. Tested on Core2Duo E6300, Windows XP SP3. I increased number of iterations for benchmark!() to 5 to reduce volatility of results. That's the only change to source I did. Third... Now here is a funny thing. Absolute times and difference between implementation depends on how do you run the program. I was dumbfounded as of how does it matter, but the fact is that aforementioned avg 5% difference I get if I run it with command line as "inline.exe". If I run it as "inline" without extension I get difference around 15% and absolute times are notably smaller. X:\d\tests\craig>inline.exe Sorting with Array.opIndex: 6533 Sorting with pointers: 6264 4.11756 percent faster X:\d\tests\craig>inline Sorting with Array.opIndex: 5390 Sorting with pointers: 4674 13.2839 percent faster Something like that. It's not a fluke. I tested it on my old AthlonXp with XP SP2 and saw exactly the same picture (btw, difference in % between implementation was about the same). I ran both variants under stracent and found no difference except one pointer on the stack when LeaveCriticalSection and GetCurrentThreadId are called was always off by 4 bytes. This made me thinking. The only observable difference is length of command line. And indeed, renaming program showed that only length of command line is a reason, not the content. Further tests suggest that some value is either aligned to 8 byte or not depending on length of command line and this makes all the difference (which happens to be greater than difference between implementations of sorting). I couldn't find what value causes slowdown though. -- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 16 2010
Looks like alignment of local variables is somewhat broken. import std.stdio; void main() { int a; double d; writeln(&a," ", &d); }testdl.exe12FE70 12FE78testdl12FE74 12FE7C It's not like double is not aligned, otherwise it would be placed right after int, 4 bytes apart, but it is aligned with some assumption which may or may not hold depending on command line. Bug? I hope doubles _should_ be aligned by compiler? On Fri, 17 Dec 2010 09:50:53 +0300, Nick Voronin <elfy.nv gmail.com> wrote:Third... Now here is a funny thing. Absolute times and difference between implementation depends on how do you run the program. I was dumbfounded as of how does it matter, but the fact is that aforementioned avg 5% difference I get if I run it with command line as "inline.exe". If I run it as "inline" without extension I get difference around 15% and absolute times are notably smaller. X:\d\tests\craig>inline.exe Sorting with Array.opIndex: 6533 Sorting with pointers: 6264 4.11756 percent faster X:\d\tests\craig>inline Sorting with Array.opIndex: 5390 Sorting with pointers: 4674 13.2839 percent faster Something like that. It's not a fluke. I tested it on my old AthlonXp with XP SP2 and saw exactly the same picture (btw, difference in % between implementation was about the same). I ran both variants under stracent and found no difference except one pointer on the stack when LeaveCriticalSection and GetCurrentThreadId are called was always off by 4 bytes. This made me thinking. The only observable difference is length of command line. And indeed, renaming program showed that only length of command line is a reason, not the content. Further tests suggest that some value is either aligned to 8 byte or not depending on length of command line and this makes all the difference (which happens to be greater than difference between implementations of sorting). I couldn't find what value causes slowdown though.-- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 17 2010
Nick Voronin:Looks like alignment of local variables is somewhat broken.This is a very nice bug. If not already present then please add it to Bugzilla. If you don't want to add it, then I will add it myself. I have modified your code like this: import core.stdc.stdio: printf; void main() { int a; double d; printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } And this shows the performance difference: import core.stdc.stdio: printf; import std.date: getUTCtime, ticksPerSecond; void main() { double d = 0.0; auto t0 = getUTCtime(); for (size_t i = 0; i < 100_000_000; i++) d += 1; auto t1 = getUTCtime(); printf("%lf\n", d); printf("%u\n", (cast(size_t)&d) % 8); printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond); } Bye, bearophile
Dec 17 2010
I found this on bugzilla. http://d.puremagic.com/issues/show_bug.cgi?id=2278 So it's not really a bug. Yet there is no simple workaround. And gap between int and double looks strange when stack frame itself is unaligned. btw, is there no explicit alignment for variables in D at all? align(8) double d; compiles if d is global, but it does nothing. import core.stdc.stdio: printf; align(8) int a; align(8) double d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } This _always_ prints "4 4" for me. It seems that globals are not affected by environment, but not aligned anyway. Honestly, I'm lost :) It all works not as expected but then again I can't find any evidences that my expectations are valid. On Sat, 18 Dec 2010 01:57:44 +0300, bearophile <bearophileHUGS lycos.com> wrote:Nick Voronin:-- Using Opera's revolutionary email client: http://www.opera.com/mail/Looks like alignment of local variables is somewhat broken.This is a very nice bug. If not already present then please add it to Bugzilla. If you don't want to add it, then I will add it myself. I have modified your code like this: import core.stdc.stdio: printf; void main() { int a; double d; printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } And this shows the performance difference: import core.stdc.stdio: printf; import std.date: getUTCtime, ticksPerSecond; void main() { double d = 0.0; auto t0 = getUTCtime(); for (size_t i = 0; i < 100_000_000; i++) d += 1; auto t1 = getUTCtime(); printf("%lf\n", d); printf("%u\n", (cast(size_t)&d) % 8); printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond); } Bye, bearophile
Dec 17 2010
Nick Voronin:I found this on bugzilla. http://d.puremagic.com/issues/show_bug.cgi?id=2278 So it's not really a bug.It's borderline a performance bug. You may add that code to the issue 2278 as further examples. Bye, bearophile
Dec 17 2010
Nick Voronin wrote:I found this on bugzilla. http://d.puremagic.com/issues/show_bug.cgi?id=2278 So it's not really a bug. Yet there is no simple workaround. And gap between int and double looks strange when stack frame itself is unaligned.Interestingly, since I filed that bug, DMD for MacOSX was created, which ensures that the stack stays aligned in the way I described! So the DMD backend is perfectly capable of doing it now.btw, is there no explicit alignment for variables in D at all? align(8) double d; compiles if d is global, but it does nothing.That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007) However this code: import core.stdc.stdio: printf; int a; double[4] d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } shows that it stopped doing that somewhere between 2.027 and 2.030.
Dec 17 2010
On Sat, 18 Dec 2010 04:17:46 +0300, Don <nospam nospam.com> wrote:Interestingly, since I filed that bug, DMD for MacOSX was created, which ensures that the stack stays aligned in the way I described! So the DMD backend is perfectly capable of doing it now.I added some examples to that bugreport. Just in case :)Thanks. Filed it as http://d.puremagic.com/issues/show_bug.cgi?id=5355 -- Using Opera's revolutionary email client: http://www.opera.com/mail/btw, is there no explicit alignment for variables in D at all? align(8) double d; compiles if d is global, but it does nothing.That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007) However this code: import core.stdc.stdio: printf; int a; double[4] d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } shows that it stopped doing that somewhere between 2.027 and 2.030.
Dec 17 2010
On Sat, 18 Dec 2010 02:17:46 +0100 Don <nospam nospam.com> wrote:Nick Voronin wrote:On second thought large globals in static segment (as log says) are probably only those with __gshared prefix. And they do look aligned.btw, is there no explicit alignment for variables in D at all? align(8) double d; compiles if d is global, but it does nothing.That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007)However this code: import core.stdc.stdio: printf; int a; double[4] d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } shows that it stopped doing that somewhere between 2.027 and 2.030.-- Nick Voronin <elfy.nv gmail.com>
Dec 18 2010
Nick Voronin wrote:On Sat, 18 Dec 2010 02:17:46 +0100 Don <nospam nospam.com> wrote:Good catch! Yes, that makes perfect sense. So the bug is that align() is ignored for TLS variables.Nick Voronin wrote:On second thought large globals in static segment (as log says) are probably only those with __gshared prefix. And they do look aligned.btw, is there no explicit alignment for variables in D at all? align(8) double d; compiles if d is global, but it does nothing.That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007)
Dec 20 2010