digitalmars.D - Inlining Code Test

Craig Black (260/260) Dec 12 2010 The following program illustrates the problems with inlining in the dmd

Iain Buclaw (56/316) Dec 12 2010 Testing on GDC with a Netbook, results from 3 consecutive runs are:

Andrej Mitrovic (6/327) Dec 12 2010 My crappy old Pentium 4 has gone totally mad:

Craig Black (4/360) Dec 12 2010 Try it without -profile. That tends to slow everything down to a halt.

Andrej Mitrovic (6/371) Dec 12 2010 Ye that's better:

Craig Black (246/246) Dec 12 2010 If the problem is not inlining, then why does the same code in C++ does ...

Iain Buclaw (283/529) Dec 13 2010 D can use inline asm in much the same way, but that's deterring from you...

Craig Black (3/32) Dec 13 2010 Curious benches there. Seems GCC isn't inlining (or other optimization)...

Iain Buclaw (41/83) Dec 13 2010 OK, found and fixed the problem with GDC-4.4. My results are now:

Craig Black (18/103) Dec 14 2010 Thanks for your help and interest. Basically you are saying I should us...

Nick Voronin (40/48) Dec 16 2010 Compiled with dmd v2.050/win32 -g -O -inline -release

Nick Voronin (17/46) Dec 17 2010 Looks like alignment of local variables is somewhat broken.

bearophile (23/24) Dec 17 2010 This is a very nice bug. If not already present then please add it to Bu...

Nick Voronin (20/46) Dec 17 2010 I found this on bugzilla.

bearophile (4/8) Dec 17 2010 It's borderline a performance bug. You may add that code to the issue 22...
Don (14/21) Dec 17 2010 Interestingly, since I filed that bug, DMD for MacOSX was created, which...

Nick Voronin (5/20) Dec 17 2010 Thanks. Filed it as http://d.puremagic.com/issues/show_bug.cgi?id=5355
Nick Voronin (5/22) Dec 18 2010 On second thought large globals in static segment (as log says) are prob...

Don (3/13) Dec 20 2010 Good catch! Yes, that makes perfect sense.

"Craig Black" <craigblack2 cox.net> writes:

The following program illustrates the problems with inlining in the dmd 
compiler.  Perhaps with some more work I can reduce it to a smaller test 
case.  I was playing around with a simple Array template, and noticed that 
similar C++ code performs much better.  This is due, at least in part, to 
opIndex not being properly inlined by dmd.  There are two sort functions, 
quickSort1 and quickSort2.  quickSort1 indexes an Array data structure. 
quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my 
core i7.

import std.stdio;
import std.date;
import std.random;
import std.conv;
import std.algorithm;
import std.range;

struct Array(T)
{
public:

  this(int length) { resize(length); }
  this(T[] a) { append(a); }

  this(this)
  {
    if(!base.array) return;
    ArrayBase!T old;
    old = base;
    base.array = null;
    reserve(old.length(), old.length());
    copyData(old);
    old.array = null;
  }

  void clear() { base.clear(); }

  void resize(int sz)
  {
    assert(sz >= 0);
    if(sz == 0) return clear();
    if(!base.array) reserve(sz, sz);
    *base.lengthPtr() = sz;
  }

  void reserve(int capacity, int length)
  {
    assert(length <= capacity);
    if(capacity == 0) return clear();
    ArrayBase!T old;
    if(base.array)
    {
      if(base.capacity() >= capacity) return;
      old.array = base.array;
      base.array = null;
    }
    base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
    *base.lengthPtr() = length;
    *base.capacityPtr() = capacity;

    for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
    if(old.array) copyData(old);
  }

  int length() const { return base.length(); }
  int capacity() const { return base.capacity(); }

  bool empty() const { return base.array == null; }

  ref T at(int i)
  {
    assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ 
" out of bounds of empty array");
    assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ 
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }

  ref const T at(int i) const
  {
    assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ 
" out of bounds of empty array");
    assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ 
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }

  const ref T opIndex(int i) const { return at(i); }
  void opIndexAssign(T t, int i) { at(i) = t; }
  void opIndexAssign(ref T t, int i) { at(i) = t; }

  void opAssign(ref const Array!T array)
  {
    copy(array);
  }

  void opAssign(T[] array)
  {
    int len = array.length;
    resize(len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  void copy(ref const Array!T array)
  {
    if(array.empty()) return clear();
    int len = array.length();
    reserve(len, len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  void opOpAssign(string op, A...)(A a) if(op == "~")
  {
    appendComposite(a);
  }

  ref T front() { return at(0); }
  const ref T front() const { return at(0); }
  ref T back() { return at(length()-1); }
  const ref T back() const { return at(length()-1); }

  ref T appendOne()
  {
    int sz = length();
    int sp = capacity();
    if(sp > sz) (*base.lengthPtr())++;
    else
    {
      sz++;
      sp = max(2, sp+sp/2, sz);
      reserve(sp, sz);
    }
    return back();
  }

  void append(A...)(A a)
  {
    static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || 
is(typeof(a[0]) == T[])))
    {
      appendComposite(a[0]);
    } else {
      appendTuple(a);
    }
  }

  void appendTuple(A...)(A a)
  {
    foreach(x; a) appendOne() = x;
  }

  void appendComposite(A)(A a)
  {
    int prevLen = length();
    resize(prevLen + a.length);
    for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
  }

private:

      static struct ArrayBase(T)
      {
      public:

            ~this() { clear(); }
            void clear() { if(array) delete array; }

            int length() const { return array ? *lengthPtr() : 0; }
            int capacity() const { return array ? *capacityPtr() : 0; }
            int* lengthPtr() const { return cast(int*)(array); }
            int* capacityPtr() const { return cast(int*)(array+4); }
            T* data() const { return cast(T*)(array+8); }

            ubyte* array;
      }

  ArrayBase!T base;

  void copyData(ref ArrayBase!T array)
  {
    int copyLen = min(length, array.length());
    for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
  }
}

static bool less(T)(T a, T b) { return a < b; }

void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}

void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
  if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition1!(T, L)(a, p, r);
            quickSort1!(T, L)(a, p, q - 1);
            quickSort1!(T, L)(a, q + 1, r);
      }
}

void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, 
a.length()-1); }

void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

int partition2(T, alias L = less!T)(T *a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}

void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
  if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition2!(T, L)(a, p, r);
            quickSort2!(T, L)(a, p, q - 1);
            quickSort2!(T, L)(a, q + 1, r);
      }
}

void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, 
length-1); }

double[] vals;

void bench1()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort1(v);
  }
}

void bench2()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort2(&v[0], v.length);
  }
}

void main()
{
  Mt19937 gen;
  vals.length = 1000;
  for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);

  ulong[] times = [0, 0];
  for(int i = 0; i < 100; i++)
  {
    auto times2 = benchmark!(bench1, bench2)(1);
    times[0] += times2[0];
    times[1] += times2[1];
  }
  writeln("Sorting with Array.opIndex: ", times[0]);
  writeln("Sorting with pointers: ", times[1]);
  writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
}

Dec 12 2010

Iain Buclaw <ibuclaw ubuntu.com> writes:

== Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed that
 similar C++ code performs much better.  This is due, at least in part, to
 opIndex not being properly inlined by dmd.  There are two sort functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }

Testing on GDC with a Netbook, results from 3 consecutive runs are:

Without -frelease
-------
Sorting with Array.opIndex: 27981
Sorting with pointers: 5602
79.9793 percent faster

Sorting with Array.opIndex: 25565
Sorting with pointers: 5179
79.7418 percent faster

Sorting with Array.opIndex: 28657
Sorting with pointers: 5772
79.8583 percent faster
-------


With -frelease
-------
Sorting with Array.opIndex: 10591
Sorting with pointers: 4771
54.9523 percent faster

Sorting with Array.opIndex: 10289
Sorting with pointers: 4710
54.223 percent faster

Sorting with Array.opIndex: 11305
Sorting with pointers: 5216
53.8611 percent faster
-------


With -frelease -fno-bounds-check
-------
Sorting with Array.opIndex: 11651
Sorting with pointers: 5236
55.0597 percent faster

Sorting with Array.opIndex: 9873
Sorting with pointers: 4559
53.8236 percent faster

Sorting with Array.opIndex: 10361
Sorting with pointers: 4745
54.2033 percent faster
-------


GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner:
-------
Considering inline candidate check.
 Inlining check into fillUp.
Merging blocks 9 and 10
Merging blocks 9 and 11

Considering inline candidate initialize.
 Inlining initialize into emplace.
Merging blocks 2 and 3
Merging blocks 2 and 4

Considering inline candidate bench2.
Not inlining: code size would grow by 77 insns.
Considering inline candidate bench1.
Not inlining: code size would grow by 49 insns.
-------


So there's _me_ seriously doubting that inlining has anything to do with the
50% increase.

Regards
Iain

Dec 12 2010

Andrej Mitrovic <andrej.mitrovich gmail.com> writes:

My crappy old Pentium 4 has gone totally mad:

Sorting with Array.opIndex: 45080
Sorting with pointers: 45608
4.092e+16 percent faster (??)

Compiled with dmd -profile -O -release -inline -noboundscheck

On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed that
 similar C++ code performs much better.  This is due, at least in part, to
 opIndex not being properly inlined by dmd.  There are two sort functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }

 Testing on GDC with a Netbook, results from 3 consecutive runs are:

 Without -frelease
 -------
 Sorting with Array.opIndex: 27981
 Sorting with pointers: 5602
 79.9793 percent faster

 Sorting with Array.opIndex: 25565
 Sorting with pointers: 5179
 79.7418 percent faster

 Sorting with Array.opIndex: 28657
 Sorting with pointers: 5772
 79.8583 percent faster
 -------


 With -frelease
 -------
 Sorting with Array.opIndex: 10591
 Sorting with pointers: 4771
 54.9523 percent faster

 Sorting with Array.opIndex: 10289
 Sorting with pointers: 4710
 54.223 percent faster

 Sorting with Array.opIndex: 11305
 Sorting with pointers: 5216
 53.8611 percent faster
 -------


 With -frelease -fno-bounds-check
 -------
 Sorting with Array.opIndex: 11651
 Sorting with pointers: 5236
 55.0597 percent faster

 Sorting with Array.opIndex: 9873
 Sorting with pointers: 4559
 53.8236 percent faster

 Sorting with Array.opIndex: 10361
 Sorting with pointers: 4745
 54.2033 percent faster
 -------


 GDC doesn't use DMD's FE inliner, but results from the GCC backend's
 inliner:
 -------
 Considering inline candidate check.
  Inlining check into fillUp.
 Merging blocks 9 and 10
 Merging blocks 9 and 11

 Considering inline candidate initialize.
  Inlining initialize into emplace.
 Merging blocks 2 and 3
 Merging blocks 2 and 4

 Considering inline candidate bench2.
 Not inlining: code size would grow by 77 insns.
 Considering inline candidate bench1.
 Not inlining: code size would grow by 49 insns.
 -------


 So there's _me_ seriously doubting that inlining has anything to do with the
 50% increase.

 Regards
 Iain

Dec 12 2010

"Craig Black" <craigblack2 cox.net> writes:

Try it without -profile.  That tends to slow everything down to a halt.

-Craig

"Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message 
news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...
 My crappy old Pentium 4 has gone totally mad:

 Sorting with Array.opIndex: 45080
 Sorting with pointers: 45608
 4.092e+16 percent faster (??)

 Compiled with dmd -profile -O -release -inline -noboundscheck

 On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed 
 that
 similar C++ code performs much better.  This is due, at least in part, 
 to
 opIndex not being properly inlined by dmd.  There are two sort 
 functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ 
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " 
 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ 
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " 
 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int 
 high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }

 Testing on GDC with a Netbook, results from 3 consecutive runs are:

 Without -frelease
 -------
 Sorting with Array.opIndex: 27981
 Sorting with pointers: 5602
 79.9793 percent faster

 Sorting with Array.opIndex: 25565
 Sorting with pointers: 5179
 79.7418 percent faster

 Sorting with Array.opIndex: 28657
 Sorting with pointers: 5772
 79.8583 percent faster
 -------


 With -frelease
 -------
 Sorting with Array.opIndex: 10591
 Sorting with pointers: 4771
 54.9523 percent faster

 Sorting with Array.opIndex: 10289
 Sorting with pointers: 4710
 54.223 percent faster

 Sorting with Array.opIndex: 11305
 Sorting with pointers: 5216
 53.8611 percent faster
 -------


 With -frelease -fno-bounds-check
 -------
 Sorting with Array.opIndex: 11651
 Sorting with pointers: 5236
 55.0597 percent faster

 Sorting with Array.opIndex: 9873
 Sorting with pointers: 4559
 53.8236 percent faster

 Sorting with Array.opIndex: 10361
 Sorting with pointers: 4745
 54.2033 percent faster
 -------


 GDC doesn't use DMD's FE inliner, but results from the GCC backend's
 inliner:
 -------
 Considering inline candidate check.
  Inlining check into fillUp.
 Merging blocks 9 and 10
 Merging blocks 9 and 11

 Considering inline candidate initialize.
  Inlining initialize into emplace.
 Merging blocks 2 and 3
 Merging blocks 2 and 4

 Considering inline candidate bench2.
 Not inlining: code size would grow by 77 insns.
 Considering inline candidate bench1.
 Not inlining: code size would grow by 49 insns.
 -------


 So there's _me_ seriously doubting that inlining has anything to do with 
 the
 50% increase.

 Regards
 Iain

Dec 12 2010

Andrej Mitrovic <andrej.mitrovich gmail.com> writes:

Ye that's better:

Sorting with Array.opIndex: 2859
Sorting with pointers: 1765
38.2651 percent faster

And it only took 2 seconds. With profile it takes a full minute.

On 12/13/10, Craig Black <craigblack2 cox.net> wrote:
 Try it without -profile.  That tends to slow everything down to a halt.

 -Craig

 "Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message
 news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...
 My crappy old Pentium 4 has gone totally mad:

 Sorting with Array.opIndex: 45080
 Sorting with pointers: 45608
 4.092e+16 percent faster (??)

 Compiled with dmd -profile -O -release -inline -noboundscheck

 On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed
 that
 similar C++ code performs much better.  This is due, at least in part,
 to
 opIndex not being properly inlined by dmd.  There are two sort
 functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "

 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "

 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int
 high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }

 Testing on GDC with a Netbook, results from 3 consecutive runs are:

 Without -frelease
 -------
 Sorting with Array.opIndex: 27981
 Sorting with pointers: 5602
 79.9793 percent faster

 Sorting with Array.opIndex: 25565
 Sorting with pointers: 5179
 79.7418 percent faster

 Sorting with Array.opIndex: 28657
 Sorting with pointers: 5772
 79.8583 percent faster
 -------


 With -frelease
 -------
 Sorting with Array.opIndex: 10591
 Sorting with pointers: 4771
 54.9523 percent faster

 Sorting with Array.opIndex: 10289
 Sorting with pointers: 4710
 54.223 percent faster

 Sorting with Array.opIndex: 11305
 Sorting with pointers: 5216
 53.8611 percent faster
 -------


 With -frelease -fno-bounds-check
 -------
 Sorting with Array.opIndex: 11651
 Sorting with pointers: 5236
 55.0597 percent faster

 Sorting with Array.opIndex: 9873
 Sorting with pointers: 4559
 53.8236 percent faster

 Sorting with Array.opIndex: 10361
 Sorting with pointers: 4745
 54.2033 percent faster
 -------


 GDC doesn't use DMD's FE inliner, but results from the GCC backend's
 inliner:
 -------
 Considering inline candidate check.
  Inlining check into fillUp.
 Merging blocks 9 and 10
 Merging blocks 9 and 11

 Considering inline candidate initialize.
  Inlining initialize into emplace.
 Merging blocks 2 and 3
 Merging blocks 2 and 4

 Considering inline candidate bench2.
 Not inlining: code size would grow by 77 insns.
 Considering inline candidate bench1.
 Not inlining: code size would grow by 49 insns.
 -------


 So there's _me_ seriously doubting that inlining has anything to do with
 the
 50% increase.

 Regards
 Iain

Dec 12 2010

"Craig Black" <craigblack2 cox.net> writes:

If the problem is not inlining, then why does the same code in C++ does not 
suffer from this performance limitation (when compiled with Visual C++ 
2008)?

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>

template <class T>
T min(T a, T b) { return a < b ? a : b; }
template <class T>
T max(T a, T b) { return a > b ? a : b; }

inline void *operator new(size_t, void *ptr) { return ptr; }
inline void operator delete(void *, void *) {}

template <class T>
class Array
{
public:

  Array() {}
  Array(int length) { resize(length); }
  Array(const Array<T> &a) { append(a); }

  void clear() { base.clear(); }

  void resize(int sz)
  {
    assert(sz >= 0);
    if(sz == 0) return clear();
    if(!base.array) reserve(sz, sz);
    *base.lengthPtr() = sz;
  }

  void reserve(int capacity, int length)
  {
    assert(length <= capacity);
    if(capacity == 0) return clear();
    ArrayBase old;
    if(base.array)
    {
      if(base.capacity() >= capacity) return;
      old.array = base.array;
      base.array = 0;
    }
    base.array = (char*)(new char[capacity*sizeof(T)+8]);
    *base.lengthPtr() = length;
    *base.capacityPtr() = capacity;
    for(int i = 0; i < capacity; i++) new(base.data()+i)T;
    if(old.array) copyData(old);
  }

  int length() const { return base.length(); }
  int capacity() const { return base.capacity(); }

  bool empty() const { return base.array == 0; }

  T &at(int i)
  {
   assert(!empty());
    assert(i >= 0 && i < length());
    return base.data()[i];
  }

  const T &at(int i) const
  {
    assert(!empty());
    assert(i >= 0 && i < length());
    return base.data()[i];
  }

  T &operator [](int i) { return at(i); }
  const T &operator [](int i) const { return at(i); }

  void operator = (const Array &array)
  {
    copy(array);
  }

  void copy(const Array &array)
  {
    if(array.empty()) return clear();
    int len = array.length();
    reserve(len, len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  T &front() { return at(0); }
  const T &front() const { return at(0); }
  T &back() { return at(length()-1); }
  const T &back() const { return at(length()-1); }

  T &append()
  {
    int sz = length();
    int sp = capacity();
    if(sp > sz) (*base.lengthPtr())++;
    else
    {
      sz++;
      sp = max(max(2, sp+sp/2), sz);
      reserve(sp, sz);
    }
    return back();
  }

  void append(const Array<T> &a)
  {
    int prevLen = length();
    resize(prevLen + a.length());
    for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
  }

private:

  class ArrayBase
  {
  public:

    ArrayBase() { array = 0; }
    ~ArrayBase() { clear(); }
    void clear() { if(array) delete array; }

    int length() const { return array ? *lengthPtr() : 0; }
    int capacity() const { return array ? *capacityPtr() : 0; }
    int* lengthPtr() const { return (int*)array; }
    int* capacityPtr() const { return (int*)(array+4); }
    T* data() const { return (T*)(array+8); }

    char* array;
  };

  ArrayBase base;

  void copyData(ArrayBase &array)
  {
    int copyLen = min(length(), array.length());
    for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
  }
};

template <class T>
struct Less
{
  bool operator() (T a, T b) { return a < b; }
};

template <class T>
void swap(T &a, T &b)
{
      T temp = a;
      a = b;
      b = temp;
}

template <class T, class L>
void insertionSort1(Array<T> &a, int low, int high, L less)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(less(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

template<class T, class L>
int partition1(Array<T> &a, int p, int r, L less)
{
  T x = a[r];
  int j = p - 1;
  for (int i = p; i < r; i++)
  {
    if (less(x, a[i])) continue;
    swap(a[i], a[++j]);
  }
  a[r] = a[j + 1];
  a[j + 1] = x;
  return j + 1;
}

template<class T, class L>
void quickSort1(Array<T> &a, int p, int r, L less)
{
  if(p + 7 > r) return insertionSort1(a, p, r, less);
  if (p < r)
  {
    int q = partition1(a, p, r, less);
    quickSort1(a, p, q - 1, less);
    quickSort1(a, q + 1, r, less);
  }
}

template <class T, class L>
void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }

template <class T, class L>
void insertionSort2(T *a, int low, int high, L less)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(less(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

template<class T, class L>
int partition2(T *a, int p, int r, L less)
{
  T x = a[r];
  int j = p - 1;
  for (int i = p; i < r; i++)
  {
    if (less(x, a[i])) continue;
    swap(a[i], a[++j]);
  }
  a[r] = a[j + 1];
  a[j + 1] = x;
  return j + 1;
}

template<class T, class L>
void quickSort2(T *a, int p, int r, L less)
{
  if(p + 7 > r) return insertionSort2(a, p, r, less);
  if (p < r)
  {
    int q = partition2(a, p, r, less);
    quickSort2(a, p, q - 1, less);
    quickSort2(a, q + 1, r, less);
  }
}

template <class T, class L>
void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }

typedef unsigned long long uint64;

uint64 getCycle() {
    __asm { RDTSC }
}

Array<double> vals;

uint64 bench1()
{
      uint64 startTime = getCycle();
  Array<double> v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort1<double, Less<double> >(v);
  }
      return getCycle()-startTime;
}

uint64 bench2()
{
      uint64 startTime = getCycle();
  Array<double> v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort2<double, Less<double> >(v);
  }
      return getCycle()-startTime;
}

void main()
{
  vals.resize(1000);
  for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;

  uint64 time1 = 0;
  uint64 time2 = 0;

  for(int i = 0; i < 100; i++)
  {
    time1 += bench1();
    time2 += bench2();
  }
  printf("Sorting with Array: %f\n", time1/100000.0);
  printf("Sorting with pointers: %f\n", time2/100000.0);
  printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}

Dec 12 2010

Iain Buclaw <ibuclaw ubuntu.com> writes:

== Quote from Craig Black (craigblack2 cox.net)'s article
 If the problem is not inlining, then why does the same code in C++ does not
 suffer from this performance limitation (when compiled with Visual C++
 2008)?
 #include <stdio.h>
 #include <assert.h>
 #include <stdlib.h>
 template <class T>
 T min(T a, T b) { return a < b ? a : b; }
 template <class T>
 T max(T a, T b) { return a > b ? a : b; }
 inline void *operator new(size_t, void *ptr) { return ptr; }
 inline void operator delete(void *, void *) {}
 template <class T>
 class Array
 {
 public:
   Array() {}
   Array(int length) { resize(length); }
   Array(const Array<T> &a) { append(a); }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = 0;
     }
     base.array = (char*)(new char[capacity*sizeof(T)+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) new(base.data()+i)T;
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == 0; }
   T &at(int i)
   {
    assert(!empty());
     assert(i >= 0 && i < length());
     return base.data()[i];
   }
   const T &at(int i) const
   {
     assert(!empty());
     assert(i >= 0 && i < length());
     return base.data()[i];
   }
   T &operator [](int i) { return at(i); }
   const T &operator [](int i) const { return at(i); }
   void operator = (const Array &array)
   {
     copy(array);
   }
   void copy(const Array &array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   T &front() { return at(0); }
   const T &front() const { return at(0); }
   T &back() { return at(length()-1); }
   const T &back() const { return at(length()-1); }
   T &append()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(max(2, sp+sp/2), sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(const Array<T> &a)
   {
     int prevLen = length();
     resize(prevLen + a.length());
     for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
   }
 private:
   class ArrayBase
   {
   public:
     ArrayBase() { array = 0; }
     ~ArrayBase() { clear(); }
     void clear() { if(array) delete array; }
     int length() const { return array ? *lengthPtr() : 0; }
     int capacity() const { return array ? *capacityPtr() : 0; }
     int* lengthPtr() const { return (int*)array; }
     int* capacityPtr() const { return (int*)(array+4); }
     T* data() const { return (T*)(array+8); }
     char* array;
   };
   ArrayBase base;
   void copyData(ArrayBase &array)
   {
     int copyLen = min(length(), array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 };
 template <class T>
 struct Less
 {
   bool operator() (T a, T b) { return a < b; }
 };
 template <class T>
 void swap(T &a, T &b)
 {
       T temp = a;
       a = b;
       b = temp;
 }
 template <class T, class L>
 void insertionSort1(Array<T> &a, int low, int high, L less)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(less(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 template<class T, class L>
 int partition1(Array<T> &a, int p, int r, L less)
 {
   T x = a[r];
   int j = p - 1;
   for (int i = p; i < r; i++)
   {
     if (less(x, a[i])) continue;
     swap(a[i], a[++j]);
   }
   a[r] = a[j + 1];
   a[j + 1] = x;
   return j + 1;
 }
 template<class T, class L>
 void quickSort1(Array<T> &a, int p, int r, L less)
 {
   if(p + 7 > r) return insertionSort1(a, p, r, less);
   if (p < r)
   {
     int q = partition1(a, p, r, less);
     quickSort1(a, p, q - 1, less);
     quickSort1(a, q + 1, r, less);
   }
 }
 template <class T, class L>
 void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }
 template <class T, class L>
 void insertionSort2(T *a, int low, int high, L less)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(less(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 template<class T, class L>
 int partition2(T *a, int p, int r, L less)
 {
   T x = a[r];
   int j = p - 1;
   for (int i = p; i < r; i++)
   {
     if (less(x, a[i])) continue;
     swap(a[i], a[++j]);
   }
   a[r] = a[j + 1];
   a[j + 1] = x;
   return j + 1;
 }
 template<class T, class L>
 void quickSort2(T *a, int p, int r, L less)
 {
   if(p + 7 > r) return insertionSort2(a, p, r, less);
   if (p < r)
   {
     int q = partition2(a, p, r, less);
     quickSort2(a, p, q - 1, less);
     quickSort2(a, q + 1, r, less);
   }
 }
 template <class T, class L>
 void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }
 typedef unsigned long long uint64;
 uint64 getCycle() {
     __asm { RDTSC }
 }
 Array<double> vals;
 uint64 bench1()
 {
       uint64 startTime = getCycle();
   Array<double> v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1<double, Less<double> >(v);
   }
       return getCycle()-startTime;
 }
 uint64 bench2()
 {
       uint64 startTime = getCycle();
   Array<double> v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2<double, Less<double> >(v);
   }
       return getCycle()-startTime;
 }
 void main()
 {
   vals.resize(1000);
   for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;
   uint64 time1 = 0;
   uint64 time2 = 0;
   for(int i = 0; i < 100; i++)
   {
     time1 += bench1();
     time2 += bench2();
   }
   printf("Sorting with Array: %f\n", time1/100000.0);
   printf("Sorting with pointers: %f\n", time2/100000.0);
   printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
 }

D can use inline asm in much the same way, but that's deterring from your
point. =)


import std.c.stdio;
import std.random;
import std.conv;
import std.algorithm;
import std.range;

struct Array(T)
{
public:

 this(int length) { resize(length); }
 this(T[] a) { append(a); }

 this(this)
 {
  if(!base.array) return;
  ArrayBase!T old;
  old = base;
  base.array = null;
  reserve(old.length(), old.length());
  copyData(old);
  old.array = null;
 }

 void clear() { base.clear(); }

 void resize(int sz)
 {
  assert(sz >= 0);
  if(sz == 0) return clear();
  if(!base.array) reserve(sz, sz);
  *base.lengthPtr() = sz;
 }

 void reserve(int capacity, int length)
 {
  assert(length <= capacity);
  if(capacity == 0) return clear();
  ArrayBase!T old;
  if(base.array)
  {
    if(base.capacity() >= capacity) return;
    old.array = base.array;
    base.array = null;
  }
  base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
  *base.lengthPtr() = length;
  *base.capacityPtr() = capacity;

  for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
  if(old.array) copyData(old);
 }

 int length() const { return base.length(); }
 int capacity() const { return base.capacity(); }

 bool empty() const { return base.array == null; }

 ref T at(int i)
 {
  assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out
of bounds of empty array");
  assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
  return base.data()[i];
 }

 ref const T at(int i) const
 {
  assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out
of bounds of empty array");
  assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-
1));
  return base.data()[i];
 }

 const ref T opIndex(int i) const { return at(i); }
 void opIndexAssign(T t, int i) { at(i) = t; }
 void opIndexAssign(ref T t, int i) { at(i) = t; }

 void opAssign(ref const Array!T array)
 {
  copy(array);
 }

 void opAssign(T[] array)
 {
  int len = array.length;
  resize(len);
  for(int i = 0; i < len; i++) at(i) = array[i];
 }

 void copy(ref const Array!T array)
 {
  if(array.empty()) return clear();
  int len = array.length();
  reserve(len, len);
  for(int i = 0; i < len; i++) at(i) = array[i];
 }

 void opOpAssign(string op, A...)(A a) if(op == "~")
 {
  appendComposite(a);
 }

 ref T front() { return at(0); }
 const ref T front() const { return at(0); }
 ref T back() { return at(length()-1); }
 const ref T back() const { return at(length()-1); }

 ref T appendOne()
 {
  int sz = length();
  int sp = capacity();
  if(sp > sz) (*base.lengthPtr())++;
  else
  {
    sz++;
    sp = max(2, sp+sp/2, sz);
    reserve(sp, sz);
  }
  return back();
 }

 void append(A...)(A a)
 {
  static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) ==
T[])))
  {
    appendComposite(a[0]);
  } else {
    appendTuple(a);
  }
 }

 void appendTuple(A...)(A a)
 {
  foreach(x; a) appendOne() = x;
 }

 void appendComposite(A)(A a)
 {
  int prevLen = length();
  resize(prevLen + a.length);
  for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
 }

private:

    static struct ArrayBase(T)
    {
    public:

          ~this() { clear(); }
          void clear() { if(array) delete array; }

          int length() const { return array ? *lengthPtr() : 0; }
          int capacity() const { return array ? *capacityPtr() : 0; }
          int* lengthPtr() const { return cast(int*)(array); }
          int* capacityPtr() const { return cast(int*)(array+4); }
          T* data() const { return cast(T*)(array+8); }

          ubyte* array;
    }

 ArrayBase!T base;

 void copyData(ref ArrayBase!T array)
 {
  int copyLen = min(length, array.length());
  for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
 }
}

static bool less(T)(T a, T b) { return a < b; }

void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
 for(int i = low; i <= high; i++)
 {
  int min = i;
  for(int j = i + 1; j <= high; j++)
    if(L(a[j], a[min])) min = j;
  swap(a[i], a[min]);
 }
}

int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
    T x = a[r];
    int j = p - 1;
    for (int i = p; i < r; i++)
    {
          if (L(x, a[i])) continue;
          swap(a[i], a[++j]);
    }
    a[r] = a[j + 1];
    a[j + 1] = x;
    return j + 1;
}

void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
 if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
 if (p < r)
    {
          int q = partition1!(T, L)(a, p, r);
          quickSort1!(T, L)(a, p, q - 1);
          quickSort1!(T, L)(a, q + 1, r);
    }
}

void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
a.length()-1); }

void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
 for(int i = low; i <= high; i++)
 {
  int min = i;
  for(int j = i + 1; j <= high; j++)
    if(L(a[j], a[min])) min = j;
  swap(a[i], a[min]);
 }
}

int partition2(T, alias L = less!T)(T *a, int p, int r)
{
    T x = a[r];
    int j = p - 1;
    for (int i = p; i < r; i++)
    {
          if (L(x, a[i])) continue;
          swap(a[i], a[++j]);
    }
    a[r] = a[j + 1];
    a[j + 1] = x;
    return j + 1;
}

void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
 if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
 if (p < r)
    {
          int q = partition2!(T, L)(a, p, r);
          quickSort2!(T, L)(a, p, q - 1);
          quickSort2!(T, L)(a, q + 1, r);
    }
}

void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0,
length-1); }

ulong getCycle() {
 asm { rdtsc; }
}

double[] vals;

ulong bench1()
{
    ulong startTime = getCycle();

 Array!double v;
 for(int i = 0; i < 100; i++)
 {
  v = vals;
  sort1(v);
 }
    return getCycle() - startTime;
}

ulong bench2()
{
    ulong startTime = getCycle();

 Array!double v;
 for(int i = 0; i < 100; i++)
 {
  v = vals;
  sort2(&v[0], v.length);
 }
    return getCycle() - startTime;
}

void main()
{
 Mt19937 gen;
 vals.length = 1000;
 for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);

 ulong time1;
 ulong time2;

 for(int i = 0; i < 100; i++)
 {
  time1 += bench1();
  time2 += bench2();
 }
 printf("Sorting with Array.opIndex: %f\n", time1/1e5);
 printf("Sorting with pointers: %f\n", time2/1e5);
 printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}


Testing your C++ program (altered getCycle() for GCC)

Times I get:
-------
Sorting with Array: 46869.159840
Sorting with pointers: 38688.937320
17.453316 percent faster

Sorting with Array: 46631.903760
Sorting with pointers: 38520.609360
17.394302 percent faster

Sorting with Array: 46674.330720
Sorting with pointers: 38545.202520
17.416700 percent faster
-------


On a , I thought I might try an older version of GCC for the D program.
Really surprisingly, I got:

-------
Sorting with Array.opIndex: 43075.059840
Sorting with pointers: 40019.701920
7.093102 percent faster

Sorting with Array.opIndex: 42940.085640
Sorting with pointers: 39594.089040
7.792245 percent faster

Sorting with Array.opIndex: 44389.127280
Sorting with pointers: 41159.016960
7.276805 percent faster
-------

This will need some thinking through as to just what happened between GDC-4.3
-> GDC-4.4 :~)

Regards

Dec 13 2010

"Craig Black" <craigblack2 cox.net> writes:

 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between 
 GDC-4.3 -> GDC-4.4 :~)

 Regards

Curious benches there.  Seems GCC isn't inlining (or other optimization) as 
well as Visual C++ here.  I'm getting a neglible ~2% difference.

-Craig

Dec 13 2010

Iain Buclaw <ibuclaw ubuntu.com> writes:

== Quote from Craig Black (craigblack2 cox.net)'s article
 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between
 GDC-4.3 -> GDC-4.4 :~)

 Regards

 Curious benches there.  Seems GCC isn't inlining (or other optimization) as
 well as Visual C++ here.  I'm getting a neglible ~2% difference.
 -Craig

OK, found and fixed the problem with GDC-4.4. My results are now:

g++-4.4 -O3 -march=native -ftree-vectorize
-------
Sorting with Array: 45861.599640
Sorting with pointers: 38417.632680
16.231372 percent faster

Sorting with Array: 46421.979120
Sorting with pointers: 38474.709840
17.119626 percent faster

Sorting with Array: 48622.885200
Sorting with pointers: 39593.385600
18.570473 percent faster
-------

gdc-4.4 -frelease -O3 -march=native -ftree-vectorize
-------
Sorting with Array.opIndex: 41454.052200
Sorting with pointers: 38331.517560
7.532520 percent faster

Sorting with Array.opIndex: 41109.852720
Sorting with pointers: 37836.883320
7.961521 percent faster

Sorting with Array.opIndex: 40587.349320
Sorting with pointers: 37390.488120
7.876497 percent faster
-------

dmd-2.050 -release -O -inline
-------
Sorting with Array.opIndex: 53561.598720
Sorting with pointers: 48348.626760
9.732667 percent faster

Sorting with Array.opIndex: 55861.658280
Sorting with pointers: 49909.880760
10.654495 percent faster

Sorting with Array.opIndex: 56887.660800
Sorting with pointers: 51470.453400
9.522640 percent faster
-------

I noticed that with dmd -inline needs to be explicitly set. Otherwise there
appears to be nothing to suggest a wild difference
between using opIndex and pointers. :~)

Regards

Dec 13 2010

"Craig Black" <craigblack2 cox.net> writes:

"Iain Buclaw" <ibuclaw ubuntu.com> wrote in message 
news:ie6ahr$1coc$1 digitalmars.com...
 == Quote from Craig Black (craigblack2 cox.net)'s article
 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between
 GDC-4.3 -> GDC-4.4 :~)

 Regards

 Curious benches there.  Seems GCC isn't inlining (or other optimization) 
 as
 well as Visual C++ here.  I'm getting a neglible ~2% difference.
 -Craig

 OK, found and fixed the problem with GDC-4.4. My results are now:

 g++-4.4 -O3 -march=native -ftree-vectorize
 -------
 Sorting with Array: 45861.599640
 Sorting with pointers: 38417.632680
 16.231372 percent faster

 Sorting with Array: 46421.979120
 Sorting with pointers: 38474.709840
 17.119626 percent faster

 Sorting with Array: 48622.885200
 Sorting with pointers: 39593.385600
 18.570473 percent faster
 -------

 gdc-4.4 -frelease -O3 -march=native -ftree-vectorize
 -------
 Sorting with Array.opIndex: 41454.052200
 Sorting with pointers: 38331.517560
 7.532520 percent faster

 Sorting with Array.opIndex: 41109.852720
 Sorting with pointers: 37836.883320
 7.961521 percent faster

 Sorting with Array.opIndex: 40587.349320
 Sorting with pointers: 37390.488120
 7.876497 percent faster
 -------

 dmd-2.050 -release -O -inline
 -------
 Sorting with Array.opIndex: 53561.598720
 Sorting with pointers: 48348.626760
 9.732667 percent faster

 Sorting with Array.opIndex: 55861.658280
 Sorting with pointers: 49909.880760
 10.654495 percent faster

 Sorting with Array.opIndex: 56887.660800
 Sorting with pointers: 51470.453400
 9.522640 percent faster
 -------

 I noticed that with dmd -inline needs to be explicitly set. Otherwise 
 there appears to be nothing to suggest a wild difference
 between using opIndex and pointers. :~)

 Regards

Thanks for your help and interest.  Basically you are saying I should use 
GDC instead?  I've heard rumors that GDC is not as stable.  Can you comment 
on this?  I am still getting different results than you with dmd, but maybe 
that's because we have different machines?

My results on a core i7 1.86 GHz notebook:

dmd-2.050 -O -release -inline

Sorting with Array.opIndex: 19160.2
Sorting with pointers: 15006.4
21.6791 percent faster

Sorting with Array.opIndex: 20200.8
Sorting with pointers: 15699.1
22.2887 percent faster

Sorting with Array.opIndex: 19563.3
Sorting with pointers: 15418.4
21.1872 percent faster

-Craig

Dec 14 2010

"Nick Voronin" <elfy.nv gmail.com> writes:

On Mon, 13 Dec 2010 01:50:50 +0300, Craig Black <craigblack2 cox.net>  
wrote:

 The following program illustrates the problems with inlining in the dmd  
 compiler.  Perhaps with some more work I can reduce it to a smaller test  
 case.  I was playing around with a simple Array template, and noticed  
 that similar C++ code performs much better.  This is due, at least in  
 part, to opIndex not being properly inlined by dmd.  There are two sort  
 functions, quickSort1 and quickSort2.  quickSort1 indexes an Array data  
 structure. quickSort2 indexes raw pointers.  quickSort2 is roughly 20%  
 faster on my core i7.

Compiled with dmd v2.050/win32  -g -O -inline -release

First, I looked in debugger on actual asm and I must say inlining is done  
very well. Code for two versions is almost identical with slight overhead  
in case of Array for there is extra level of indirection in data access,  
inlining or not.

Second, I have anywhere from 3.3 to 6.7% difference in performance, but no  
more than that. Tested on Core2Duo E6300, Windows XP SP3. I increased  
number of iterations for benchmark!() to 5 to reduce volatility of  
results. That's the only change to source I did.

Third... Now here is a funny thing. Absolute times and difference between  
implementation depends on how do you run the program. I was dumbfounded as  
of how does it matter, but the fact is that aforementioned avg 5%  
difference I get if I run it with command line as "inline.exe". If I run  
it as "inline" without extension I get difference around 15% and absolute  
times are notably smaller.


X:\d\tests\craig>inline.exe
Sorting with Array.opIndex: 6533
Sorting with pointers: 6264
4.11756 percent faster

X:\d\tests\craig>inline
Sorting with Array.opIndex: 5390
Sorting with pointers: 4674
13.2839 percent faster

Something like that. It's not a fluke. I tested it on my old AthlonXp with  
XP SP2 and saw exactly the same picture (btw, difference in % between  
implementation was about the same).

I ran both variants under stracent and found no difference except one  
pointer on the stack when LeaveCriticalSection and GetCurrentThreadId are  
called was always off by 4 bytes. This made me thinking. The only  
observable difference is length of command line. And indeed, renaming  
program showed that only length of command line is a reason, not the  
content.

Further tests suggest that some value is either aligned to 8 byte or not  
depending on length of command line and this makes all the difference  
(which happens to be greater than difference between implementations of  
sorting). I couldn't find what value causes slowdown though.

-- 
Using Opera's revolutionary email client: http://www.opera.com/mail/

Dec 16 2010

"Nick Voronin" <elfy.nv gmail.com> writes:

Looks like alignment of local variables is somewhat broken.

import std.stdio;

void main()
{
	int a;
	double d;
	writeln(&a," ", &d);
}

 testdl.exe

12FE70 12FE78

 testdl

12FE74 12FE7C

It's not like double is not aligned, otherwise it would be placed right  
after int, 4 bytes apart, but it is aligned with some assumption which may  
or may not hold depending on command line. Bug? I hope doubles _should_ be  
aligned by compiler?


On Fri, 17 Dec 2010 09:50:53 +0300, Nick Voronin <elfy.nv gmail.com> wrote:

 Third... Now here is a funny thing. Absolute times and difference  
 between implementation depends on how do you run the program. I was  
 dumbfounded as of how does it matter, but the fact is that  
 aforementioned avg 5% difference I get if I run it with command line as  
 "inline.exe". If I run it as "inline" without extension I get difference  
 around 15% and absolute times are notably smaller.


 X:\d\tests\craig>inline.exe
 Sorting with Array.opIndex: 6533
 Sorting with pointers: 6264
 4.11756 percent faster

 X:\d\tests\craig>inline
 Sorting with Array.opIndex: 5390
 Sorting with pointers: 4674
 13.2839 percent faster

 Something like that. It's not a fluke. I tested it on my old AthlonXp  
 with XP SP2 and saw exactly the same picture (btw, difference in %  
 between implementation was about the same).

 I ran both variants under stracent and found no difference except one  
 pointer on the stack when LeaveCriticalSection and GetCurrentThreadId  
 are called was always off by 4 bytes. This made me thinking. The only  
 observable difference is length of command line. And indeed, renaming  
 program showed that only length of command line is a reason, not the  
 content.

 Further tests suggest that some value is either aligned to 8 byte or not  
 depending on length of command line and this makes all the difference  
 (which happens to be greater than difference between implementations of  
 sorting). I couldn't find what value causes slowdown though.


-- 
Using Opera's revolutionary email client: http://www.opera.com/mail/

Dec 17 2010

bearophile <bearophileHUGS lycos.com> writes:

Nick Voronin:

 Looks like alignment of local variables is somewhat broken.

This is a very nice bug. If not already present then please add it to Bugzilla.
If you don't want to add it, then I will add it myself. I have modified your
code like this:

import core.stdc.stdio: printf;
void main() {
    int a;
    double d;
    printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
}

And this shows the performance difference:

import core.stdc.stdio: printf;
import std.date: getUTCtime, ticksPerSecond;
void main() {
    double d = 0.0;
    auto t0 = getUTCtime();
    for (size_t i = 0; i < 100_000_000; i++)
        d += 1;
    auto t1 = getUTCtime();
    printf("%lf\n", d);
    printf("%u\n", (cast(size_t)&d) % 8);
    printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond);
}

Bye,
bearophile

Dec 17 2010

"Nick Voronin" <elfy.nv gmail.com> writes:

I found this on bugzilla.
http://d.puremagic.com/issues/show_bug.cgi?id=2278

So it's not really a bug. Yet there is no simple workaround. And gap  
between int and double looks strange when stack frame itself is unaligned.

btw, is there no explicit alignment for variables in D at all?
align(8) double d; compiles if d is global, but it does nothing.

import core.stdc.stdio: printf;

align(8) int a;
align(8) double d;

void main() {
     printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
}

This _always_ prints "4 4" for me. It seems that globals are not affected  
by environment, but not aligned anyway.

Honestly, I'm lost :) It all works not as expected but then again I can't  
find any evidences that my expectations are valid.

On Sat, 18 Dec 2010 01:57:44 +0300, bearophile <bearophileHUGS lycos.com>  
wrote:

 Nick Voronin:

 Looks like alignment of local variables is somewhat broken.

 This is a very nice bug. If not already present then please add it to  
 Bugzilla. If you don't want to add it, then I will add it myself. I have  
 modified your code like this:

 import core.stdc.stdio: printf;
 void main() {
     int a;
     double d;
     printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
 }

 And this shows the performance difference:

 import core.stdc.stdio: printf;
 import std.date: getUTCtime, ticksPerSecond;
 void main() {
     double d = 0.0;
     auto t0 = getUTCtime();
     for (size_t i = 0; i < 100_000_000; i++)
         d += 1;
     auto t1 = getUTCtime();
     printf("%lf\n", d);
     printf("%u\n", (cast(size_t)&d) % 8);
     printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond);
 }

 Bye,
 bearophile


-- 
Using Opera's revolutionary email client: http://www.opera.com/mail/

Dec 17 2010

bearophile <bearophileHUGS lycos.com> writes:

Nick Voronin:

 I found this on bugzilla.
 http://d.puremagic.com/issues/show_bug.cgi?id=2278
 
 So it's not really a bug.

It's borderline a performance bug. You may add that code to the issue 2278 as
further examples.

Bye,
bearophile

Dec 17 2010

Don <nospam nospam.com> writes:

Nick Voronin wrote:
 I found this on bugzilla.
 http://d.puremagic.com/issues/show_bug.cgi?id=2278
 
 So it's not really a bug. Yet there is no simple workaround. And gap 
 between int and double looks strange when stack frame itself is unaligned.

Interestingly, since I filed that bug, DMD for MacOSX was created, which 
  ensures that the stack stays aligned in the way I described! So the 
DMD backend is perfectly capable of doing it now.

 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.

That's a regression. Large globals are always aligned to a 16-byte 
boundary (see changelog for 2.007)
However this code:

import core.stdc.stdio: printf;

int a;
double[4] d;

void main() {
     printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
}
shows that it stopped doing that somewhere between 2.027 and 2.030.

Dec 17 2010

"Nick Voronin" <elfy.nv gmail.com> writes:

On Sat, 18 Dec 2010 04:17:46 +0300, Don <nospam nospam.com> wrote:

 Interestingly, since I filed that bug, DMD for MacOSX was created, which  
   ensures that the stack stays aligned in the way I described! So the  
 DMD backend is perfectly capable of doing it now.

I added some examples to that bugreport. Just in case :)

 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.

 That's a regression. Large globals are always aligned to a 16-byte  
 boundary (see changelog for 2.007)
 However this code:

 import core.stdc.stdio: printf;

 int a;
 double[4] d;

 void main() {
      printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
 }
 shows that it stopped doing that somewhere between 2.027 and 2.030.

Thanks. Filed it as http://d.puremagic.com/issues/show_bug.cgi?id=5355


-- 
Using Opera's revolutionary email client: http://www.opera.com/mail/

Dec 17 2010

Nick Voronin <elfy.nv gmail.com> writes:

On Sat, 18 Dec 2010 02:17:46 +0100
Don <nospam nospam.com> wrote:

 Nick Voronin wrote:
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.

 
 That's a regression. Large globals are always aligned to a 16-byte 
 boundary (see changelog for 2.007)

On second thought large globals in static segment (as log says) are probably
only those with __gshared prefix. And they do look aligned.

 However this code:
 
 import core.stdc.stdio: printf;
 
 int a;
 double[4] d;
 
 void main() {
      printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
 }
 shows that it stopped doing that somewhere between 2.027 and 2.030.


-- 
Nick Voronin <elfy.nv gmail.com>

Dec 18 2010

Don <nospam nospam.com> writes:

Nick Voronin wrote:
 On Sat, 18 Dec 2010 02:17:46 +0100
 Don <nospam nospam.com> wrote:
 
 Nick Voronin wrote:
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.

 That's a regression. Large globals are always aligned to a 16-byte 
 boundary (see changelog for 2.007)

 
 On second thought large globals in static segment (as log says) are probably
only those with __gshared prefix. And they do look aligned.

Good catch! Yes, that makes perfect sense.
So the bug is that align() is ignored for TLS variables.

Dec 20 2010

D Programming

C/C++ Programming

Other

digitalmars.D - Inlining Code Test