www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - Inlining Code Test

reply "Craig Black" <craigblack2 cox.net> writes:
The following program illustrates the problems with inlining in the dmd 
compiler.  Perhaps with some more work I can reduce it to a smaller test 
case.  I was playing around with a simple Array template, and noticed that 
similar C++ code performs much better.  This is due, at least in part, to 
opIndex not being properly inlined by dmd.  There are two sort functions, 
quickSort1 and quickSort2.  quickSort1 indexes an Array data structure. 
quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my 
core i7.

import std.stdio;
import std.date;
import std.random;
import std.conv;
import std.algorithm;
import std.range;

struct Array(T)
{
public:

  this(int length) { resize(length); }
  this(T[] a) { append(a); }

  this(this)
  {
    if(!base.array) return;
    ArrayBase!T old;
    old = base;
    base.array = null;
    reserve(old.length(), old.length());
    copyData(old);
    old.array = null;
  }

  void clear() { base.clear(); }

  void resize(int sz)
  {
    assert(sz >= 0);
    if(sz == 0) return clear();
    if(!base.array) reserve(sz, sz);
    *base.lengthPtr() = sz;
  }

  void reserve(int capacity, int length)
  {
    assert(length <= capacity);
    if(capacity == 0) return clear();
    ArrayBase!T old;
    if(base.array)
    {
      if(base.capacity() >= capacity) return;
      old.array = base.array;
      base.array = null;
    }
    base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
    *base.lengthPtr() = length;
    *base.capacityPtr() = capacity;

    for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
    if(old.array) copyData(old);
  }

  int length() const { return base.length(); }
  int capacity() const { return base.capacity(); }

  bool empty() const { return base.array == null; }

  ref T at(int i)
  {
    assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ 
" out of bounds of empty array");
    assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ 
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }

  ref const T at(int i) const
  {
    assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ 
" out of bounds of empty array");
    assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ 
to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
    return base.data()[i];
  }

  const ref T opIndex(int i) const { return at(i); }
  void opIndexAssign(T t, int i) { at(i) = t; }
  void opIndexAssign(ref T t, int i) { at(i) = t; }

  void opAssign(ref const Array!T array)
  {
    copy(array);
  }

  void opAssign(T[] array)
  {
    int len = array.length;
    resize(len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  void copy(ref const Array!T array)
  {
    if(array.empty()) return clear();
    int len = array.length();
    reserve(len, len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  void opOpAssign(string op, A...)(A a) if(op == "~")
  {
    appendComposite(a);
  }

  ref T front() { return at(0); }
  const ref T front() const { return at(0); }
  ref T back() { return at(length()-1); }
  const ref T back() const { return at(length()-1); }

  ref T appendOne()
  {
    int sz = length();
    int sp = capacity();
    if(sp > sz) (*base.lengthPtr())++;
    else
    {
      sz++;
      sp = max(2, sp+sp/2, sz);
      reserve(sp, sz);
    }
    return back();
  }

  void append(A...)(A a)
  {
    static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || 
is(typeof(a[0]) == T[])))
    {
      appendComposite(a[0]);
    } else {
      appendTuple(a);
    }
  }

  void appendTuple(A...)(A a)
  {
    foreach(x; a) appendOne() = x;
  }

  void appendComposite(A)(A a)
  {
    int prevLen = length();
    resize(prevLen + a.length);
    for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
  }

private:

      static struct ArrayBase(T)
      {
      public:

            ~this() { clear(); }
            void clear() { if(array) delete array; }

            int length() const { return array ? *lengthPtr() : 0; }
            int capacity() const { return array ? *capacityPtr() : 0; }
            int* lengthPtr() const { return cast(int*)(array); }
            int* capacityPtr() const { return cast(int*)(array+4); }
            T* data() const { return cast(T*)(array+8); }

            ubyte* array;
      }

  ArrayBase!T base;

  void copyData(ref ArrayBase!T array)
  {
    int copyLen = min(length, array.length());
    for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
  }
}

static bool less(T)(T a, T b) { return a < b; }

void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}

void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
{
  if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition1!(T, L)(a, p, r);
            quickSort1!(T, L)(a, p, q - 1);
            quickSort1!(T, L)(a, q + 1, r);
      }
}

void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, 
a.length()-1); }

void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(L(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

int partition2(T, alias L = less!T)(T *a, int p, int r)
{
      T x = a[r];
      int j = p - 1;
      for (int i = p; i < r; i++)
      {
            if (L(x, a[i])) continue;
            swap(a[i], a[++j]);
      }
      a[r] = a[j + 1];
      a[j + 1] = x;
      return j + 1;
}

void quickSort2(T, alias L = less!T)(T *a, int p, int r)
{
  if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
  if (p < r)
      {
            int q = partition2!(T, L)(a, p, r);
            quickSort2!(T, L)(a, p, q - 1);
            quickSort2!(T, L)(a, q + 1, r);
      }
}

void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, 
length-1); }

double[] vals;

void bench1()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort1(v);
  }
}

void bench2()
{
  Array!double v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort2(&v[0], v.length);
  }
}

void main()
{
  Mt19937 gen;
  vals.length = 1000;
  for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);

  ulong[] times = [0, 0];
  for(int i = 0; i < 100; i++)
  {
    auto times2 = benchmark!(bench1, bench2)(1);
    times[0] += times2[0];
    times[1] += times2[1];
  }
  writeln("Sorting with Array.opIndex: ", times[0]);
  writeln("Sorting with pointers: ", times[1]);
  writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
}
Dec 12 2010
next sibling parent reply Iain Buclaw <ibuclaw ubuntu.com> writes:
== Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed that
 similar C++ code performs much better.  This is due, at least in part, to
 opIndex not being properly inlined by dmd.  There are two sort functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }
Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
next sibling parent reply Andrej Mitrovic <andrej.mitrovich gmail.com> writes:
My crappy old Pentium 4 has gone totally mad:

Sorting with Array.opIndex: 45080
Sorting with pointers: 45608
4.092e+16 percent faster (??)

Compiled with dmd -profile -O -release -inline -noboundscheck

On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed that
 similar C++ code performs much better.  This is due, at least in part, to
 opIndex not being properly inlined by dmd.  There are two sort functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }
Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
parent reply "Craig Black" <craigblack2 cox.net> writes:
Try it without -profile.  That tends to slow everything down to a halt.

-Craig

"Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message 
news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...
 My crappy old Pentium 4 has gone totally mad:

 Sorting with Array.opIndex: 45080
 Sorting with pointers: 45608
 4.092e+16 percent faster (??)

 Compiled with dmd -profile -O -release -inline -noboundscheck

 On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed 
 that
 similar C++ code performs much better.  This is due, at least in part, 
 to
 opIndex not being properly inlined by dmd.  There are two sort 
 functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ 
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " 
 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ 
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " 
 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int 
 high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }
Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
parent Andrej Mitrovic <andrej.mitrovich gmail.com> writes:
Ye that's better:

Sorting with Array.opIndex: 2859
Sorting with pointers: 1765
38.2651 percent faster

And it only took 2 seconds. With profile it takes a full minute.

On 12/13/10, Craig Black <craigblack2 cox.net> wrote:
 Try it without -profile.  That tends to slow everything down to a halt.

 -Craig

 "Andrej Mitrovic" <andrej.mitrovich gmail.com> wrote in message
 news:mailman.948.1292198993.21107.digitalmars-d puremagic.com...
 My crappy old Pentium 4 has gone totally mad:

 Sorting with Array.opIndex: 45080
 Sorting with pointers: 45608
 4.092e+16 percent faster (??)

 Compiled with dmd -profile -O -release -inline -noboundscheck

 On 12/13/10, Iain Buclaw <ibuclaw ubuntu.com> wrote:
 == Quote from Craig Black (craigblack2 cox.net)'s article
 The following program illustrates the problems with inlining in the dmd
 compiler.  Perhaps with some more work I can reduce it to a smaller test
 case.  I was playing around with a simple Array template, and noticed
 that
 similar C++ code performs much better.  This is due, at least in part,
 to
 opIndex not being properly inlined by dmd.  There are two sort
 functions,
 quickSort1 and quickSort2.  quickSort1 indexes an Array data structure.
 quickSort2 indexes raw pointers.  quickSort2 is roughly 20% faster on my
 core i7.
 import std.stdio;
 import std.date;
 import std.random;
 import std.conv;
 import std.algorithm;
 import std.range;
 struct Array(T)
 {
 public:
   this(int length) { resize(length); }
   this(T[] a) { append(a); }
   this(this)
   {
     if(!base.array) return;
     ArrayBase!T old;
     old = base;
     base.array = null;
     reserve(old.length(), old.length());
     copyData(old);
     old.array = null;
   }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase!T old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = null;
     }
     base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) emplace!T(base.data()+i);
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == null; }
   ref T at(int i)
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "

 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   ref const T at(int i) const
   {
     assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~
 to!string(i)
 ~
 " out of bounds of empty array");
     assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index "

 ~
 to!string(i) ~ " out of bounds 0-" ~ to!string(length()-1));
     return base.data()[i];
   }
   const ref T opIndex(int i) const { return at(i); }
   void opIndexAssign(T t, int i) { at(i) = t; }
   void opIndexAssign(ref T t, int i) { at(i) = t; }
   void opAssign(ref const Array!T array)
   {
     copy(array);
   }
   void opAssign(T[] array)
   {
     int len = array.length;
     resize(len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void copy(ref const Array!T array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   void opOpAssign(string op, A...)(A a) if(op == "~")
   {
     appendComposite(a);
   }
   ref T front() { return at(0); }
   const ref T front() const { return at(0); }
   ref T back() { return at(length()-1); }
   const ref T back() const { return at(length()-1); }
   ref T appendOne()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(2, sp+sp/2, sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(A...)(A a)
   {
     static if(a.length == 1 && (is(typeof(a[0]) == Array!T) ||
 is(typeof(a[0]) == T[])))
     {
       appendComposite(a[0]);
     } else {
       appendTuple(a);
     }
   }
   void appendTuple(A...)(A a)
   {
     foreach(x; a) appendOne() = x;
   }
   void appendComposite(A)(A a)
   {
     int prevLen = length();
     resize(prevLen + a.length);
     for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i];
   }
 private:
       static struct ArrayBase(T)
       {
       public:
             ~this() { clear(); }
             void clear() { if(array) delete array; }
             int length() const { return array ? *lengthPtr() : 0; }
             int capacity() const { return array ? *capacityPtr() : 0; }
             int* lengthPtr() const { return cast(int*)(array); }
             int* capacityPtr() const { return cast(int*)(array+4); }
             T* data() const { return cast(T*)(array+8); }
             ubyte* array;
       }
   ArrayBase!T base;
   void copyData(ref ArrayBase!T array)
   {
     int copyLen = min(length, array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 }
 static bool less(T)(T a, T b) { return a < b; }
 void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int
 high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r)
 {
   if(p + 7 > r) return insertionSort1!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition1!(T, L)(a, p, r);
             quickSort1!(T, L)(a, p, q - 1);
             quickSort1!(T, L)(a, q + 1, r);
       }
 }
 void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0,
 a.length()-1); }
 void insertionSort2(T, alias L = less!T)(T *a, int low, int high)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(L(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 int partition2(T, alias L = less!T)(T *a, int p, int r)
 {
       T x = a[r];
       int j = p - 1;
       for (int i = p; i < r; i++)
       {
             if (L(x, a[i])) continue;
             swap(a[i], a[++j]);
       }
       a[r] = a[j + 1];
       a[j + 1] = x;
       return j + 1;
 }
 void quickSort2(T, alias L = less!T)(T *a, int p, int r)
 {
   if(p + 7 > r) return insertionSort2!(T, L)(a, p, r);
   if (p < r)
       {
             int q = partition2!(T, L)(a, p, r);
             quickSort2!(T, L)(a, p, q - 1);
             quickSort2!(T, L)(a, q + 1, r);
       }
 }
 void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a,
 0,
 length-1); }
 double[] vals;
 void bench1()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1(v);
   }
 }
 void bench2()
 {
   Array!double v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2(&v[0], v.length);
   }
 }
 void main()
 {
   Mt19937 gen;
   vals.length = 1000;
   for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0);
   ulong[] times = [0, 0];
   for(int i = 0; i < 100; i++)
   {
     auto times2 = benchmark!(bench1, bench2)(1);
     times[0] += times2[0];
     times[1] += times2[1];
   }
   writeln("Sorting with Array.opIndex: ", times[0]);
   writeln("Sorting with pointers: ", times[1]);
   writeln(100.0 * (times[0]-times[1]) / times[0], " percent faster");
 }
Testing on GDC with a Netbook, results from 3 consecutive runs are: Without -frelease ------- Sorting with Array.opIndex: 27981 Sorting with pointers: 5602 79.9793 percent faster Sorting with Array.opIndex: 25565 Sorting with pointers: 5179 79.7418 percent faster Sorting with Array.opIndex: 28657 Sorting with pointers: 5772 79.8583 percent faster ------- With -frelease ------- Sorting with Array.opIndex: 10591 Sorting with pointers: 4771 54.9523 percent faster Sorting with Array.opIndex: 10289 Sorting with pointers: 4710 54.223 percent faster Sorting with Array.opIndex: 11305 Sorting with pointers: 5216 53.8611 percent faster ------- With -frelease -fno-bounds-check ------- Sorting with Array.opIndex: 11651 Sorting with pointers: 5236 55.0597 percent faster Sorting with Array.opIndex: 9873 Sorting with pointers: 4559 53.8236 percent faster Sorting with Array.opIndex: 10361 Sorting with pointers: 4745 54.2033 percent faster ------- GDC doesn't use DMD's FE inliner, but results from the GCC backend's inliner: ------- Considering inline candidate check. Inlining check into fillUp. Merging blocks 9 and 10 Merging blocks 9 and 11 Considering inline candidate initialize. Inlining initialize into emplace. Merging blocks 2 and 3 Merging blocks 2 and 4 Considering inline candidate bench2. Not inlining: code size would grow by 77 insns. Considering inline candidate bench1. Not inlining: code size would grow by 49 insns. ------- So there's _me_ seriously doubting that inlining has anything to do with the 50% increase. Regards Iain
Dec 12 2010
prev sibling parent reply "Craig Black" <craigblack2 cox.net> writes:
If the problem is not inlining, then why does the same code in C++ does not 
suffer from this performance limitation (when compiled with Visual C++ 
2008)?

#include <stdio.h>
#include <assert.h>
#include <stdlib.h>

template <class T>
T min(T a, T b) { return a < b ? a : b; }
template <class T>
T max(T a, T b) { return a > b ? a : b; }

inline void *operator new(size_t, void *ptr) { return ptr; }
inline void operator delete(void *, void *) {}

template <class T>
class Array
{
public:

  Array() {}
  Array(int length) { resize(length); }
  Array(const Array<T> &a) { append(a); }

  void clear() { base.clear(); }

  void resize(int sz)
  {
    assert(sz >= 0);
    if(sz == 0) return clear();
    if(!base.array) reserve(sz, sz);
    *base.lengthPtr() = sz;
  }

  void reserve(int capacity, int length)
  {
    assert(length <= capacity);
    if(capacity == 0) return clear();
    ArrayBase old;
    if(base.array)
    {
      if(base.capacity() >= capacity) return;
      old.array = base.array;
      base.array = 0;
    }
    base.array = (char*)(new char[capacity*sizeof(T)+8]);
    *base.lengthPtr() = length;
    *base.capacityPtr() = capacity;
    for(int i = 0; i < capacity; i++) new(base.data()+i)T;
    if(old.array) copyData(old);
  }

  int length() const { return base.length(); }
  int capacity() const { return base.capacity(); }

  bool empty() const { return base.array == 0; }

  T &at(int i)
  {
   assert(!empty());
    assert(i >= 0 && i < length());
    return base.data()[i];
  }

  const T &at(int i) const
  {
    assert(!empty());
    assert(i >= 0 && i < length());
    return base.data()[i];
  }

  T &operator [](int i) { return at(i); }
  const T &operator [](int i) const { return at(i); }

  void operator = (const Array &array)
  {
    copy(array);
  }

  void copy(const Array &array)
  {
    if(array.empty()) return clear();
    int len = array.length();
    reserve(len, len);
    for(int i = 0; i < len; i++) at(i) = array[i];
  }

  T &front() { return at(0); }
  const T &front() const { return at(0); }
  T &back() { return at(length()-1); }
  const T &back() const { return at(length()-1); }

  T &append()
  {
    int sz = length();
    int sp = capacity();
    if(sp > sz) (*base.lengthPtr())++;
    else
    {
      sz++;
      sp = max(max(2, sp+sp/2), sz);
      reserve(sp, sz);
    }
    return back();
  }

  void append(const Array<T> &a)
  {
    int prevLen = length();
    resize(prevLen + a.length());
    for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
  }

private:

  class ArrayBase
  {
  public:

    ArrayBase() { array = 0; }
    ~ArrayBase() { clear(); }
    void clear() { if(array) delete array; }

    int length() const { return array ? *lengthPtr() : 0; }
    int capacity() const { return array ? *capacityPtr() : 0; }
    int* lengthPtr() const { return (int*)array; }
    int* capacityPtr() const { return (int*)(array+4); }
    T* data() const { return (T*)(array+8); }

    char* array;
  };

  ArrayBase base;

  void copyData(ArrayBase &array)
  {
    int copyLen = min(length(), array.length());
    for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
  }
};

template <class T>
struct Less
{
  bool operator() (T a, T b) { return a < b; }
};

template <class T>
void swap(T &a, T &b)
{
      T temp = a;
      a = b;
      b = temp;
}

template <class T, class L>
void insertionSort1(Array<T> &a, int low, int high, L less)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(less(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

template<class T, class L>
int partition1(Array<T> &a, int p, int r, L less)
{
  T x = a[r];
  int j = p - 1;
  for (int i = p; i < r; i++)
  {
    if (less(x, a[i])) continue;
    swap(a[i], a[++j]);
  }
  a[r] = a[j + 1];
  a[j + 1] = x;
  return j + 1;
}

template<class T, class L>
void quickSort1(Array<T> &a, int p, int r, L less)
{
  if(p + 7 > r) return insertionSort1(a, p, r, less);
  if (p < r)
  {
    int q = partition1(a, p, r, less);
    quickSort1(a, p, q - 1, less);
    quickSort1(a, q + 1, r, less);
  }
}

template <class T, class L>
void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }

template <class T, class L>
void insertionSort2(T *a, int low, int high, L less)
{
  for(int i = low; i <= high; i++)
  {
    int min = i;
    for(int j = i + 1; j <= high; j++)
      if(less(a[j], a[min])) min = j;
    swap(a[i], a[min]);
  }
}

template<class T, class L>
int partition2(T *a, int p, int r, L less)
{
  T x = a[r];
  int j = p - 1;
  for (int i = p; i < r; i++)
  {
    if (less(x, a[i])) continue;
    swap(a[i], a[++j]);
  }
  a[r] = a[j + 1];
  a[j + 1] = x;
  return j + 1;
}

template<class T, class L>
void quickSort2(T *a, int p, int r, L less)
{
  if(p + 7 > r) return insertionSort2(a, p, r, less);
  if (p < r)
  {
    int q = partition2(a, p, r, less);
    quickSort2(a, p, q - 1, less);
    quickSort2(a, q + 1, r, less);
  }
}

template <class T, class L>
void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }

typedef unsigned long long uint64;

uint64 getCycle() {
    __asm { RDTSC }
}

Array<double> vals;

uint64 bench1()
{
      uint64 startTime = getCycle();
  Array<double> v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort1<double, Less<double> >(v);
  }
      return getCycle()-startTime;
}

uint64 bench2()
{
      uint64 startTime = getCycle();
  Array<double> v;
  for(int i = 0; i < 100; i++)
  {
    v = vals;
    sort2<double, Less<double> >(v);
  }
      return getCycle()-startTime;
}

void main()
{
  vals.resize(1000);
  for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;

  uint64 time1 = 0;
  uint64 time2 = 0;

  for(int i = 0; i < 100; i++)
  {
    time1 += bench1();
    time2 += bench2();
  }
  printf("Sorting with Array: %f\n", time1/100000.0);
  printf("Sorting with pointers: %f\n", time2/100000.0);
  printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
}
Dec 12 2010
parent reply Iain Buclaw <ibuclaw ubuntu.com> writes:
== Quote from Craig Black (craigblack2 cox.net)'s article
 If the problem is not inlining, then why does the same code in C++ does not
 suffer from this performance limitation (when compiled with Visual C++
 2008)?
 #include <stdio.h>
 #include <assert.h>
 #include <stdlib.h>
 template <class T>
 T min(T a, T b) { return a < b ? a : b; }
 template <class T>
 T max(T a, T b) { return a > b ? a : b; }
 inline void *operator new(size_t, void *ptr) { return ptr; }
 inline void operator delete(void *, void *) {}
 template <class T>
 class Array
 {
 public:
   Array() {}
   Array(int length) { resize(length); }
   Array(const Array<T> &a) { append(a); }
   void clear() { base.clear(); }
   void resize(int sz)
   {
     assert(sz >= 0);
     if(sz == 0) return clear();
     if(!base.array) reserve(sz, sz);
     *base.lengthPtr() = sz;
   }
   void reserve(int capacity, int length)
   {
     assert(length <= capacity);
     if(capacity == 0) return clear();
     ArrayBase old;
     if(base.array)
     {
       if(base.capacity() >= capacity) return;
       old.array = base.array;
       base.array = 0;
     }
     base.array = (char*)(new char[capacity*sizeof(T)+8]);
     *base.lengthPtr() = length;
     *base.capacityPtr() = capacity;
     for(int i = 0; i < capacity; i++) new(base.data()+i)T;
     if(old.array) copyData(old);
   }
   int length() const { return base.length(); }
   int capacity() const { return base.capacity(); }
   bool empty() const { return base.array == 0; }
   T &at(int i)
   {
    assert(!empty());
     assert(i >= 0 && i < length());
     return base.data()[i];
   }
   const T &at(int i) const
   {
     assert(!empty());
     assert(i >= 0 && i < length());
     return base.data()[i];
   }
   T &operator [](int i) { return at(i); }
   const T &operator [](int i) const { return at(i); }
   void operator = (const Array &array)
   {
     copy(array);
   }
   void copy(const Array &array)
   {
     if(array.empty()) return clear();
     int len = array.length();
     reserve(len, len);
     for(int i = 0; i < len; i++) at(i) = array[i];
   }
   T &front() { return at(0); }
   const T &front() const { return at(0); }
   T &back() { return at(length()-1); }
   const T &back() const { return at(length()-1); }
   T &append()
   {
     int sz = length();
     int sp = capacity();
     if(sp > sz) (*base.lengthPtr())++;
     else
     {
       sz++;
       sp = max(max(2, sp+sp/2), sz);
       reserve(sp, sz);
     }
     return back();
   }
   void append(const Array<T> &a)
   {
     int prevLen = length();
     resize(prevLen + a.length());
     for(int i = 0; i < a.length(); i++) at(i+prevLen) = a[i];
   }
 private:
   class ArrayBase
   {
   public:
     ArrayBase() { array = 0; }
     ~ArrayBase() { clear(); }
     void clear() { if(array) delete array; }
     int length() const { return array ? *lengthPtr() : 0; }
     int capacity() const { return array ? *capacityPtr() : 0; }
     int* lengthPtr() const { return (int*)array; }
     int* capacityPtr() const { return (int*)(array+4); }
     T* data() const { return (T*)(array+8); }
     char* array;
   };
   ArrayBase base;
   void copyData(ArrayBase &array)
   {
     int copyLen = min(length(), array.length());
     for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; }
   }
 };
 template <class T>
 struct Less
 {
   bool operator() (T a, T b) { return a < b; }
 };
 template <class T>
 void swap(T &a, T &b)
 {
       T temp = a;
       a = b;
       b = temp;
 }
 template <class T, class L>
 void insertionSort1(Array<T> &a, int low, int high, L less)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(less(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 template<class T, class L>
 int partition1(Array<T> &a, int p, int r, L less)
 {
   T x = a[r];
   int j = p - 1;
   for (int i = p; i < r; i++)
   {
     if (less(x, a[i])) continue;
     swap(a[i], a[++j]);
   }
   a[r] = a[j + 1];
   a[j + 1] = x;
   return j + 1;
 }
 template<class T, class L>
 void quickSort1(Array<T> &a, int p, int r, L less)
 {
   if(p + 7 > r) return insertionSort1(a, p, r, less);
   if (p < r)
   {
     int q = partition1(a, p, r, less);
     quickSort1(a, p, q - 1, less);
     quickSort1(a, q + 1, r, less);
   }
 }
 template <class T, class L>
 void sort1(Array<T> &a) { quickSort1(a, 0, a.length()-1, Less<T>()); }
 template <class T, class L>
 void insertionSort2(T *a, int low, int high, L less)
 {
   for(int i = low; i <= high; i++)
   {
     int min = i;
     for(int j = i + 1; j <= high; j++)
       if(less(a[j], a[min])) min = j;
     swap(a[i], a[min]);
   }
 }
 template<class T, class L>
 int partition2(T *a, int p, int r, L less)
 {
   T x = a[r];
   int j = p - 1;
   for (int i = p; i < r; i++)
   {
     if (less(x, a[i])) continue;
     swap(a[i], a[++j]);
   }
   a[r] = a[j + 1];
   a[j + 1] = x;
   return j + 1;
 }
 template<class T, class L>
 void quickSort2(T *a, int p, int r, L less)
 {
   if(p + 7 > r) return insertionSort2(a, p, r, less);
   if (p < r)
   {
     int q = partition2(a, p, r, less);
     quickSort2(a, p, q - 1, less);
     quickSort2(a, q + 1, r, less);
   }
 }
 template <class T, class L>
 void sort2(Array<T> &a) { quickSort2(&a[0], 0, a.length()-1, Less<T>()); }
 typedef unsigned long long uint64;
 uint64 getCycle() {
     __asm { RDTSC }
 }
 Array<double> vals;
 uint64 bench1()
 {
       uint64 startTime = getCycle();
   Array<double> v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort1<double, Less<double> >(v);
   }
       return getCycle()-startTime;
 }
 uint64 bench2()
 {
       uint64 startTime = getCycle();
   Array<double> v;
   for(int i = 0; i < 100; i++)
   {
     v = vals;
     sort2<double, Less<double> >(v);
   }
       return getCycle()-startTime;
 }
 void main()
 {
   vals.resize(1000);
   for(int i = 0; i < 1000; i++) vals[i] = (rand() % 1000000) / 1000.0;
   uint64 time1 = 0;
   uint64 time2 = 0;
   for(int i = 0; i < 100; i++)
   {
     time1 += bench1();
     time2 += bench2();
   }
   printf("Sorting with Array: %f\n", time1/100000.0);
   printf("Sorting with pointers: %f\n", time2/100000.0);
   printf("%f percent faster\n", 100.0 * (time1-time2) / time1);
 }
D can use inline asm in much the same way, but that's deterring from your point. =) import std.c.stdio; import std.random; import std.conv; import std.algorithm; import std.range; struct Array(T) { public: this(int length) { resize(length); } this(T[] a) { append(a); } this(this) { if(!base.array) return; ArrayBase!T old; old = base; base.array = null; reserve(old.length(), old.length()); copyData(old); old.array = null; } void clear() { base.clear(); } void resize(int sz) { assert(sz >= 0); if(sz == 0) return clear(); if(!base.array) reserve(sz, sz); *base.lengthPtr() = sz; } void reserve(int capacity, int length) { assert(length <= capacity); if(capacity == 0) return clear(); ArrayBase!T old; if(base.array) { if(base.capacity() >= capacity) return; old.array = base.array; base.array = null; } base.array = cast(ubyte*)(new ubyte[capacity*T.sizeof+8]); *base.lengthPtr() = length; *base.capacityPtr() = capacity; for(int i = 0; i < capacity; i++) emplace!T(base.data()+i); if(old.array) copyData(old); } int length() const { return base.length(); } int capacity() const { return base.capacity(); } bool empty() const { return base.array == null; } ref T at(int i) { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } ref const T at(int i) const { assert(!empty(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds of empty array"); assert(i >= 0 && i < length(), "Array of " ~ T.stringof ~ ": index " ~ to!string(i) ~ " out of bounds 0-" ~ to!string(length()- 1)); return base.data()[i]; } const ref T opIndex(int i) const { return at(i); } void opIndexAssign(T t, int i) { at(i) = t; } void opIndexAssign(ref T t, int i) { at(i) = t; } void opAssign(ref const Array!T array) { copy(array); } void opAssign(T[] array) { int len = array.length; resize(len); for(int i = 0; i < len; i++) at(i) = array[i]; } void copy(ref const Array!T array) { if(array.empty()) return clear(); int len = array.length(); reserve(len, len); for(int i = 0; i < len; i++) at(i) = array[i]; } void opOpAssign(string op, A...)(A a) if(op == "~") { appendComposite(a); } ref T front() { return at(0); } const ref T front() const { return at(0); } ref T back() { return at(length()-1); } const ref T back() const { return at(length()-1); } ref T appendOne() { int sz = length(); int sp = capacity(); if(sp > sz) (*base.lengthPtr())++; else { sz++; sp = max(2, sp+sp/2, sz); reserve(sp, sz); } return back(); } void append(A...)(A a) { static if(a.length == 1 && (is(typeof(a[0]) == Array!T) || is(typeof(a[0]) == T[]))) { appendComposite(a[0]); } else { appendTuple(a); } } void appendTuple(A...)(A a) { foreach(x; a) appendOne() = x; } void appendComposite(A)(A a) { int prevLen = length(); resize(prevLen + a.length); for(int i = 0; i < a.length; i++) at(i+prevLen) = a[i]; } private: static struct ArrayBase(T) { public: ~this() { clear(); } void clear() { if(array) delete array; } int length() const { return array ? *lengthPtr() : 0; } int capacity() const { return array ? *capacityPtr() : 0; } int* lengthPtr() const { return cast(int*)(array); } int* capacityPtr() const { return cast(int*)(array+4); } T* data() const { return cast(T*)(array+8); } ubyte* array; } ArrayBase!T base; void copyData(ref ArrayBase!T array) { int copyLen = min(length, array.length()); for(int i = 0; i < copyLen; i++) { at(i) = array.data()[i]; } } } static bool less(T)(T a, T b) { return a < b; } void insertionSort1(T, alias L = less!T)(ref Array!T a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition1(T, alias L = less!T)(ref Array!T a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort1(T, alias L = less!T)(ref Array!T a, int p, int r) { if(p + 7 > r) return insertionSort1!(T, L)(a, p, r); if (p < r) { int q = partition1!(T, L)(a, p, r); quickSort1!(T, L)(a, p, q - 1); quickSort1!(T, L)(a, q + 1, r); } } void sort1(T, alias L = less!T)(ref Array!T a) { quickSort1!(T, L)(a, 0, a.length()-1); } void insertionSort2(T, alias L = less!T)(T *a, int low, int high) { for(int i = low; i <= high; i++) { int min = i; for(int j = i + 1; j <= high; j++) if(L(a[j], a[min])) min = j; swap(a[i], a[min]); } } int partition2(T, alias L = less!T)(T *a, int p, int r) { T x = a[r]; int j = p - 1; for (int i = p; i < r; i++) { if (L(x, a[i])) continue; swap(a[i], a[++j]); } a[r] = a[j + 1]; a[j + 1] = x; return j + 1; } void quickSort2(T, alias L = less!T)(T *a, int p, int r) { if(p + 7 > r) return insertionSort2!(T, L)(a, p, r); if (p < r) { int q = partition2!(T, L)(a, p, r); quickSort2!(T, L)(a, p, q - 1); quickSort2!(T, L)(a, q + 1, r); } } void sort2(T, alias L = less!T)(T *a, int length) { quickSort2!(T, L)(a, 0, length-1); } ulong getCycle() { asm { rdtsc; } } double[] vals; ulong bench1() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort1(v); } return getCycle() - startTime; } ulong bench2() { ulong startTime = getCycle(); Array!double v; for(int i = 0; i < 100; i++) { v = vals; sort2(&v[0], v.length); } return getCycle() - startTime; } void main() { Mt19937 gen; vals.length = 1000; for(int i = 0; i < 1000; i++) vals[i] = uniform(0.0,1000.0); ulong time1; ulong time2; for(int i = 0; i < 100; i++) { time1 += bench1(); time2 += bench2(); } printf("Sorting with Array.opIndex: %f\n", time1/1e5); printf("Sorting with pointers: %f\n", time2/1e5); printf("%f percent faster\n", 100.0 * (time1-time2) / time1); } Testing your C++ program (altered getCycle() for GCC) Times I get: ------- Sorting with Array: 46869.159840 Sorting with pointers: 38688.937320 17.453316 percent faster Sorting with Array: 46631.903760 Sorting with pointers: 38520.609360 17.394302 percent faster Sorting with Array: 46674.330720 Sorting with pointers: 38545.202520 17.416700 percent faster ------- On a , I thought I might try an older version of GCC for the D program. Really surprisingly, I got: ------- Sorting with Array.opIndex: 43075.059840 Sorting with pointers: 40019.701920 7.093102 percent faster Sorting with Array.opIndex: 42940.085640 Sorting with pointers: 39594.089040 7.792245 percent faster Sorting with Array.opIndex: 44389.127280 Sorting with pointers: 41159.016960 7.276805 percent faster ------- This will need some thinking through as to just what happened between GDC-4.3 -> GDC-4.4 :~) Regards
Dec 13 2010
parent reply "Craig Black" <craigblack2 cox.net> writes:
 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between 
 GDC-4.3 -> GDC-4.4 :~)

 Regards
Curious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
Dec 13 2010
parent reply Iain Buclaw <ibuclaw ubuntu.com> writes:
== Quote from Craig Black (craigblack2 cox.net)'s article
 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between
 GDC-4.3 -> GDC-4.4 :~)

 Regards
Curious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
OK, found and fixed the problem with GDC-4.4. My results are now: g++-4.4 -O3 -march=native -ftree-vectorize ------- Sorting with Array: 45861.599640 Sorting with pointers: 38417.632680 16.231372 percent faster Sorting with Array: 46421.979120 Sorting with pointers: 38474.709840 17.119626 percent faster Sorting with Array: 48622.885200 Sorting with pointers: 39593.385600 18.570473 percent faster ------- gdc-4.4 -frelease -O3 -march=native -ftree-vectorize ------- Sorting with Array.opIndex: 41454.052200 Sorting with pointers: 38331.517560 7.532520 percent faster Sorting with Array.opIndex: 41109.852720 Sorting with pointers: 37836.883320 7.961521 percent faster Sorting with Array.opIndex: 40587.349320 Sorting with pointers: 37390.488120 7.876497 percent faster ------- dmd-2.050 -release -O -inline ------- Sorting with Array.opIndex: 53561.598720 Sorting with pointers: 48348.626760 9.732667 percent faster Sorting with Array.opIndex: 55861.658280 Sorting with pointers: 49909.880760 10.654495 percent faster Sorting with Array.opIndex: 56887.660800 Sorting with pointers: 51470.453400 9.522640 percent faster ------- I noticed that with dmd -inline needs to be explicitly set. Otherwise there appears to be nothing to suggest a wild difference between using opIndex and pointers. :~) Regards
Dec 13 2010
parent "Craig Black" <craigblack2 cox.net> writes:
"Iain Buclaw" <ibuclaw ubuntu.com> wrote in message 
news:ie6ahr$1coc$1 digitalmars.com...
 == Quote from Craig Black (craigblack2 cox.net)'s article
 Testing your C++ program (altered getCycle() for GCC)

 Times I get:
 -------
 Sorting with Array: 46869.159840
 Sorting with pointers: 38688.937320
 17.453316 percent faster

 Sorting with Array: 46631.903760
 Sorting with pointers: 38520.609360
 17.394302 percent faster

 Sorting with Array: 46674.330720
 Sorting with pointers: 38545.202520
 17.416700 percent faster
 -------


 On a , I thought I might try an older version of GCC for the D program.
 Really surprisingly, I got:

 -------
 Sorting with Array.opIndex: 43075.059840
 Sorting with pointers: 40019.701920
 7.093102 percent faster

 Sorting with Array.opIndex: 42940.085640
 Sorting with pointers: 39594.089040
 7.792245 percent faster

 Sorting with Array.opIndex: 44389.127280
 Sorting with pointers: 41159.016960
 7.276805 percent faster
 -------

 This will need some thinking through as to just what happened between
 GDC-4.3 -> GDC-4.4 :~)

 Regards
Curious benches there. Seems GCC isn't inlining (or other optimization) as well as Visual C++ here. I'm getting a neglible ~2% difference. -Craig
OK, found and fixed the problem with GDC-4.4. My results are now: g++-4.4 -O3 -march=native -ftree-vectorize ------- Sorting with Array: 45861.599640 Sorting with pointers: 38417.632680 16.231372 percent faster Sorting with Array: 46421.979120 Sorting with pointers: 38474.709840 17.119626 percent faster Sorting with Array: 48622.885200 Sorting with pointers: 39593.385600 18.570473 percent faster ------- gdc-4.4 -frelease -O3 -march=native -ftree-vectorize ------- Sorting with Array.opIndex: 41454.052200 Sorting with pointers: 38331.517560 7.532520 percent faster Sorting with Array.opIndex: 41109.852720 Sorting with pointers: 37836.883320 7.961521 percent faster Sorting with Array.opIndex: 40587.349320 Sorting with pointers: 37390.488120 7.876497 percent faster ------- dmd-2.050 -release -O -inline ------- Sorting with Array.opIndex: 53561.598720 Sorting with pointers: 48348.626760 9.732667 percent faster Sorting with Array.opIndex: 55861.658280 Sorting with pointers: 49909.880760 10.654495 percent faster Sorting with Array.opIndex: 56887.660800 Sorting with pointers: 51470.453400 9.522640 percent faster ------- I noticed that with dmd -inline needs to be explicitly set. Otherwise there appears to be nothing to suggest a wild difference between using opIndex and pointers. :~) Regards
Thanks for your help and interest. Basically you are saying I should use GDC instead? I've heard rumors that GDC is not as stable. Can you comment on this? I am still getting different results than you with dmd, but maybe that's because we have different machines? My results on a core i7 1.86 GHz notebook: dmd-2.050 -O -release -inline Sorting with Array.opIndex: 19160.2 Sorting with pointers: 15006.4 21.6791 percent faster Sorting with Array.opIndex: 20200.8 Sorting with pointers: 15699.1 22.2887 percent faster Sorting with Array.opIndex: 19563.3 Sorting with pointers: 15418.4 21.1872 percent faster -Craig
Dec 14 2010
prev sibling next sibling parent reply "Nick Voronin" <elfy.nv gmail.com> writes:
On Mon, 13 Dec 2010 01:50:50 +0300, Craig Black <craigblack2 cox.net>  
wrote:

 The following program illustrates the problems with inlining in the dmd  
 compiler.  Perhaps with some more work I can reduce it to a smaller test  
 case.  I was playing around with a simple Array template, and noticed  
 that similar C++ code performs much better.  This is due, at least in  
 part, to opIndex not being properly inlined by dmd.  There are two sort  
 functions, quickSort1 and quickSort2.  quickSort1 indexes an Array data  
 structure. quickSort2 indexes raw pointers.  quickSort2 is roughly 20%  
 faster on my core i7.
Compiled with dmd v2.050/win32 -g -O -inline -release First, I looked in debugger on actual asm and I must say inlining is done very well. Code for two versions is almost identical with slight overhead in case of Array for there is extra level of indirection in data access, inlining or not. Second, I have anywhere from 3.3 to 6.7% difference in performance, but no more than that. Tested on Core2Duo E6300, Windows XP SP3. I increased number of iterations for benchmark!() to 5 to reduce volatility of results. That's the only change to source I did. Third... Now here is a funny thing. Absolute times and difference between implementation depends on how do you run the program. I was dumbfounded as of how does it matter, but the fact is that aforementioned avg 5% difference I get if I run it with command line as "inline.exe". If I run it as "inline" without extension I get difference around 15% and absolute times are notably smaller. X:\d\tests\craig>inline.exe Sorting with Array.opIndex: 6533 Sorting with pointers: 6264 4.11756 percent faster X:\d\tests\craig>inline Sorting with Array.opIndex: 5390 Sorting with pointers: 4674 13.2839 percent faster Something like that. It's not a fluke. I tested it on my old AthlonXp with XP SP2 and saw exactly the same picture (btw, difference in % between implementation was about the same). I ran both variants under stracent and found no difference except one pointer on the stack when LeaveCriticalSection and GetCurrentThreadId are called was always off by 4 bytes. This made me thinking. The only observable difference is length of command line. And indeed, renaming program showed that only length of command line is a reason, not the content. Further tests suggest that some value is either aligned to 8 byte or not depending on length of command line and this makes all the difference (which happens to be greater than difference between implementations of sorting). I couldn't find what value causes slowdown though. -- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 16 2010
parent "Nick Voronin" <elfy.nv gmail.com> writes:
Looks like alignment of local variables is somewhat broken.

import std.stdio;

void main()
{
	int a;
	double d;
	writeln(&a," ", &d);
}

 testdl.exe
12FE70 12FE78
 testdl
12FE74 12FE7C It's not like double is not aligned, otherwise it would be placed right after int, 4 bytes apart, but it is aligned with some assumption which may or may not hold depending on command line. Bug? I hope doubles _should_ be aligned by compiler? On Fri, 17 Dec 2010 09:50:53 +0300, Nick Voronin <elfy.nv gmail.com> wrote:
 Third... Now here is a funny thing. Absolute times and difference  
 between implementation depends on how do you run the program. I was  
 dumbfounded as of how does it matter, but the fact is that  
 aforementioned avg 5% difference I get if I run it with command line as  
 "inline.exe". If I run it as "inline" without extension I get difference  
 around 15% and absolute times are notably smaller.


 X:\d\tests\craig>inline.exe
 Sorting with Array.opIndex: 6533
 Sorting with pointers: 6264
 4.11756 percent faster

 X:\d\tests\craig>inline
 Sorting with Array.opIndex: 5390
 Sorting with pointers: 4674
 13.2839 percent faster

 Something like that. It's not a fluke. I tested it on my old AthlonXp  
 with XP SP2 and saw exactly the same picture (btw, difference in %  
 between implementation was about the same).

 I ran both variants under stracent and found no difference except one  
 pointer on the stack when LeaveCriticalSection and GetCurrentThreadId  
 are called was always off by 4 bytes. This made me thinking. The only  
 observable difference is length of command line. And indeed, renaming  
 program showed that only length of command line is a reason, not the  
 content.

 Further tests suggest that some value is either aligned to 8 byte or not  
 depending on length of command line and this makes all the difference  
 (which happens to be greater than difference between implementations of  
 sorting). I couldn't find what value causes slowdown though.
-- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 17 2010
prev sibling parent reply bearophile <bearophileHUGS lycos.com> writes:
Nick Voronin:

 Looks like alignment of local variables is somewhat broken.
This is a very nice bug. If not already present then please add it to Bugzilla. If you don't want to add it, then I will add it myself. I have modified your code like this: import core.stdc.stdio: printf; void main() { int a; double d; printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } And this shows the performance difference: import core.stdc.stdio: printf; import std.date: getUTCtime, ticksPerSecond; void main() { double d = 0.0; auto t0 = getUTCtime(); for (size_t i = 0; i < 100_000_000; i++) d += 1; auto t1 = getUTCtime(); printf("%lf\n", d); printf("%u\n", (cast(size_t)&d) % 8); printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond); } Bye, bearophile
Dec 17 2010
parent reply "Nick Voronin" <elfy.nv gmail.com> writes:
I found this on bugzilla.
http://d.puremagic.com/issues/show_bug.cgi?id=2278

So it's not really a bug. Yet there is no simple workaround. And gap  
between int and double looks strange when stack frame itself is unaligned.

btw, is there no explicit alignment for variables in D at all?
align(8) double d; compiles if d is global, but it does nothing.

import core.stdc.stdio: printf;

align(8) int a;
align(8) double d;

void main() {
     printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
}

This _always_ prints "4 4" for me. It seems that globals are not affected  
by environment, but not aligned anyway.

Honestly, I'm lost :) It all works not as expected but then again I can't  
find any evidences that my expectations are valid.

On Sat, 18 Dec 2010 01:57:44 +0300, bearophile <bearophileHUGS lycos.com>  
wrote:

 Nick Voronin:

 Looks like alignment of local variables is somewhat broken.
This is a very nice bug. If not already present then please add it to Bugzilla. If you don't want to add it, then I will add it myself. I have modified your code like this: import core.stdc.stdio: printf; void main() { int a; double d; printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } And this shows the performance difference: import core.stdc.stdio: printf; import std.date: getUTCtime, ticksPerSecond; void main() { double d = 0.0; auto t0 = getUTCtime(); for (size_t i = 0; i < 100_000_000; i++) d += 1; auto t1 = getUTCtime(); printf("%lf\n", d); printf("%u\n", (cast(size_t)&d) % 8); printf("%lf\n", (cast(double)t1 - cast(double)t0) / ticksPerSecond); } Bye, bearophile
-- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 17 2010
next sibling parent bearophile <bearophileHUGS lycos.com> writes:
Nick Voronin:

 I found this on bugzilla.
 http://d.puremagic.com/issues/show_bug.cgi?id=2278
 
 So it's not really a bug.
It's borderline a performance bug. You may add that code to the issue 2278 as further examples. Bye, bearophile
Dec 17 2010
prev sibling parent reply Don <nospam nospam.com> writes:
Nick Voronin wrote:
 I found this on bugzilla.
 http://d.puremagic.com/issues/show_bug.cgi?id=2278
 
 So it's not really a bug. Yet there is no simple workaround. And gap 
 between int and double looks strange when stack frame itself is unaligned.
Interestingly, since I filed that bug, DMD for MacOSX was created, which ensures that the stack stays aligned in the way I described! So the DMD backend is perfectly capable of doing it now.
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.
That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007) However this code: import core.stdc.stdio: printf; int a; double[4] d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } shows that it stopped doing that somewhere between 2.027 and 2.030.
Dec 17 2010
next sibling parent "Nick Voronin" <elfy.nv gmail.com> writes:
On Sat, 18 Dec 2010 04:17:46 +0300, Don <nospam nospam.com> wrote:

 Interestingly, since I filed that bug, DMD for MacOSX was created, which  
   ensures that the stack stays aligned in the way I described! So the  
 DMD backend is perfectly capable of doing it now.
I added some examples to that bugreport. Just in case :)
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.
That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007) However this code: import core.stdc.stdio: printf; int a; double[4] d; void main() { printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8); } shows that it stopped doing that somewhere between 2.027 and 2.030.
Thanks. Filed it as http://d.puremagic.com/issues/show_bug.cgi?id=5355 -- Using Opera's revolutionary email client: http://www.opera.com/mail/
Dec 17 2010
prev sibling parent reply Nick Voronin <elfy.nv gmail.com> writes:
On Sat, 18 Dec 2010 02:17:46 +0100
Don <nospam nospam.com> wrote:

 Nick Voronin wrote:
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.
That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007)
On second thought large globals in static segment (as log says) are probably only those with __gshared prefix. And they do look aligned.
 However this code:
 
 import core.stdc.stdio: printf;
 
 int a;
 double[4] d;
 
 void main() {
      printf("%u %u\n", (cast(size_t)&a) % 8, (cast(size_t)&d) % 8);
 }
 shows that it stopped doing that somewhere between 2.027 and 2.030.
-- Nick Voronin <elfy.nv gmail.com>
Dec 18 2010
parent Don <nospam nospam.com> writes:
Nick Voronin wrote:
 On Sat, 18 Dec 2010 02:17:46 +0100
 Don <nospam nospam.com> wrote:
 
 Nick Voronin wrote:
 btw, is there no explicit alignment for variables in D at all?
 align(8) double d; compiles if d is global, but it does nothing.
That's a regression. Large globals are always aligned to a 16-byte boundary (see changelog for 2.007)
On second thought large globals in static segment (as log says) are probably only those with __gshared prefix. And they do look aligned.
Good catch! Yes, that makes perfect sense. So the bug is that align() is ignored for TLS variables.
Dec 20 2010