|
Archives
D Programming
D
D.gnu
digitalmars.D
digitalmars.D.bugs
digitalmars.D.dtl
digitalmars.D.dwt
digitalmars.D.announce
digitalmars.D.learn
digitalmars.D.debugger
C/C++ Programming
c++
c++.announce
c++.atl
c++.beta
c++.chat
c++.command-line
c++.dos
c++.dos.16-bits
c++.dos.32-bits
c++.idde
c++.mfc
c++.rtl
c++.stl
c++.stl.hp
c++.stl.port
c++.stl.sgi
c++.stlsoft
c++.windows
c++.windows.16-bits
c++.windows.32-bits
c++.wxwindows
digitalmars.empire
digitalmars.DMDScript
|
c++ - Bubble sort benchmark
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
I have done a simple benchmark using a bubble sort algorithm with different
compilers.
All the tests have been run on an AMD K7 at 1050 MHz using Windows XP Pro.
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CodeWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relatively badly in terms of speed? Have I set up the
compiler switches correctly?
Great executable size achieved!
-
Int.c ----------------------------------------------------------------------
------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
/*
 * Program entry point: runs the bubble-sort benchmark once.
 * Declared with the portable hosted signature `int main(void)` and
 * returns 0 to report success to the environment.
 */
int main (void)
{
    TestSort();
    return 0;
}
/*
 * Times SORT_ITER rounds of InitSort() followed by DoSort() on a
 * heap-allocated table of SORT_SIZE ints and prints the processor time
 * reported by clock(), converted to milliseconds.
 */
void TestSort (void)
{
    int i;
    int *aiTab;
    clock_t clkStart, clkStop;

    printf("Testing Int -> Bubble sort ");

    /* The cast is kept so the file also builds when compiled as C++. */
    aiTab = (int *) malloc(SORT_SIZE * sizeof *aiTab);
    if (aiTab == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return;
    }

    clkStart = clock();
    /* Run exactly SORT_ITER rounds (i = SORT_ITER .. 1). */
    for (i = SORT_ITER; i > 0; i--)
    {
        InitSort(aiTab);
        DoSort(aiTab);
    }
    clkStop = clock();

    /*
     * clock_t is not guaranteed to be int, so convert explicitly and
     * print with %ld.  CLOCKS_PER_SEC is the standard <time.h> name;
     * the scaling is done in double so the *1000 cannot overflow.
     */
    printf("%ld ms.\n",
           (long) ((double) (clkStop - clkStart) * 1000.0 / CLOCKS_PER_SEC));

    free(aiTab);
}
/*
 * Fills paiTab[0 .. SORT_SIZE-1] with the strictly descending sequence
 * SORT_SIZE, SORT_SIZE-1, ..., 1, so every adjacent pair is out of
 * order when DoSort() starts.
 *
 * paiTab must point to at least SORT_SIZE ints; the highest index
 * written is SORT_SIZE-1.
 */
void InitSort (int paiTab[])
{
    int iCont;

    for (iCont = SORT_SIZE - 1; iCont >= 0; iCont--)
    {
        paiTab[iCont] = SORT_SIZE - iCont;
    }
}
/*
 * Bubble-sorts paiTab[0 .. SORT_SIZE-1] into ascending order, in place.
 * Full passes over the table are repeated until one pass completes
 * without swapping anything.
 *
 * paiTab must point to at least SORT_SIZE ints.
 */
void DoSort (int paiTab[])
{
    int Swap;
    int Temp, I;

    do
    {
        Swap = 0;
        /*
         * Each step compares the pair (I, I+1), so I stops at
         * SORT_SIZE-2 to keep I+1 within the last valid index.
         */
        for (I = 0; I < SORT_SIZE - 1; I++)
        {
            if (paiTab[I] > paiTab[I+1])
            {
                Temp = paiTab[I];
                paiTab[I] = paiTab[I+1];
                paiTab[I+1] = Temp;
                Swap = 1;
            }
        }
    }
    while (Swap);
}
↑ ↓ ← → Jan Knepper <jan smartsoft.cc> writes:
I think you have a trade-off here.
You run full optimization, and that takes time with DMC++...
Now do the same test and remove the optimizer switches from the compilers...
Jan
"Javier Gutiérrez" wrote:
I have done a simple benchmark using a bubble sort algorithm with diferent
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP Pro.
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up the
compiler switches correctly?
Great executable size achieved!
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
Hi Jan,
The times shown are execution times, not compilation times.
"Jan Knepper" <jan smartsoft.cc> escribió en el mensaje
news:3CA704A5.2AC65F61 smartsoft.cc...
I think you have a trade-off here.
You run full optimization, and that takes time with DMC++...
Now do the same test and remove the optimizer switches from the
Jan
"Javier Gutiérrez" wrote:
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
↑ ↓ ← → Jan Knepper <jan smartsoft.cc> writes:
Sorry, Which version of the compiler are you using?
"Javier Gutiérrez" wrote:
Hi Jan,
The times shown are execution times, not compilation times.
"Jan Knepper" <jan smartsoft.cc> escribió en el mensaje
news:3CA704A5.2AC65F61 smartsoft.cc...
I think you have a trade-off here.
You run full optimization, and that takes time with DMC++...
Now do the same test and remove the optimizer switches from the
Jan
"Javier Gutiérrez" wrote:
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
I am using 8.27.5.
"Jan Knepper" <jan smartsoft.cc> escribió en el mensaje
news:3CA706FD.A89EDE17 smartsoft.cc...
Sorry, Which version of the compiler are you using?
"Javier Gutiérrez" wrote:
Hi Jan,
The times shown are execution times, not compilation times.
"Jan Knepper" <jan smartsoft.cc> escribió en el mensaje
news:3CA704A5.2AC65F61 smartsoft.cc...
I think you have a trade-off here.
You run full optimization, and that takes time with DMC++...
Now do the same test and remove the optimizer switches from the
Jan
"Javier Gutiérrez" wrote:
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows
Pro.
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt
Why is DMC performing relativelly bad in speed terms? Have I set
the
compiler switches correctly?
Great executable size achieved!
↑ ↓ ← → "Walter" <walter digitalmars.com> writes:
I suppose it depends on what code is generated by the other compilers.
Here's what DMC generates for the critical loop:
_DoSort:
push EBX
mov EDX,8[ESP]
push ESI
push EDI
L97: xor EDI,EDI
xor EBX,EBX
L9B: mov ECX,[EBX*4][EDX]
mov EAX,4[EBX*4][EDX]
cmp ECX,EAX
jle LB4
mov [EBX*4][EDX],EAX
mov ESI,ECX
mov EDI,1
mov 4[EBX*4][EDX],ESI
LB4: inc EBX
cmp EBX,02710h
jb L9B
test EDI,EDI
jne L97
pop EDI
pop ESI
pop EBX
ret
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a86p82$1m3o$1 digitaldaemon.com...
I have done a simple benchmark using a bubble sort algorithm with diferent
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
-
------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
void main (void)
{
TestSort();
}
void TestSort (void)
{
int i;
int *aiTab;
clock_t clkStart, clkStop;
printf("Testing Int -> Bubble sort ");
aiTab=(int *) malloc(SORT_SIZE*sizeof(int));
clkStart=clock();
for (i=SORT_ITER; i>=0; i--)
{
InitSort(aiTab);
DoSort(aiTab);
}
clkStop=clock();
printf("%d ms.\n", (((clkStop-clkStart)*1000)/CLK_TCK));
free(aiTab);
}
void InitSort (int paiTab[])
{
int iCont;
for (iCont=SORT_SIZE; iCont>=0; iCont--)
paiTab[iCont]=SORT_SIZE-iCont;
}
void DoSort (int paiTab[])
{
int Swap;
int Temp,I;
do
{
Swap = 0;
for (I = 0; I<SORT_SIZE; I++)
if (paiTab[I] > paiTab[I+1])
{
Temp = paiTab[I];
paiTab[I] = paiTab[I+1];
paiTab[I+1] = Temp;
Swap = 1;
}
}
while (Swap);
}
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
Attached are all the executables compressed using zip.
"Walter" <walter digitalmars.com> escribió en el mensaje
news:a87f9l$20q3$1 digitaldaemon.com...
I suppose it depends on what code is generated by the other compilers.
Here's what DMC generates for the critical loop:
_DoSort:
push EBX
mov EDX,8[ESP]
push ESI
push EDI
L97: xor EDI,EDI
xor EBX,EBX
L9B: mov ECX,[EBX*4][EDX]
mov EAX,4[EBX*4][EDX]
cmp ECX,EAX
jle LB4
mov [EBX*4][EDX],EAX
mov ESI,ECX
mov EDI,1
mov 4[EBX*4][EDX],ESI
LB4: inc EBX
cmp EBX,02710h
jb L9B
test EDI,EDI
jne L97
pop EDI
pop ESI
pop EBX
ret
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a86p82$1m3o$1 digitaldaemon.com...
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
-
------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
void main (void)
{
TestSort();
}
void TestSort (void)
{
int i;
int *aiTab;
clock_t clkStart, clkStop;
printf("Testing Int -> Bubble sort ");
aiTab=(int *) malloc(SORT_SIZE*sizeof(int));
clkStart=clock();
for (i=SORT_ITER; i>=0; i--)
{
InitSort(aiTab);
DoSort(aiTab);
}
clkStop=clock();
printf("%d ms.\n", (((clkStop-clkStart)*1000)/CLK_TCK));
free(aiTab);
}
void InitSort (int paiTab[])
{
int iCont;
for (iCont=SORT_SIZE; iCont>=0; iCont--)
paiTab[iCont]=SORT_SIZE-iCont;
}
void DoSort (int paiTab[])
{
int Swap;
int Temp,I;
do
{
Swap = 0;
for (I = 0; I<SORT_SIZE; I++)
if (paiTab[I] > paiTab[I+1])
{
Temp = paiTab[I];
paiTab[I] = paiTab[I+1];
paiTab[I+1] = Temp;
Swap = 1;
}
}
while (Swap);
}
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
Below is the code generated by C++ Builder 6 and by VC++ .NET.
The only thing I see is the offset calculation: Borland adds 4 to the
offset, while DMC adds 1 and multiplies it in the MOV. As far as I know, it should
result in the same speed...
But in fact the Borland code is faster: 6168 ms against 7390 ms for DMC.
As for VC++ .NET, it seems the loop has been unrolled... Maybe this is the
great advantage behind its 3374 ms...
Why has DMC not unrolled it?
C++ Builder 6
--------------------------------------------------------------------------
_DoSort proc near
14:
push ebp
mov ebp,esp
push ebx
push esi
15:
xor esi,esi
xor edx,edx
mov eax,dword ptr [ebp+8]
16:
mov ebx,dword ptr [eax+4]
mov ecx,dword ptr [eax]
cmp ebx,ecx
jge short 18
mov dword ptr [eax],ebx
mov dword ptr [eax+4],ecx
mov esi,1
18:
inc edx
add eax,4
cmp edx,10000
jl short 16
test esi,esi
jne short 15
21:
pop esi
pop ebx
pop ebp
ret
_DoSort endp
VC++ .NET
----------------------------------------------------------------------------
------------
DoSort 4 PROC NEAR ; COMDAT
push ebx
push esi
push edi
lea ebx, DWORD PTR [ecx+8]
$L1304:
xor ecx, ecx
mov eax, ebx
mov edi, 1000 ; 000003e8H
$L1307:
mov edx, DWORD PTR [eax-4]
mov esi, DWORD PTR [eax-8]
cmp esi, edx
jle SHORT $L1308
mov DWORD PTR [eax-8], edx
mov DWORD PTR [eax-4], esi
mov ecx, 1
$L1308:
mov edx, DWORD PTR [eax]
mov esi, DWORD PTR [eax-4]
cmp esi, edx
jle SHORT $L1332
mov DWORD PTR [eax-4], edx
mov DWORD PTR [eax], esi
mov ecx, 1
$L1332:
mov edx, DWORD PTR [eax+4]
mov esi, DWORD PTR [eax]
cmp esi, edx
jle SHORT $L1333
mov DWORD PTR [eax], edx
mov DWORD PTR [eax+4], esi
mov ecx, 1
$L1333:
mov edx, DWORD PTR [eax+8]
mov esi, DWORD PTR [eax+4]
cmp esi, edx
jle SHORT $L1334
mov DWORD PTR [eax+4], edx
mov DWORD PTR [eax+8], esi
mov ecx, 1
$L1334:
mov edx, DWORD PTR [eax+12]
mov esi, DWORD PTR [eax+8]
cmp esi, edx
jle SHORT $L1335
mov DWORD PTR [eax+8], edx
mov DWORD PTR [eax+12], esi
mov ecx, 1
$L1335:
mov edx, DWORD PTR [eax+16]
mov esi, DWORD PTR [eax+12]
cmp esi, edx
jle SHORT $L1336
mov DWORD PTR [eax+12], edx
mov DWORD PTR [eax+16], esi
mov ecx, 1
$L1336:
mov edx, DWORD PTR [eax+20]
mov esi, DWORD PTR [eax+16]
cmp esi, edx
jle SHORT $L1337
mov DWORD PTR [eax+16], edx
mov DWORD PTR [eax+20], esi
mov ecx, 1
$L1337:
mov edx, DWORD PTR [eax+24]
mov esi, DWORD PTR [eax+20]
cmp esi, edx
jle SHORT $L1338
mov DWORD PTR [eax+20], edx
mov DWORD PTR [eax+24], esi
mov ecx, 1
$L1338:
mov edx, DWORD PTR [eax+28]
mov esi, DWORD PTR [eax+24]
cmp esi, edx
jle SHORT $L1339
mov DWORD PTR [eax+24], edx
mov DWORD PTR [eax+28], esi
mov ecx, 1
$L1339:
mov edx, DWORD PTR [eax+32]
mov esi, DWORD PTR [eax+28]
cmp esi, edx
jle SHORT $L1340
mov DWORD PTR [eax+28], edx
mov DWORD PTR [eax+32], esi
mov ecx, 1
$L1340:
add eax, 40 ; 00000028H
dec edi
jne $L1307
test ecx, ecx
jne $L1304
pop edi
pop esi
pop ebx
ret 0
DoSort 4 ENDP
"Walter" <walter digitalmars.com> escribió en el mensaje
news:a87f9l$20q3$1 digitaldaemon.com...
I suppose it depends on what code is generated by the other compilers.
Here's what DMC generates for the critical loop:
_DoSort:
push EBX
mov EDX,8[ESP]
push ESI
push EDI
L97: xor EDI,EDI
xor EBX,EBX
L9B: mov ECX,[EBX*4][EDX]
mov EAX,4[EBX*4][EDX]
cmp ECX,EAX
jle LB4
mov [EBX*4][EDX],EAX
mov ESI,ECX
mov EDI,1
mov 4[EBX*4][EDX],ESI
LB4: inc EBX
cmp EBX,02710h
jb L9B
test EDI,EDI
jne L97
pop EDI
pop ESI
pop EBX
ret
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a86p82$1m3o$1 digitaldaemon.com...
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
-
------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
void main (void)
{
TestSort();
}
void TestSort (void)
{
int i;
int *aiTab;
clock_t clkStart, clkStop;
printf("Testing Int -> Bubble sort ");
aiTab=(int *) malloc(SORT_SIZE*sizeof(int));
clkStart=clock();
for (i=SORT_ITER; i>=0; i--)
{
InitSort(aiTab);
DoSort(aiTab);
}
clkStop=clock();
printf("%d ms.\n", (((clkStop-clkStart)*1000)/CLK_TCK));
free(aiTab);
}
void InitSort (int paiTab[])
{
int iCont;
for (iCont=SORT_SIZE; iCont>=0; iCont--)
paiTab[iCont]=SORT_SIZE-iCont;
}
void DoSort (int paiTab[])
{
int Swap;
int Temp,I;
do
{
Swap = 0;
for (I = 0; I<SORT_SIZE; I++)
if (paiTab[I] > paiTab[I+1])
{
Temp = paiTab[I];
paiTab[I] = paiTab[I+1];
paiTab[I+1] = Temp;
Swap = 1;
}
}
while (Swap);
}
↑ ↓ ← → "John Culver" <jculver btinternet.spamless.com> writes:
Hi,
From my testing I doubt the speed difference is in the code generated in
DoSort() :
When I compiled with the following 2 fixes:
in DoSort() the loop should be:
for (I = 0; I<(SORT_SIZE-1); I++) // I=0..I<SORT_SIZE produces incorrect results and reads and writes unallocated memory, which could introduce unknown delays
and in TestSort() the loop should be:
for (i=SORT_ITER; i>0; i--) // strictly should be i>0 to generate SORT_ITER loops rather than SORT_ITER+1
I get the following results for my system: Athlon 1.33 / Win98 SE (yuk!) / DMC 8.25
With the code as is (fixed) I get approx. 5500 ms.
If I manually unroll the i loop in TestSort() the appropriate 10 times, I
consistently get execution times of only 4000 ms!!!
Note that DoSort() and InitSort() have 100% identical code in this case
(according to obj2asm).
So the unrolled code is noticeably more efficient, so it looks like the
identical and apparently efficient code in DoSort() is being stuffed up by
something else. Perhaps my Athlon's instruction translation techniques are
doing something very different due to the context (remember it's NOT really
an x86 processor - it is really a risc86 faking it); perhaps its caches are
messed up by the CS or SP alignment?? It certainly doesn't look like the low
speed is the compiler generating poor code, as the same code generates 2 very
different speeds in only subtly different contexts.
Unfortunately, these days if you do a one-task computational benchmark you are
more likely to discover some subtle feature of your processor, not of your
compiler.
JohnC
PS To clarify things : I love AMD products (well processors and chipsets)
======= int.c (revised, and with brutal UNROLL option - see #define UNROLL ...
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
// #define UNROLL 1
/*
 * Program entry point: runs the bubble-sort benchmark once.
 * Declared with the portable hosted signature `int main(void)` and
 * returns 0 to report success to the environment.
 */
int main (void)
{
    TestSort();
    return 0;
}
/*
 * Times repeated rounds of InitSort() followed by DoSort() on a
 * heap-allocated table of SORT_SIZE ints and prints the processor time
 * reported by clock(), converted to milliseconds.
 *
 * Without UNROLL the rounds are driven by a loop of SORT_ITER
 * iterations.  With UNROLL defined the loop header is compiled out and
 * ten rounds are written out by hand (one pair plus nine more), which
 * matches SORT_ITER only while SORT_ITER is 10.
 */
void TestSort (void)
{
#ifndef UNROLL
    int i;      /* loop counter; only needed by the looped variant */
#endif
    int *aiTab;
    clock_t clkStart, clkStop;

    printf("Testing Int -> Bubble sort ");

    /* The cast is kept so the file also builds when compiled as C++. */
    aiTab = (int *) malloc(SORT_SIZE * sizeof *aiTab);
    if (aiTab == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return;
    }

    clkStart = clock();
#ifndef UNROLL
    for (i = SORT_ITER; i > 0; i--) /* exactly SORT_ITER rounds */
#endif
    {
        InitSort(aiTab);
        DoSort(aiTab);
#ifdef UNROLL
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
        InitSort(aiTab);
        DoSort(aiTab);
#endif
    }
    clkStop = clock();

    /*
     * clock_t is not guaranteed to be int, so convert explicitly and
     * print with %ld.  CLOCKS_PER_SEC is the standard <time.h> name;
     * the scaling is done in double so the *1000 cannot overflow.
     */
    printf("%ld ms.\n",
           (long) ((double) (clkStop - clkStart) * 1000.0 / CLOCKS_PER_SEC));

    free(aiTab);
}
/*
 * Fills paiTab[0 .. SORT_SIZE-1] with the strictly descending sequence
 * SORT_SIZE, SORT_SIZE-1, ..., 1, so every adjacent pair is out of
 * order when DoSort() starts.
 *
 * paiTab must point to at least SORT_SIZE ints; the highest index
 * written is SORT_SIZE-1.
 */
void InitSort (int paiTab[])
{
    int iCont;

    for (iCont = SORT_SIZE - 1; iCont >= 0; iCont--)
    {
        paiTab[iCont] = SORT_SIZE - iCont;
    }
}
/*
 * Bubble-sorts paiTab[0 .. SORT_SIZE-1] into ascending order, in place.
 * Full passes over the table are repeated until one pass completes
 * without swapping anything.
 *
 * paiTab must point to at least SORT_SIZE ints.
 */
void DoSort (int paiTab[])
{
    int Swap;
    int Temp, I;

    do
    {
        Swap = 0;
        /*
         * Each step compares the pair (I, I+1), so the bound must be
         * SORT_SIZE-1: looping up to I < SORT_SIZE produces incorrect
         * results and reads and writes unallocated memory.
         */
        for (I = 0; I < (SORT_SIZE - 1); I++)
        {
            if (paiTab[I] > paiTab[I+1])
            {
                Temp = paiTab[I];
                paiTab[I] = paiTab[I+1];
                paiTab[I+1] = Temp;
                Swap = 1;
            }
        }
    }
    while (Swap);
}
=======
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a882ek$2atl$1 digitaldaemon.com...
Above is the one generated by C++ Builder 6, and VC++ .NET.
The only think I see is the offset calculation, Borland adds 4 to the
offset, while DMC adds 1, and mul it in the Mov. As far as I know, it should
result in the same speed...
But in fact Borland code is faster, 6168 ms against 7390 ms for DMC.
As VC++ .NET, it seems the loop has been unrolled... Maybe this is the
great advantage from 3374 ms...
Why DMC have not unrolled it?
C++ Builder 6
--------------------------------------------------------------------------
↑ ↓ ← → "Walter" <walter digitalmars.com> writes:
Hmm. The cpu specs advertise that complex addressing modes don't add extra
time. Perhaps this is not true. -Walter
"John Culver" <jculver btinternet.spamless.com> wrote in message
news:a8844q$2c0c$1 digitaldaemon.com...
Hi,
From my testing I doubt the speed difference is in the code generated
When I compiled with the following 2 fixes:
in DoSort() loop should be :
for (I = 0; I<(SORT_SIZE-1); I++) // I=0..I<SORT_SIZE produces
results and reads and writes unallocated memory which could introduce
and in TestSort() loop should be :
for (i=SORT_ITER; i>0; i--) // strictly should be i>0 to generate
loops rather than SORT_ITER+1
I get the following results for my system : Athlon 1.33 / Win98 SE (yuk!)
With the code as is (fixed) I get approx. 5500ms ()
If I manually unroll the i loop in TestSort() the appropriate 10 time I
get execution times of only 4000ms !!!
Note that DoSort() and InitSort() have 100% identical code in this case
obj2asm).
So the unrolled code is noticably more efficient, so it looks like the
apparently efficent code in DoSort() is being stuffed up by something
Athlons instruction translation techniques are doing something very
the context (remember it's NOT really an x86 processor - it is really a
it), perhaps it's caches are messed up by the CS or SP alignment ?? It
doesn't look like the low speed is the compiler generating poor code, as
generates 2 very different speeds in only subtly different contexts.
Unfortunately, these days if you do a one task computational benchmark you
likely to discover some subtle feature of your processor, not of your
JohnC
PS To clarify things : I love AMD products (well processors and chipsets)
======= int.c (revised, and with brutal UNROLL option - see #define UNROLL
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
// #define UNROLL 1
void main (void)
{
TestSort();
}
void TestSort (void)
{
int i;
int *aiTab;
clock_t clkStart, clkStop;
printf("Testing Int -> Bubble sort ");
aiTab=(int *) malloc(SORT_SIZE*sizeof(int));
clkStart=clock();
#ifndef UNROLL
for (i=SORT_ITER; i>0; i--) // strictly should be i>0
#endif
{
InitSort(aiTab);
DoSort(aiTab);
#ifdef UNROLL
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
InitSort(aiTab);
DoSort(aiTab);
#endif
}
clkStop=clock();
printf("%d ms.\n", (((clkStop-clkStart)*1000)/CLK_TCK));
free(aiTab);
}
void InitSort (int paiTab[])
{
int iCont;
for (iCont=SORT_SIZE; iCont>=0; iCont--)
paiTab[iCont]=SORT_SIZE-iCont;
}
void DoSort (int paiTab[])
{
int Swap;
int Temp,I;
do
{
Swap = 0;
for (I = 0; I<(SORT_SIZE-1); I++) // I=0..I<SORT_SIZE produces
and reads and writes unallocated memory
if (paiTab[I] > paiTab[I+1])
{
Temp = paiTab[I];
paiTab[I] = paiTab[I+1];
paiTab[I+1] = Temp;
Swap = 1;
}
}
while (Swap);
}
=======
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a882ek$2atl$1 digitaldaemon.com...
Above is the one generated by C++ Builder 6, and VC++ .NET.
The only think I see is the offset calculation, Borland adds 4 to
offset, while DMC adds 1, and mul it in the Mov. As far as I know, it
result in the same speed...
But in fact Borland code is faster, 6168 ms against 7390 ms for DMC.
As VC++ .NET, it seems the loop has been unrolled... Maybe this is
great advantage from 3374 ms...
Why DMC have not unrolled it?
C++ Builder 6
--------------------------------------------------------------------------
↑ ↓ ← → "Javier Gutiérrez" <nikkho nospam.hotmail.com> writes:
Watcom code:
It seems very similar to the Borland one...
DoSort_:
push ebx
push ecx
push edx
push esi
push ebp
mov esi,eax
mov ebp,00000001H
lea ebx,9c40H[esi]
L$3:
mov eax,esi
xor ecx,ecx
L$4:
mov edx,dword ptr [eax]
cmp edx,dword ptr 4H[eax]
jg L$6
L$5:
add eax,00000004H
cmp eax,ebx
jne L$4
test ecx,ecx
jne L$3
pop ebp
pop esi
pop edx
pop ecx
pop ebx
ret
"Walter" <walter digitalmars.com> escribió en el mensaje
news:a87f9l$20q3$1 digitaldaemon.com...
I suppose it depends on what code is generated by the other compilers.
Here's what DMC generates for the critical loop:
_DoSort:
push EBX
mov EDX,8[ESP]
push ESI
push EDI
L97: xor EDI,EDI
xor EBX,EBX
L9B: mov ECX,[EBX*4][EDX]
mov EAX,4[EBX*4][EDX]
cmp ECX,EAX
jle LB4
mov [EBX*4][EDX],EAX
mov ESI,ECX
mov EDI,1
mov 4[EBX*4][EDX],ESI
LB4: inc EBX
cmp EBX,02710h
jb L9B
test EDI,EDI
jne L97
pop EDI
pop ESI
pop EBX
ret
"Javier Gutiérrez" <nikkho nospam.hotmail.com> wrote in message
news:a86p82$1m3o$1 digitaldaemon.com...
I have done a simple benchmark using a bubble sort algorithm with
compilers.
All the tests have been run in and AMD K7 at 1050 Mhz using Windows XP
Here are my results:
COMPILER TIME SIZE COMMAND LINE
C++ Builder 6 6168 57.344 BCC32 -6 -O2 -O -a8 -d -r -k- -s -lOS Int.c
CoderWarrior 7.2 Pro 4647 36.864 Within the IDE
DigitalMars 8.27.5 7390 26.140 SC -6 -a8 -f -ff -mn -Nc -o Int.c
Visual C++ .NET 3374 36.864 CL /Ox /Og /Ob2 /Oi /Ot /Oy /GT /G6 /GA /D
"WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /GF /FD /ML /Zp16 /Gy /W4
/nologo /c /Zi /TP Int.c; LINK /OUT:"Int_VC.exe" /INCREMENTAL:NO /NOLOGO
/SUBSYSTEM:CONSOLE /OPT:REF /OPT:ICF /OPT:WIN98 /MACHINE:IX86 Int.obj
Watcom C++ 11.0c beta 6209 39.424
WCL386 -oneatx -oh -oi -ei -em -zp16 -6 -fp6 -zw -d0 -bt=nt -l=nt Int.c
Why is DMC performing relativelly bad in speed terms? Have I set up
compiler switches correctly?
Great executable size achieved!
-
------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SORT_ITER 10
#define SORT_SIZE 10000
void TestSort (void);
void InitSort (int gaiTab[]);
void DoSort(int gaiTab[]);
void main (void)
{
TestSort();
}
void TestSort (void)
{
int i;
int *aiTab;
clock_t clkStart, clkStop;
printf("Testing Int -> Bubble sort ");
aiTab=(int *) malloc(SORT_SIZE*sizeof(int));
clkStart=clock();
for (i=SORT_ITER; i>=0; i--)
{
InitSort(aiTab);
DoSort(aiTab);
}
clkStop=clock();
printf("%d ms.\n", (((clkStop-clkStart)*1000)/CLK_TCK));
free(aiTab);
}
void InitSort (int paiTab[])
{
int iCont;
for (iCont=SORT_SIZE; iCont>=0; iCont--)
paiTab[iCont]=SORT_SIZE-iCont;
}
void DoSort (int paiTab[])
{
int Swap;
int Temp,I;
do
{
Swap = 0;
for (I = 0; I<SORT_SIZE; I++)
if (paiTab[I] > paiTab[I+1])
{
Temp = paiTab[I];
paiTab[I] = paiTab[I+1];
paiTab[I+1] = Temp;
Swap = 1;
}
}
while (Swap);
}
|
|