digitalmars.D - Array reverse
- bearophile (116/116) Nov 23 2012 I've seen a difference in the performance of
- monarch_dodra (21/41) Nov 23 2012 I'm not surprised: Algorithm's reverse is written for the generic
- bearophile (62/72) Nov 27 2012 This is the full program:
- Walter Bright (3/4) Nov 23 2012 You've got array bounds checking turned on for algorithm.reverse, and yo...
- bearophile (75/78) Nov 27 2012 Right, sorry. This is compiled with the latest DMD 2.061alpha
- David Nadlinger (64/66) Nov 28 2012 fwiw, here is what LDC produces:
I've seen a difference in the performance of
std.algorithm.reverse applied on an array compared to home-made
code, so I have written a little test.
Below you see the code, and the asm of the functions, compiled
with DMD 2.060, 32 bit (-O -release -inline).
Feel free to recompile the code yourself to see if I have done
some mistake :-)
------------------------------
import core.stdc.stdio: printf;
import std.algorithm: reverse;
void reverseArr(T)(T[] arr) {
for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) {
auto aux = *x;
*x++ = *y;
*y-- = aux;
}
}
void main() {
auto a = [10, 20, 30, 40];
foreach(x; a) printf("%d ", x); printf("\n");
a.reverseArr();
foreach(x; a) printf("%d ", x); printf("\n");
a.reverse();
foreach(x; a) printf("%d ", x); printf("\n");
}
------------------------------
reverseArr:
push EBX
mov ECX,0Ch[ESP]
mov EBX,0Ch[ESP]
push ESI
mov ESI,0Ch[ESP]
lea ESI,-4[ESI*4][ECX]
cmp ECX,ESI
jae L2B
L19: mov EAX,[EBX]
mov EDX,[ESI]
mov [EBX],EDX
add EBX,4
mov [ESI],EAX
add ESI,0FFFFFFFCh
cmp EBX,ESI
jb L19
L2B: pop ESI
pop EBX
ret 8
-------------------
std.algorithm.reverse:
L0: sub ESP,020h
push EBX
push ESI
push EDI
cmp dword ptr 030h[ESP],0
je LE1
L11: cmp dword ptr 030h[ESP],0
mov EAX,030h[ESP]
mov EDX,034h[ESP]
mov 018h[ESP],EAX
mov 01Ch[ESP],EDX
jne L32
mov EAX,022Dh
call near ptr _D3std9algorithm7__arrayZ
L32: mov EAX,030h[ESP]
mov EDX,034h[ESP]
mov 024h[ESP],EDX
mov ECX,030h[ESP]
lea EDX,-1[ECX]
mov 020h[ESP],EAX
cmp EDX,ECX
mov 010h[ESP],EDX
jb L5E
mov EAX,0258h
call near ptr _D3std9algorithm7__arrayZ
L5E: mov ECX,01Ch[ESP]
mov EBX,030h[ESP]
mov EDX,024h[ESP]
lea EBX,-4[EBX*4][EDX]
mov ESI,[ECX]
mov EDX,[EBX]
mov [ECX],EDX
mov ECX,030h[ESP]
cmp ECX,ECX
mov [EBX],ESI
ja L86
cmp ECX,1
jae L90
L86: mov EAX,0173h
call near ptr _D3std9algorithm7__arrayZ
L90: mov EDX,034h[ESP]
mov EDI,010h[ESP]
mov 030h[ESP],EDI
add EDX,4
mov 034h[ESP],EDX
cmp dword ptr 030h[ESP],0
je LE1
mov EBX,030h[ESP]
lea ECX,-1[EBX]
cmp ECX,EBX
mov 014h[ESP],ECX
jbe LC6
mov EAX,01E8h
call near ptr _D3std9algorithm7__arrayZ
LC6: mov ESI,014h[ESP]
mov EDX,034h[ESP]
mov 030h[ESP],ESI
mov 034h[ESP],EDX
cmp dword ptr 030h[ESP],0
jne L11
LE1: pop EDI
pop ESI
pop EBX
add ESP,020h
ret 8
------------------------------
Bye,
bearophile
Nov 23 2012
On Friday, 23 November 2012 at 09:14:20 UTC, bearophile wrote:
I've seen a difference in the performance of
std.algorithm.reverse applied on an array compared to home-made
code, so I have written a little test.
Below you see the code, and the asm of the functions, compiled
with DMD 2.060, 32 bit (-O -release -inline).
Feel free to recompile the code yourself to see if I have done
some mistake :-)
------------------------------
import core.stdc.stdio: printf;
import std.algorithm: reverse;
void reverseArr(T)(T[] arr) {
for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) {
auto aux = *x;
*x++ = *y;
*y-- = aux;
}
}
[SNIP]
Bye,
bearophile
I'm not surprised: Algorithm's reverse is written for the generic
bidir range. It could be made faster using an approach like
yours', and even faster using a specialized pointer
implementation for pointers (no bounds checking when accessing
elements).
The *1* thing I'd be curious about is what costs are
iteration-related, and what costs are "swap" related.
In theory, swap's implementation should be reduced to what you
have for integers.
Could you maybe also add this in your study?
void reverseArr(T)(T[] arr) {
for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; )
swap(*x++, *y--);
}
--------
Related, the starting condition, as written: "y = &arr[$ - 1]"
will choke on empty ranges.
I'd have suggested: "y = arr.ptr + arr.length - 1", but that will
also choke if the arr.ptr is null. I'd say you should just add an
empty short-circuit test int there.
Nov 23 2012
monarch_dodra:Could you maybe also add this in your study? void reverseArr(T)(T[] arr) { for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) swap(*x++, *y--); }This is the full program: import core.stdc.stdio: printf; import std.algorithm: reverse, swap; void reverseArr(T)(T[] arr) { for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) { auto aux = *x; *x++ = *y; *y-- = aux; } } void reverseArray(T)(T[] arr) { for (auto x = arr.ptr, y = &arr[$ - 1]; x < y; ) swap(*x++, *y--); } void main() { auto a = [10, 20, 30, 40]; foreach(x; a) printf("%d ", x); printf("\n"); a.reverseArr(); foreach(x; a) printf("%d ", x); printf("\n"); a.reverse(); foreach(x; a) printf("%d ", x); printf("\n"); a.reverseArray(); foreach(x; a) printf("%d ", x); printf("\n"); } And this is the asm of the reverseArray: reverseArray: push EBX mov ECX, 0Ch[ESP] mov EBX, 0Ch[ESP] push ESI mov ESI, 0Ch[ESP] lea ESI, -4[ESI*4][ECX] push EDI cmp ECX, ESI jae L30 L1A: mov EAX, EBX mov EDI, ESI mov EDX, [EAX] mov ECX, [EDI] add EBX, 4 sub ESI, 4 mov [EAX], ECX cmp EBX, ESI mov [EDI], EDX jb L1A L30: pop EDI pop ESI pop EBX ret 8 So the inner loop is 10 asm instructions instead of the 8 asm instructions of the loop of reverseArr(), that is 2 more register swaps, that are performed very quickly by the hardware register renaming in the modern CPUs. So this is a small difference, probably acceptable in many situations. While the difference between the performance of reverseArray() and std.algorithm.reverse is significant.Related, the starting condition, as written: "y = &arr[$ - 1]" will choke on empty ranges. I'd have suggested: "y = arr.ptr + arr.length - 1", but that will also choke if the arr.ptr is null. I'd say you should just add an empty short-circuit test int there.Right. But my study was mostly about what's inside the loop of those functions. What's outside the loop is not influencing performance significantly. Bye, bearophile
Nov 27 2012
On 11/23/2012 1:14 AM, bearophile wrote:Feel free to recompile the code yourself to see if I have done some mistake :-)You've got array bounds checking turned on for algorithm.reverse, and you avoided that in your code by using unchecked pointers.
Nov 23 2012
Walter Bright: Sorry for the late reply.You've got array bounds checking turned on for algorithm.reverse, and you avoided that in your code by using unchecked pointers.Right, sorry. This is compiled with the latest DMD 2.061alpha with -O -release -inline -noboundscheck. I have used the same program source code as above. Now the calls inside std.algorithm.reverse are fully inlined, below there are no "call" istrunctions. But the number of instructions performed in the loop is rather different. dpaste is currently down and I don't have gdc2/dc2 installed here, so I can't show the asm from those other compilers. I have also done benchmarks, and I have seen performance difference in real code between std.algorithm.reverse and a reverseArr function. reverseArr: push EBX mov ECX, 0Ch[ESP] mov EBX, 0Ch[ESP] push ESI mov ESI, 0Ch[ESP] lea ESI, -4[ESI*4][ECX] cmp ECX, ESI jae L2B L19: mov EAX, [EBX] mov EDX, [ESI] mov [EBX], EDX add EBX, 4 mov [ESI], EAX add ESI, 0FFFFFFFCh cmp EBX, ESI jb L19 L2B: pop ESI pop EBX ret 8 std.algorithm.reverse: sub ESP, 014h push EBX push ESI cmp dword ptr 020h[ESP], 0 je L76 LC: mov EAX, 020h[ESP] mov EDX, 024h[ESP] mov EBX, 020h[ESP] mov 0Ch[ESP], EDX mov ECX, 0Ch[ESP] mov ESI, [ECX] mov 014h[ESP], EDX mov EDX, 014h[ESP] lea EBX, -4[EBX*4][EDX] mov 8[ESP], EAX mov EDX, [EBX] mov [ECX], EDX mov ECX, 020h[ESP] mov EDX, 024h[ESP] mov 010h[ESP], EAX lea EAX, -1[ECX] add EDX, 4 mov 020h[ESP], EAX mov [EBX], ESI mov 024h[ESP], EDX cmp dword ptr 020h[ESP], 0 je L76 mov EBX, 020h[ESP] lea ESI, -1[EBX] mov ECX, 024h[ESP] mov 020h[ESP], ESI mov 024h[ESP], ECX cmp dword ptr 020h[ESP], 0 jne LC L76: pop ESI pop EBX add ESP, 014h ret 8 Bye, bearophile
Nov 27 2012
On Wednesday, 28 November 2012 at 00:19:33 UTC, bearophile wrote:dpaste is currently down and I don't have gdc2/dc2 installed here, so I can't show the asm from those other compilers.fwiw, here is what LDC produces: --- _D4test18__T10reverseArrTiZ10reverseArrFAiZv: lea RAX, QWORD PTR [RSI + 4*RDI - 4] cmp RSI, RAX jae .LBB1_3 lea RAX, QWORD PTR [RSI + 4*RDI - 8] .align 16, 0x90 .LBB1_2: mov ECX, DWORD PTR [RSI] mov EDX, DWORD PTR [RAX + 4] mov DWORD PTR [RSI], EDX mov DWORD PTR [RAX + 4], ECX add RSI, 4 cmp RSI, RAX lea RAX, QWORD PTR [RAX - 4] jb .LBB1_2 .LBB1_3: ret --- --- _D3std9algorithm15__T7reverseTAiZ7reverseFAiZv: test RDI, RDI je .LBB2_4 lea RAX, QWORD PTR [RSI + 4*RDI - 4] .align 16, 0x90 .LBB2_2: mov ECX, DWORD PTR [RSI] mov EDX, DWORD PTR [RAX] mov DWORD PTR [RSI], EDX mov DWORD PTR [RAX], ECX cmp RDI, 1 je .LBB2_4 add RAX, -4 add RSI, 4 add RDI, -2 jne .LBB2_2 .LBB2_4: ret --- --- _D4test20__T12reverseArrayTiZ12reverseArrayFAiZv: lea RAX, QWORD PTR [RSI + 4*RDI - 4] cmp RSI, RAX jae .LBB3_3 add RSI, 4 .align 16, 0x90 .LBB3_2: mov ECX, DWORD PTR [RSI - 4] mov EDX, DWORD PTR [RAX] mov DWORD PTR [RSI - 4], EDX mov DWORD PTR [RAX], ECX add RAX, -4 cmp RSI, RAX lea RSI, QWORD PTR [RSI + 4] jb .LBB3_2 .LBB3_3: ret --- The extra jump in the std.algorithm version is still there – I haven't checked whether it would be feasible to optimize the induction variable away entirely. David
Nov 28 2012









"bearophile" <bearophileHUGS lycos.com> 