www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.bugs - [Issue 4438] New: A missed function inlining

reply d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=4438

           Summary: A missed function inlining
           Product: D
           Version: D1 & D2
          Platform: x86
        OS/Version: Windows
            Status: NEW
          Severity: enhancement
          Priority: P2
         Component: DMD
        AssignedTo: nobody puremagic.com
        ReportedBy: bearophile_hugs eml.cc



A test program that can be compiled with DMD2+Phobos2 and LDC1+Tango1:


version (Tango)
    import tango.stdc.stdio: printf;
else
    import std.c.stdio: printf;

double masked_dot(double[] a1, double[] a2, ubyte[] mask)
    in {
        assert(a1.length == a2.length);
        assert(a1.length == mask.length);
    } body {
        double sum = 0.0;
        foreach (i, m; mask)
            if (m)
                sum += a1[i] * a2[i];
        return sum;
    }

void main() {
    int N = 1000;

    auto m1 = new double[][](N, N);
    foreach (ref row; m1)
        row[] = 2.0;

    auto m2 = new double[][](N, N);
    foreach (ref row; m2)
        row[] = 0.5;

    auto mask = new ubyte[N];
    mask[] = 1;

    double sum = 0.0;
    for (int r; r < m1.length; r++)
        sum += masked_dot(m1[r], m2[r], mask);

    printf("%f\n", sum);
}


Compiled with:
dmd v2.047 and ldc based on DMD v1.057 and llvm 2.6

ldc -O3 -release -inline -output-s test
dmd -O -release -inline test.d


I'd like dmd to inline this masked_dot() function too.

-------------------------------

The cleaned up asm produced by dmd:

_D4test10masked_dotFAdAdAhZd    comdat
        sub    ESP,030h
        push    ESI
        xor    ESI,ESI
        mov    dword ptr 4[ESP],0
        mov    dword ptr 8[ESP],0
        cmp    038h[ESP],ESI
        je    L51
        mov    EDX,03Ch[ESP]
        mov    EAX,038h[ESP]
        mov    ECX,EDX
L26:        cmp    [ECX][ESI],0
        je    L4A
        mov    EDX,04Ch[ESP]
        mov    EAX,048h[ESP]
        mov    EAX,040h[ESP]
        fld    qword ptr [ESI*8][EDX]
        mov    EDX,044h[ESP]
        fmul    qword ptr [ESI*8][EDX]
        fadd    qword ptr 4[ESP]
        fstp    qword ptr 4[ESP]
L4A:        inc    ESI
        cmp    ESI,038h[ESP]
        jb    L26
L51:        fld    qword ptr 4[ESP]
        pop    ESI
        add    ESP,030h
        ret    018h

__Dmain    comdat
L0:        sub    ESP,028h
        mov    EAX,offset FLAT:_D12TypeInfo_AAd6__initZ
        push    EBX
        push    ESI
        push    EDI
        push    03E8h
        push    03E8h
        push    2
        push    EAX
        call    near ptr __d_newarraymiT
        xor    EBX,EBX
        mov    020h[ESP],EAX
        mov    024h[ESP],EDX
        add    ESP,010h
        cmp    010h[ESP],EBX
        je    L58
        mov    ESI,EDX
L32:        lea    EDI,[EBX*8][ESI]
        mov    EDX,4[EDI]
        mov    EAX,[EDI]
        push    dword ptr [EDI]
        push    dword ptr FLAT:_DATA[04h]
        push    dword ptr FLAT:_DATA[00h]
        push    EDX
        call    near ptr __memset64
        add    ESP,010h
        inc    EBX
        cmp    EBX,010h[ESP]
        jb    L32
L58:        push    03E8h
        mov    ECX,offset FLAT:_D12TypeInfo_AAd6__initZ
        push    03E8h
        push    2
        push    ECX
        call    near ptr __d_newarraymiT
        xor    EBX,EBX
        mov    028h[ESP],EAX
        mov    02Ch[ESP],EDX
        add    ESP,010h
        cmp    018h[ESP],EBX
        je    LAA
        mov    ESI,EDX
L84:        lea    EDI,[EBX*8][ESI]
        mov    EDX,4[EDI]
        mov    EAX,[EDI]
        push    dword ptr [EDI]
        push    dword ptr FLAT:_DATA[0Ch]
        push    dword ptr FLAT:_DATA[08h]
        push    EDX
        call    near ptr __memset64
        add    ESP,010h
        inc    EBX
        cmp    EBX,018h[ESP]
        jb    L84
LAA:        push    03E8h
        mov    EBX,offset FLAT:_D11TypeInfo_Ah6__initZ
        push    EBX
        call    near ptr __d_newarrayT
        mov    028h[ESP],EAX
        mov    ECX,028h[ESP]
        mov    EAX,01010101h
        mov    02Ch[ESP],EDX
        mov    EDX,02Ch[ESP]
        mov    EBX,028h[ESP]
        mov    EDI,EDX
        rep
        stosb
        mov    030h[ESP],ECX
        mov    034h[ESP],ECX
        mov    038h[ESP],ECX
        add    ESP,8
        cmp    010h[ESP],ECX
        je    L12E
        mov    EDX,014h[ESP]
        mov    EDI,EDX
        mov    EAX,010h[ESP]
        mov    EDX,01Ch[ESP]
        mov    EBX,030h[ESP]
        mov    EAX,018h[ESP]
        mov    ESI,EDX
L104:        push    dword ptr 4[EBX*8][EDI]
        push    [EBX*8][EDI]
        push    dword ptr 4[EBX*8][ESI]
        push    [EBX*8][ESI]
        push    dword ptr 034h[ESP]
        push    dword ptr 034h[ESP]
        call    near ptr _D4test10masked_dotFAdAdAhZd
        inc    EBX
        fadd    qword ptr 028h[ESP]
        cmp    EBX,010h[ESP]
        fstp    qword ptr 028h[ESP]
        jb    L104
L12E:        push    dword ptr 02Ch[ESP]
        mov    ECX,offset FLAT:_DATA[010h]
        push    dword ptr 02Ch[ESP]
        push    ECX
        call    near ptr _printf
        add    ESP,0Ch
        xor    EAX,EAX
        pop    EDI
        pop    ESI
        pop    EBX
        add    ESP,028h
        ret

-------------------------------

The cleaned up asm produced by ldc:

_D4test10masked_dotFAdAdAhZd:
    pushl   %edi
    pushl   %esi
    subl    $12, %esp
    movl    24(%esp), %eax
    testl   %eax, %eax
    je  .LBB1_6
    movl    28(%esp), %ecx
    movl    36(%esp), %edx
    movl    44(%esp), %esi
    pxor    %xmm0, %xmm0
    xorl    %edi, %edi
    .align  16
.LBB1_2:
    cmpb    $0, (%ecx,%edi)
    je  .LBB1_4
    movsd   (%esi,%edi,8), %xmm1
    mulsd   (%edx,%edi,8), %xmm1
    addsd   %xmm1, %xmm0
.LBB1_4:
    incl    %edi
    cmpl    %eax, %edi
    jne .LBB1_2
.LBB1_5:
    movsd   %xmm0, (%esp)
    fldl    (%esp)
    addl    $12, %esp
    popl    %esi
    popl    %edi
    ret $24
.LBB1_6:
    pxor    %xmm0, %xmm0
    jmp .LBB1_5

_Dmain:
    pushl   %ebp
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    subl    $36, %esp
    movl    $1000, 28(%esp)
    movl    $1000, 32(%esp)
    leal    28(%esp), %eax
    movl    %eax, 8(%esp)
    movl    $2, 4(%esp)
    movl    $_D12TypeInfo_AAd6__initZ, (%esp)
    xorl    %esi, %esi
    call    _d_newarraymiT
    movl    %eax, %edi
    .align  16
.LBB2_1:
    movl    4(%edi,%esi,8), %eax
    movl    (%edi,%esi,8), %ecx
    movl    %ecx, 4(%esp)
    movl    %eax, (%esp)
    movl    $1073741824, 12(%esp)
    movl    $0, 8(%esp)
    call    _d_array_init_double
    incl    %esi
    cmpl    $1000, %esi
    jne .LBB2_1
    movl    $1000, 20(%esp)
    movl    $1000, 24(%esp)
    leal    20(%esp), %eax
    movl    %eax, 8(%esp)
    movl    $2, 4(%esp)
    movl    $_D12TypeInfo_AAd6__initZ, (%esp)
    xorl    %esi, %esi
    call    _d_newarraymiT
    movl    %eax, %ebx
    .align  16
.LBB2_3:
    movl    4(%ebx,%esi,8), %eax
    movl    (%ebx,%esi,8), %ecx
    movl    %ecx, 4(%esp)
    movl    %eax, (%esp)
    movl    $1071644672, 12(%esp)
    movl    $0, 8(%esp)
    call    _d_array_init_double
    incl    %esi
    cmpl    $1000, %esi
    jne .LBB2_3
    movl    $1000, 4(%esp)
    movl    $_D11TypeInfo_Ah6__initZ, (%esp)
    call    _d_newarrayT
    movl    %eax, %esi
    movl    %esi, (%esp)
    movl    $1000, 8(%esp)
    movl    $1, 4(%esp)
    call    memset
    pxor    %xmm0, %xmm0
    xorl    %eax, %eax
.LBB2_5:
    movl    4(%ebx,%eax,8), %ecx
    movl    4(%edi,%eax,8), %edx
    pxor    %xmm1, %xmm1
    xorl    %ebp, %ebp
    .align  16
.LBB2_6:
    cmpb    $0, (%esi,%ebp)
    je  .LBB2_8
    movsd   (%edx,%ebp,8), %xmm2
    mulsd   (%ecx,%ebp,8), %xmm2
    addsd   %xmm2, %xmm1
.LBB2_8:
    incl    %ebp
    cmpl    $1000, %ebp
    jne .LBB2_6
    addsd   %xmm1, %xmm0
    incl    %eax
    cmpl    $1000, %eax
    jne .LBB2_5
    movsd   %xmm0, 4(%esp)
    movl    $.str, (%esp)
    call    printf
    xorl    %eax, %eax
    addl    $36, %esp
    popl    %esi
    popl    %edi
    popl    %ebx
    popl    %ebp
    ret $8

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jul 08 2010
next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=4438


Leandro Lucarella <llucax gmail.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |performance
                 CC|                            |llucax gmail.com
           Platform|x86                         |All
             Blocks|                            |859
         OS/Version|Windows                     |All



PDT ---
I'm marking this a a blocker of bug 859 so there is a single bug to track all
the inlining issues. Please do the same if you open more bugs associated to
inlining, or post them directly in bug 859.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jul 08 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=4438


Brad Roberts <braddr puremagic.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |braddr puremagic.com
             Blocks|859                         |



---
undoing false dependency

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Jul 08 2010
prev sibling next sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=4438




PDT ---

 undoing false dependency
Can you elaborate a little on why having bug 859 as a tracker of all missing inline oportunities is a bad thing? Thanks -- Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email ------- You are receiving this mail because: -------
Jul 09 2010
prev sibling parent d-bugmail puremagic.com writes:
http://d.puremagic.com/issues/show_bug.cgi?id=4438




A different case of missed inlining. In the following code isValidMove() is not
inlined by DMD 2.052 (with -O -inline -release), despite this function contains
no loop and no throws. Removing the "ref" for the board array in the
isValidMove() signature changes nothing.

The runtime is 12.6 seconds, the alternative version of the knightsTour()
function with manually inlined isValidMove() (currently commented out below)
runs in 8.0 seconds. My tests show that GCC -O3 is able to inline similar code
written in C.



// Code adapted from:
// http://en.literateprograms.org/The_Knight's_Tour_(C)?oldid=14704
import core.stdc.stdio: printf;

template TypeTuple(T...) {
    alias T TypeTuple;
}

bool isValidMove(int N)(ref int[N][N] board, int xpos, int ypos, int nmove) {
    if (xpos < 0 || xpos >= N || ypos < 0 || ypos >= N ||
        (board[xpos][ypos] && board[xpos][ypos] < nmove))
        return false;
    return true;
}

// run time: 12.6 seconds
bool knightsTour(int N)(ref int[N][N] board, int xpos, int ypos, int nmove) {
    if (!isValidMove(board, xpos, ypos, nmove))
        return false;

    board[xpos][ypos] = nmove;

    if (nmove == N * N)
        return true;

    alias TypeTuple!(+1, -2, -2, -1, -1, +2, +2, +1) spx;
    alias TypeTuple!(+2, +1, -1, +2, -2, +1, -1, -2) spy;
    static assert(spx.length == spy.length);
    bool ok = true;
    foreach (i, sx; spx)
        ok = ok && !knightsTour(board, xpos + sx, ypos + spy[i], nmove + 1);
    if (ok) {
        board[xpos][ypos] = N * N;
        return false;
    }

    return true;
}

/*
// run time:  8.0 seconds
bool knightsTour(int N)(ref int[N][N] board, int xpos, int ypos, int nmove) {
    // if is not valid move
    if (xpos < 0 || xpos >= N || ypos < 0 || ypos >= N ||
        (board[xpos][ypos] && board[xpos][ypos] < nmove))
        return false;

    board[xpos][ypos] = nmove;

    if (nmove == N * N)
        return true;

    alias TypeTuple!(+1, -2, -2, -1, -1, +2, +2, +1) spx;
    alias TypeTuple!(+2, +1, -1, +2, -2, +1, -1, -2) spy;
    static assert(spx.length == spy.length);
    bool ok = true;
    foreach (i, sx; spx)
        ok = ok && !knightsTour(board, xpos + sx, ypos + spy[i], nmove + 1);
    if (ok) {
        board[xpos][ypos] = N * N;
        return false;
    }

    return true;
}
*/

void main() {
    enum int side = 8;
    enum int xpos = 0;
    enum int ypos = xpos;

    int[side][side] board;
    printf("Executing Knight's Tour...\n");

    if (knightsTour(board, xpos, ypos, 1)) {
        printf("Solution found:\n");
        foreach (ref row; board) {
            foreach (item; row)
                printf("%3d", item);
            printf("\n");
        }
    } else
        printf("No solution found.\n");
}


The first part of the assembly of knightsTour() shows the call to
isValidMove():

_D4test20__T11knightsTourVk8Z11knightsTourFKG8G8iiiiZb  comdat
    sub ESP,02Ch
    push EBX
    push EBP
    push ESI
    mov ESI,044h[ESP]
    push EDI
    mov 038h[ESP],EAX
    push ESI
    push dword ptr 048h[ESP]
    push dword ptr 048h[ESP]
    call near ptr _D4test20__T11isValidMoveVk8Z11isValidMoveFKG8G8iiiiZb
    xor AL,1
    je  L2D
    pop EDI
    ...

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
Apr 14 2011