www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.learn - Array assign

reply bearophile <bearophileHUGS lycos.com> writes:
I've found that when lenght is small (about 50-250 4-byte items or less) the
array filling operation is quite slow compared to a normal loop:
a[] = x;

So I suggest DMD frontend to inline little loop when a.length is small.

To further improve the a[]=x; I have tried to speed up the larger case too, so
I've used the movntps instruction. See info:

http://www.gamedev.net/community/forums/topic.asp?topic_id=532112

http://www.sesp.cse.clrc.ac.uk/html/SoftwareTools/vtune/users_guide/mergedProjects/analyzer_ec/mergedProjects/reference_olh/mergedProjects/instructions/instruct32_hh/vc197.htm

The following is my attempt, my experience of x86 asm programming is minimal
still (I'll try to learn more) so while this program seems to somehow work,
it's a pessimization compared to a normal loop, so probably I've done several
mistakes/bugs. Can someone help me improve the code? If the result is fast
enough, something similar can be added to d runtime.

This array4Set() is designed for 4 bytes long items, but I can write something
similar for other data sizes too.


version (Tango) {
    import tango.stdc.stdio: printf;
    import tango.stdc.time: clock, CLOCKS_PER_SEC;
    import tango.math.Math: sqrt;
    import tango.stdc.string: memset;
} else {
    import std.c.stdio: printf;
    import std.c.time: clock, CLOCKS_PER_SEC;
    import std.math: sqrt;
    import std.c.string: memset;
}


double myclock() {
    return cast(double)clock() / CLOCKS_PER_SEC;
}


void array4Set(T)(T[] a, T value) {
    static assert(T.sizeof == 4);
    if (!a.length)
        return;
    auto a_ptr = a.ptr;
    auto a_end = a_ptr + a.length;

    // align pointer
    size_t aux = ((cast(size_t)a_ptr + 15) & ~15);
    auto n = cast(T*)aux;
    while (a_ptr < n)
        *a_ptr++ = value;
    n = cast(T*)((cast(size_t)a_end) & ~15);

    if (a_ptr < n && (a_end - a_ptr) >= 16) {
        // Aligned case
        asm {
            mov ESI, a_ptr;
            mov EDI, n;

            movss XMM0, value; // XMM0 = value,value,value,value
            shufps XMM0, XMM0, 0;

            align 8;
            LOOP:
                add ESI, 64;
                movntps [ESI+ 0-64], XMM0;
                movntps [ESI+16-64], XMM0;
                movntps [ESI+32-64], XMM0;
                movntps [ESI+48-64], XMM0;
                cmp ESI, EDI;
            jb LOOP;

            mov a_ptr, ESI;
        }
    }

    // trailing ones
    while (a_ptr < a_end)
        *a_ptr++ = value;
}


T test(T, bool withLoop)(int len, int nloops) {
    auto a = new T[len];

    for (int i; i < nloops; i++) {
        static if (withLoop)
            for (int j; j < a.length; j++)
                a[j] = T.init;
        else
            //a[] = T.init;
            //memset(a.ptr, 0, a.length * T.sizeof);
            array4Set(a, 0);

        a[i % len] = i;
    }

    return a[0];
}


void show(float[] a) {
    printf("[");

    if (a.length > 1)
        foreach (el; a[0 .. $-1])
            printf("%.1f, ", el);
    if (a.length)
        printf("%.1f", a[$-1]);
    printf("]\n");
}

void main() {
    // small test
    auto a = new float[20];
    foreach (i, ref el; a)
        el = i;

    // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    show(a);

    array4Set(a[2 .. 2], 0.5f);

    // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    show(a);

    array4Set(a[2 .. 9], 0.5f);

    // [0, 1, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]
    show(a);
    printf("---------------------\n\n");

    auto lens = [2, 4, 5, 8, 15, 16, 25, 32, 50, 64, 100, 250, 256, 1000, 2048,
2500];
    //auto lens = [1 << 12, 1 << 13, 1 << 15, 1 << 16, 1 << 17];
    foreach (len; lens) {
        int nloops = cast(int)(cast(double)60_000_000 / sqrt(cast(double)len));
        auto t0 = myclock();

        alias int T;
        test!(T, true)(len, nloops);
        auto t1 = myclock();

        auto t2 = myclock();
        test!(T, false)(len, nloops);
        auto t3 = myclock();

        printf("len=%d, nloops=%d, time with loop=%.3f, time without loop=%.3f,
ratio=%.3f\n",
               len, nloops, t1-t0, t3-t2, (t1-t0) / (t3-t2));
    }
}

Bye,
bearophile
Sep 18 2009
parent bearophile <bearophileHUGS lycos.com> writes:
I have done more experiments, it seems movntps instruction gives a performance
gain only when the array is longer than about 200_000 ints (celeron CPU).
For [25, 200_000] integers the movaps is better (and better than C memset).
For n < 25 the best thing I've found is just an inlined loop.

I'll post some code in the main D newsgroup in a short time.

Bye,
bearophile
Sep 19 2009