diff options
author | Jeff Johnston <jjohnstn@redhat.com> | 2008-05-26 23:23:15 +0000 |
---|---|---|
committer | Jeff Johnston <jjohnstn@redhat.com> | 2008-05-26 23:23:15 +0000 |
commit | a6bd72a27873294887681d3bd102d848e5777e2c (patch) | |
tree | 4da6a66d14c0993b5445d9bf6c5df596b72c47ed /newlib/libc/machine/i386 | |
parent | cae28869c106eb342dd5a1c8242f933efab6f772 (diff) | |
download | cygnal-a6bd72a27873294887681d3bd102d848e5777e2c.tar.gz cygnal-a6bd72a27873294887681d3bd102d848e5777e2c.tar.bz2 cygnal-a6bd72a27873294887681d3bd102d848e5777e2c.zip |
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized. Prefer
8-byte over 4-byte alignment. Reduce register pressure.
Diffstat (limited to 'newlib/libc/machine/i386')
-rw-r--r-- | newlib/libc/machine/i386/memset.S | 68 |
1 file changed, 54 insertions(+), 14 deletions(-)
diff --git a/newlib/libc/machine/i386/memset.S b/newlib/libc/machine/i386/memset.S
index ce40820ff..36637fc21 100644
--- a/newlib/libc/machine/i386/memset.S
+++ b/newlib/libc/machine/i386/memset.S
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
 	pushl ebp
 	movl esp,ebp
 	pushl edi
-	pushl ebx
 	movl 8(ebp),edi
 	movl 12(ebp),eax
 	movl 16(ebp),ecx
 	cld
 
 #ifndef __OPTIMIZE_SIZE__
-	andl $255,eax
-	movl ecx,ebx
-	testl $3,edi
-	jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop.  */
 	cmpl $16,ecx
 	jbe .L19
+	cbw
+	testl $7,edi
+	je .L10
 
-	movl eax,edx
-	sall $8,eax
-	orl edx,eax
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+   4-byte aligned on some x86 platforms.  */
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+
+/* At this point, ecx>8 and edi%8==0.  */
+.L10:
+	movb al,ah
 	movl eax,edx
 	sall $16,edx
 	orl edx,eax
+	movl ecx,edx
 	shrl $2,ecx
-	andl $3,ebx
+	andl $3,edx
 	rep stosl
-	movl ebx,ecx
+	movl edx,ecx
 #endif /* not __OPTIMIZE_SIZE__ */
-	
+
 .L19:
 	rep stosb
 	movl 8(ebp),eax
-	leal -8(ebp),esp
-	popl ebx
+	leal -4(ebp),esp
 	popl edi
 	leave
 	ret