看板 DFBSD_submit 關於我們 聯絡資訊
--xHFwDpU9dbj6ez1V Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Hi all, as promised on commits@, here is a generic 64-bit add operator for Pentium+ and the necessary change for gencount_inc. Also attached is a small hack for cpuperf used for the numbers below. The good news is that gencount_inc can be made critical-section free; the bad news is the performance of cmpxchg8b on P4. Like so many other ops, it totally sucks. My P4 notebook: 115.857ns/loop for cmpxchg8b, compared to 7.517ns/loop for cmpxchg. Leaf (AMD64): 6.788ns/loop for cmpxchg8b, compared to 1.293ns/loop for cmpxchg. Intel sucks. Conclusion: The overhead on AMD64 is much less and seems completely acceptable; for P4 it depends. Matt, what's the speed of critical sections on P4? I'd like to get some numbers for other processors as well. Joerg --xHFwDpU9dbj6ez1V Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="atomic64.diff" Index: atomic.h =================================================================== RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/atomic.h,v retrieving revision 1.8 diff -u -r1.8 atomic.h --- atomic.h 29 Jul 2004 20:31:13 -0000 1.8 +++ atomic.h 9 Dec 2004 02:26:30 -0000 @@ -159,4 +159,26 @@ #endif +#if defined(I586_CPU) || defined(I686_CPU) +static __inline +void +atomic_add_long_long(unsigned long long *p, unsigned long long v) +{ + __asm __volatile( + "1:\n" + "\tmovl %0, %%eax\n" + "\taddl %%eax, %%ebx\n" + "\tmovl 4+%0, %%edx\n" + "\taddl %%edx, %%ecx\n" + "\tjnc 2f\n" + "\tincl %%ebx\n" + "2:\n" + "\tlock; cmpxchg8b %0\n" + "\tjnz 1b" + : + : "m" (*p), "b" ((u_long)v), "c" ((u_long)(v >> 32)) + : "memory", "ax", "dx"); +} +#endif + #endif /* ! 
_MACHINE_ATOMIC_H_ */ Index: gencount.h =================================================================== RCS file: /home/joerg/wd/repository/dragonflybsd/src/sys/i386/include/gencount.h,v retrieving revision 1.1 diff -u -r1.1 gencount.h --- gencount.h 8 Dec 2004 23:19:51 -0000 1.1 +++ gencount.h 9 Dec 2004 02:39:04 -0000 @@ -41,6 +41,8 @@ #error "no user-servicable parts inside" #endif +#include <machine/md_var.h> +#include <machine/specialreg.h> #include <sys/types.h> #include <sys/thread2.h> @@ -60,6 +62,12 @@ static __inline void gencount_inc(gencount_t *gencnt) { +#if defined(I586_CPU) || defined(I686_CPU) + if (cpu_feature & CPUID_CX8) { + atomic_add_long_long((uint64_t *)gencnt, 1); + return; + } +#endif crit_enter(); if (++gencnt->high == 0) ++gencnt->low; --xHFwDpU9dbj6ez1V Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="cpuperf.diff" Index: Makefile =================================================================== RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/Makefile,v retrieving revision 1.2 diff -u -r1.2 Makefile --- Makefile 9 Feb 2004 18:15:35 -0000 1.2 +++ Makefile 9 Dec 2004 02:43:30 -0000 @@ -4,7 +4,7 @@ .PATH: ${.CURDIR}/../sysperf TARGETS=/tmp/cpu_add /tmp/cpu_ladd /tmp/cpu_cmpadd /tmp/cpu_cmpexg \ - /tmp/cpu_lcmpexg /tmp/cpu_call + /tmp/cpu_lcmpexg /tmp/cpu_call /tmp/cpu_cmpexg8b /tmp/cpu_lcmpexg8b CFLAGS= -O2 -g -I../sysperf @@ -22,9 +22,15 @@ /tmp/cpu_cmpexg: cputest.c cpu_cmpexg.S blib.c $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET} +/tmp/cpu_cmpexg8b: cputest.c cpu_cmpexg8b.S blib.c + $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET} + /tmp/cpu_lcmpexg: cputest.c cpu_lcmpexg.S blib.c $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET} +/tmp/cpu_lcmpexg8b: cputest.c cpu_lcmpexg8b.S blib.c + $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET} + /tmp/cpu_call: cputest.c cpu_call.S blib.c $(CC) $(CFLAGS) ${.ALLSRC} -o ${.TARGET} Index: cpu_cmpexg8b.S =================================================================== 
RCS file: cpu_cmpexg8b.S diff -N cpu_cmpexg8b.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ cpu_cmpexg8b.S 9 Dec 2004 02:41:38 -0000 @@ -0,0 +1,31 @@ +/* $DragonFly: src/test/cpuperf/cpu_cmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */ + + .globl test_dummy + .globl test_load + .globl test_str + + .p2align 5 +test_dummy: + movl 4(%esp),%ecx + movl $0,%edx + movl $1,%eax + movl %ebx,%edx + movl $0,(%ecx) + addl $3,%eax + ret + + .p2align 5 +test_load: + movl 4(%esp),%ecx + movl $0,%edx + movl $1,%eax + cmpxchg8b (%ecx) /* instruction under test */ + movl %ebx,%edx + movl $0,(%ecx) + addl $3,%eax + ret + + .p2align 5 +test_str: + .asciz "non-locked cmpxchg8b (successful exchange) in pipeline" + Index: cpu_lcmpexg8b.S =================================================================== RCS file: cpu_lcmpexg8b.S diff -N cpu_lcmpexg8b.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ cpu_lcmpexg8b.S 9 Dec 2004 02:43:02 -0000 @@ -0,0 +1,31 @@ +/* $DragonFly: src/test/cpuperf/cpu_lcmpexg.S,v 1.1 2004/02/09 18:08:54 dillon Exp $ */ + + .globl test_dummy + .globl test_load + .globl test_str + + .p2align 5 +test_dummy: + movl 4(%esp),%ecx + movl $0,%edx + movl $1,%eax + movl %ebx,%edx + movl $0,(%ecx) + addl $3,%eax + ret + + .p2align 5 +test_load: + movl 4(%esp),%ecx + movl $0,%edx + movl $1,%eax + lock; cmpxchg8b (%ecx) /* instruction under test */ + movl %ebx,%edx + movl $0,(%ecx) + addl $3,%eax + ret + + .p2align 5 +test_str: + .asciz "bus-locked cmpxchg8b (successful exchange) in pipeline" + Index: cputest.c =================================================================== RCS file: /home/joerg/wd/repository/dragonflybsd/src/test/cpuperf/cputest.c,v retrieving revision 1.1 diff -u -r1.1 cputest.c --- cputest.c 9 Feb 2004 18:08:54 -0000 1.1 +++ cputest.c 9 Dec 2004 02:48:36 -0000 @@ -11,7 +11,7 @@ extern char test_str[]; -int junk; +int junk[10]; int main(int ac, char **av) @@ -24,19 +24,19 @@ printf("CPUTEST %s\n", test_str); start_timing(); for (i = 0; ; ++i) { - 
test_load(&junk); + test_load(junk); if ((i & 65535) == 0 && get_timing() > 1000000) break; } ttl = i * 4; start_timing(); for (i = 0; i < ttl; ++i) { - test_dummy(&junk); + test_dummy(junk); } us1 = get_timing(); start_timing(); for (i = 0; i < ttl; ++i) { - test_load(&junk); + test_load(junk); } us2 = get_timing(); stop_timing2(ttl, us2 - us1, "instruction overhead:"); --xHFwDpU9dbj6ez1V--