[IA64] fsys_getcpu for IA64

On a 1.6GHz Montecito Tiger4, the following performance data was measured with
a kernel built from defconfig, which has NUMA configured:

Fastest sys_getcpu: 502 itc counts.
Fastest fsys_getcpu: 28 itc counts.

fsys_getcpu performance is largely impacted by whether the data it reads
(cpu_to_node_map etc.) is in cache. It can take fsys_getcpu up to ~150 itc
counts in the cold-cache case.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
This commit is contained in:
Fenghua Yu 2007-02-12 16:27:10 -08:00 committed by Tony Luck
parent ddbad07630
commit 3bc207d2b7
2 changed files with 106 additions and 0 deletions

View File

@ -35,6 +35,7 @@ void foo(void)
BLANK();
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
BLANK();

View File

@ -10,6 +10,8 @@
* probably broke it along the way... ;-)
* 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make
* it capable of using memory based clocks without falling back to C code.
* 08-Feb-07 Fenghua Yu Implement fsys_getcpu.
*
*/
#include <asm/asmmacro.h>
@ -505,6 +507,59 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
#endif
END(fsys_rt_sigprocmask)
/*
 * fsys_getcpu: light-weight getcpu(cpu, node, unused).
 *
 * fsys_getcpu doesn't use the third parameter in this implementation. It reads
 * current_thread_info()->cpu and corresponding node in cpu_to_node_map.
 *
 * In:
 *	r32	user pointer that receives the CPU number (32 bits); may be NULL
 *	r33	user pointer that receives the node number (32 bits); may be NULL
 *	r16	thread_info is addressed at r16 + IA64_TASK_SIZE
 * Out:
 *	r8	0 on success
 *
 * Exits other than FSYS_RETURN:
 *	.fail_einval		r32 or r33 is NaT
 *	.fail_efault		a probe of, or store through, r32/r33 faulted
 *	fsys_fallback_syscall	a TIF_ALLWORK_MASK bit is set in thread_info->flags
 *
 * A NULL cpu or node pointer is legal for getcpu: nothing is written for that
 * argument. p6/p7 are therefore recycled after the NaT checks to mean
 * "r32 != NULL" / "r33 != NULL" and predicate every access to user memory.
 *
 * Without CONFIG_NUMA the node number reported is always 0.
 */
ENTRY(fsys_getcpu)
	.prologue
	.altrp b6
	.body
	;;
	add r2=TI_FLAGS+IA64_TASK_SIZE,r16
	tnat.nz p6,p0 = r32			// guard against NaT argument
	add r3=TI_CPU+IA64_TASK_SIZE,r16
	;;
	ld4 r3=[r3]				// M r3 = thread_info->cpu
	ld4 r2=[r2]				// M r2 = thread_info->flags
(p6)	br.cond.spnt.few .fail_einval		// B
	;;
	tnat.nz p7,p0 = r33			// I guard against NaT argument
(p7)	br.cond.spnt.few .fail_einval		// B
	;;
	// Both NaT branches fell through, so p6/p7 are free for reuse.
	cmp.ne p6,p0=r32,r0			// p6 = (cpu pointer != NULL)
	cmp.ne p7,p0=r33,r0			// p7 = (node pointer != NULL)
	;;
#ifdef CONFIG_NUMA
	movl r17=cpu_to_node_map
	;;
EX(.fail_efault, (p6) probe.w.fault r32, 3)	// M This takes 5 cycles
EX(.fail_efault, (p7) probe.w.fault r33, 3)	// M This takes 5 cycles
	shladd r18=r3,1,r17			// r18 = &cpu_to_node_map[cpu] (2-byte entries)
	;;
	ld2 r20=[r18]				// r20 = cpu_to_node_map[cpu], zero-extended
	and r2 = TIF_ALLWORK_MASK,r2
	;;
	cmp.ne p8,p0=0,r2
(p8)	br.spnt.many fsys_fallback_syscall	// pending work: take the slow path
	;;
EX(.fail_efault, (p6) st4 [r32] = r3)
	// The node argument is an unsigned int: store all 32 bits so the upper
	// half of the caller's variable is not left with stale contents.
EX(.fail_efault, (p7) st4 [r33] = r20)
	mov r8=0
	;;
#else
EX(.fail_efault, (p6) probe.w.fault r32, 3)	// M This takes 5 cycles
EX(.fail_efault, (p7) probe.w.fault r33, 3)	// M This takes 5 cycles
	and r2 = TIF_ALLWORK_MASK,r2
	;;
	cmp.ne p8,p0=0,r2
(p8)	br.spnt.many fsys_fallback_syscall	// pending work: take the slow path
	;;
EX(.fail_efault, (p6) st4 [r32] = r3)
	// Non-NUMA: node is always 0; write the full 32-bit unsigned int.
EX(.fail_efault, (p7) st4 [r33] = r0)
	mov r8=0
	;;
#endif
	FSYS_RETURN
END(fsys_getcpu)
ENTRY(fsys_fallback_syscall)
.prologue
.altrp b6
@ -878,6 +933,56 @@ fsyscall_table:
data8 0 // timer_delete
data8 0 // clock_settime
data8 fsys_clock_gettime // clock_gettime
data8 0 // clock_getres // 1255
data8 0 // clock_nanosleep
data8 0 // fstatfs64
data8 0 // statfs64
data8 0 // mbind
data8 0 // get_mempolicy // 1260
data8 0 // set_mempolicy
data8 0 // mq_open
data8 0 // mq_unlink
data8 0 // mq_timedsend
data8 0 // mq_timedreceive // 1265
data8 0 // mq_notify
data8 0 // mq_getsetattr
data8 0 // kexec_load
data8 0 // vserver
data8 0 // waitid // 1270
data8 0 // add_key
data8 0 // request_key
data8 0 // keyctl
data8 0 // ioprio_set
data8 0 // ioprio_get // 1275
data8 0 // move_pages
data8 0 // inotify_init
data8 0 // inotify_add_watch
data8 0 // inotify_rm_watch
data8 0 // migrate_pages // 1280
data8 0 // openat
data8 0 // mkdirat
data8 0 // mknodat
data8 0 // fchownat
data8 0 // futimesat // 1285
data8 0 // newfstatat
data8 0 // unlinkat
data8 0 // renameat
data8 0 // linkat
data8 0 // symlinkat // 1290
data8 0 // readlinkat
data8 0 // fchmodat
data8 0 // faccessat
data8 0
data8 0 // 1295
data8 0 // unshare
data8 0 // splice
data8 0 // set_robust_list
data8 0 // get_robust_list
data8 0 // sync_file_range // 1300
data8 0 // tee
data8 0 // vmsplice
data8 0
data8 fsys_getcpu // getcpu // 1304
// fill in zeros for the remaining entries
.zero: