Skip to content

Commit 7a0268f

Browse files
antonblanchard authored and paulusmack committed
[PATCH] powerpc/64: per cpu data optimisations
The current ppc64 per cpu data implementation is quite slow. e.g.:

    lhz 11,18(13)            /* smp_processor_id() */
    ld 9,.LC63-.LCTOC1(30)   /* per_cpu__variable_name */
    ld 8,.LC61-.LCTOC1(30)   /* __per_cpu_offset */
    sldi 11,11,3             /* form index into __per_cpu_offset */
    mr 10,9
    ldx 9,11,8               /* __per_cpu_offset[smp_processor_id()] */
    ldx 0,10,9               /* load per cpu data */

5 loads for something that is supposed to be fast, pretty awful. One reason for the large number of loads is that we have to synthesize 2 64bit constants (per_cpu__variable_name and __per_cpu_offset).

By putting __per_cpu_offset into the paca we can avoid the 2 loads associated with it:

    ld 11,56(13)             /* paca->data_offset */
    ld 9,.LC59-.LCTOC1(30)   /* per_cpu__variable_name */
    ldx 0,9,11               /* load per cpu data */

Longer term we should be able to do even better than 3 loads. If per_cpu__variable_name wasn't a 64bit constant and paca->data_offset was in a register we could cut it down to one load. A suggestion from Rusty is to use gcc's __thread extension here. In order to do this we would need to free up r13 (the __thread register and where the paca currently is). So far I've had a few unsuccessful attempts at doing that :)

The patch also allocates per cpu memory node local on NUMA machines. This patch from Rusty has been sitting in my queue _forever_ but stalled when I hit the compiler bug. Sorry about that.

Finally I also only allocate per cpu data for possible cpus, which comes straight out of the x86-64 port. On a pseries kernel (with NR_CPUS == 128) and 4 possible cpus we see some nice gains:

             total       used       free     shared    buffers     cached
Mem:       4012228     212860    3799368          0          0     162424

             total       used       free     shared    buffers     cached
Mem:       4016200     212984    3803216          0          0     162424

A saving of 3.75MB. Quite nice for smaller machines.

Note: we now have to be careful of per cpu users that touch data for !possible cpus.
At this stage it might be worth making the NUMA and possible cpu optimisations generic, but per cpu init is done so early we have to be careful that all architectures have their possible map setup correctly. Signed-off-by: Anton Blanchard <[email protected]> Signed-off-by: Paul Mackerras <[email protected]>
1 parent 193cac9 commit 7a0268f

File tree

3 files changed

+83
-0
lines changed

3 files changed

+83
-0
lines changed

arch/powerpc/kernel/setup_64.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include <linux/unistd.h>
3434
#include <linux/serial.h>
3535
#include <linux/serial_8250.h>
36+
#include <linux/bootmem.h>
3637
#include <asm/io.h>
3738
#include <asm/kdump.h>
3839
#include <asm/prom.h>
@@ -654,3 +655,28 @@ void cpu_die(void)
654655
if (ppc_md.cpu_die)
655656
ppc_md.cpu_die();
656657
}
658+
659+
#ifdef CONFIG_SMP
660+
void __init setup_per_cpu_areas(void)
661+
{
662+
int i;
663+
unsigned long size;
664+
char *ptr;
665+
666+
/* Copy section for each CPU (we discard the original) */
667+
size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
668+
#ifdef CONFIG_MODULES
669+
if (size < PERCPU_ENOUGH_ROOM)
670+
size = PERCPU_ENOUGH_ROOM;
671+
#endif
672+
673+
for_each_cpu(i) {
674+
ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
675+
if (!ptr)
676+
panic("Cannot allocate cpu data for CPU %d\n", i);
677+
678+
paca[i].data_offset = ptr - __per_cpu_start;
679+
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
680+
}
681+
}
682+
#endif

include/asm-powerpc/paca.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ struct paca_struct {
6565
u64 stab_real; /* Absolute address of segment table */
6666
u64 stab_addr; /* Virtual address of segment table */
6767
void *emergency_sp; /* pointer to emergency stack */
68+
u64 data_offset; /* per cpu data offset */
6869
s16 hw_cpu_id; /* Physical processor number */
6970
u8 cpu_start; /* At startup, processor spins until */
7071
/* this becomes non-zero. */

include/asm-powerpc/percpu.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,57 @@
1+
#ifndef _ASM_POWERPC_PERCPU_H_
2+
#define _ASM_POWERPC_PERCPU_H_
3+
#ifdef __powerpc64__
4+
#include <linux/compiler.h>
5+
6+
/*
7+
* Same as asm-generic/percpu.h, except that we store the per cpu offset
8+
* in the paca. Based on the x86-64 implementation.
9+
*/
10+
11+
#ifdef CONFIG_SMP
12+
13+
#include <asm/paca.h>
14+
15+
#define __per_cpu_offset(cpu) (paca[cpu].data_offset)
16+
#define __my_cpu_offset() get_paca()->data_offset
17+
18+
/* Separate out the type, so (int[3], foo) works. */
19+
#define DEFINE_PER_CPU(type, name) \
20+
__attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
21+
22+
/* var is in discarded region: offset to particular copy we want */
23+
#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
24+
#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
25+
26+
/* A macro to avoid #include hell... */
27+
#define percpu_modcopy(pcpudst, src, size) \
28+
do { \
29+
unsigned int __i; \
30+
for (__i = 0; __i < NR_CPUS; __i++) \
31+
if (cpu_possible(__i)) \
32+
memcpy((pcpudst)+__per_cpu_offset(__i), \
33+
(src), (size)); \
34+
} while (0)
35+
36+
extern void setup_per_cpu_areas(void);
37+
38+
#else /* ! SMP */
39+
40+
#define DEFINE_PER_CPU(type, name) \
41+
__typeof__(type) per_cpu__##name
42+
43+
#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var))
44+
#define __get_cpu_var(var) per_cpu__##var
45+
46+
#endif /* SMP */
47+
48+
#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
49+
50+
#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
51+
#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
52+
53+
#else
154
#include <asm-generic/percpu.h>
55+
#endif
56+
57+
#endif /* _ASM_POWERPC_PERCPU_H_ */

0 commit comments

Comments
 (0)