
From: Manfred Spraul <manfred@colorfullife.com>

The attached patch adds a simple kmem_cache_alloc_node() function that
allocates memory on a given node.  The function is intended for
cpu-bound structures; it is used for alloc_percpu and for the
slab-internal per-cpu structures.  Jack Steiner reported a ~3%
performance increase for AIM7 on a 64-way Itanium 2.

Port maintainers: the patch could cause problems if CPU_UP_PREPARE is
called for a cpu on a node before that node's memory is attached, or if
alloc_pages_node does not fall back to another node when the requested
node has no memory.  I don't think any port does that, but I'm not
sure.


---
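
For illustration, a caller that wants node-local memory for a cpu-bound
structure would use the new interface roughly as follows (a sketch
only, not part of the patch; 'my_cache' and 'my_struct' are made-up
names):

	/* Sketch: place a per-cpu object on the cpu's home node,
	 * falling back to an ordinary allocation if the node-local
	 * attempt fails.
	 */
	struct my_struct *p;

	p = kmem_cache_alloc_node(my_cache, cpu_to_node(cpu));
	if (!p)
		p = kmem_cache_alloc(my_cache, GFP_KERNEL);

alloc_arraycache() in the diff below follows the same pattern, falling
back to a plain kmalloc() when the node-local allocation fails.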

 25-akpm/include/linux/slab.h |    1 
 25-akpm/mm/slab.c            |  201 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 151 insertions(+), 51 deletions(-)

diff -puN include/linux/slab.h~add-kmem_cache_alloc_node include/linux/slab.h
--- 25/include/linux/slab.h~add-kmem_cache_alloc_node	2004-05-15 23:51:38.973971776 -0700
+++ 25-akpm/include/linux/slab.h	2004-05-15 23:51:38.978971016 -0700
@@ -61,6 +61,7 @@ extern kmem_cache_t *kmem_cache_create(c
 extern int kmem_cache_destroy(kmem_cache_t *);
 extern int kmem_cache_shrink(kmem_cache_t *);
 extern void *kmem_cache_alloc(kmem_cache_t *, int);
+extern void *kmem_cache_alloc_node(kmem_cache_t *, int);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 
diff -puN mm/slab.c~add-kmem_cache_alloc_node mm/slab.c
--- 25/mm/slab.c~add-kmem_cache_alloc_node	2004-05-15 23:51:38.975971472 -0700
+++ 25-akpm/mm/slab.c	2004-05-15 23:51:38.982970408 -0700
@@ -612,6 +612,26 @@ static void stop_cpu_timer(int cpu)
 }
 #endif
 
+static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount)
+{
+	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
+	struct array_cache *nc = NULL;
+
+	if (cpu != -1) {
+		nc = kmem_cache_alloc_node(kmem_find_general_cachep(memsize,
+					GFP_KERNEL), cpu_to_node(cpu));
+	}
+	if (!nc)
+		nc = kmalloc(memsize, GFP_KERNEL);
+	if (nc) {
+		nc->avail = 0;
+		nc->limit = entries;
+		nc->batchcount = batchcount;
+		nc->touched = 0;
+	}
+	return nc;
+}
+
 static int __devinit cpuup_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
@@ -623,17 +643,11 @@ static int __devinit cpuup_callback(stru
 	case CPU_UP_PREPARE:
 		down(&cache_chain_sem);
 		list_for_each_entry(cachep, &cache_chain, next) {
-			int memsize;
 			struct array_cache *nc;
 
-			memsize = sizeof(void*)*cachep->limit+sizeof(struct array_cache);
-			nc = kmalloc(memsize, GFP_KERNEL);
+			nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
 			if (!nc)
 				goto bad;
-			nc->avail = 0;
-			nc->limit = cachep->limit;
-			nc->batchcount = cachep->batchcount;
-			nc->touched = 0;
 
 			spin_lock_irq(&cachep->spinlock);
 			cachep->array[cpu] = nc;
@@ -829,23 +843,32 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static inline void *kmem_getpages(kmem_cache_t *cachep, unsigned long flags)
+static void *kmem_getpages(kmem_cache_t *cachep, int flags, int nodeid)
 {
+	struct page *page;
 	void *addr;
+	int i;
 
 	flags |= cachep->gfpflags;
-	addr = (void*)__get_free_pages(flags, cachep->gfporder);
-	if (addr) {
-		int i = (1 << cachep->gfporder);
-		struct page *page = virt_to_page(addr);
-
-		if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-			atomic_add(i, &slab_reclaim_pages);
-		add_page_state(nr_slab, i);
-		while (i--) {
-			SetPageSlab(page);
-			page++;
-		}
+	if (likely(nodeid == -1)) {
+		addr = (void*)__get_free_pages(flags, cachep->gfporder);
+		if (!addr)
+			return NULL;
+		page = virt_to_page(addr);
+	} else {
+		page = alloc_pages_node(nodeid, flags, cachep->gfporder);
+		if (!page)
+			return NULL;
+		addr = page_address(page);
+	}
+
+	i = (1 << cachep->gfporder);
+	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+		atomic_add(i, &slab_reclaim_pages);
+	add_page_state(nr_slab, i);
+	while (i--) {
+		SetPageSlab(page);
+		page++;
 	}
 	return addr;
 }
@@ -1652,6 +1675,21 @@ static void kmem_flagcheck(kmem_cache_t 
 	}
 }
 
+static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
+{
+	int i;
+	struct page *page;
+
+	/* Nasty!!!!!! I hope this is OK. */
+	i = 1 << cachep->gfporder;
+	page = virt_to_page(objp);
+	do {
+		SET_PAGE_CACHE(page, cachep);
+		SET_PAGE_SLAB(page, slabp);
+		page++;
+	} while (--i);
+}
+
 /*
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
@@ -1659,10 +1697,9 @@ static void kmem_flagcheck(kmem_cache_t 
 static int cache_grow (kmem_cache_t * cachep, int flags)
 {
 	struct slab	*slabp;
-	struct page	*page;
 	void		*objp;
 	size_t		 offset;
-	unsigned int	 i, local_flags;
+	int		 local_flags;
 	unsigned long	 ctor_flags;
 
 	/* Be lazy and only check for valid flags here,
@@ -1708,21 +1745,14 @@ static int cache_grow (kmem_cache_t * ca
 
 
 	/* Get mem for the objs. */
-	if (!(objp = kmem_getpages(cachep, flags)))
+	if (!(objp = kmem_getpages(cachep, flags, -1)))
 		goto failed;
 
 	/* Get slab management. */
 	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
 		goto opps1;
 
-	/* Nasty!!!!!! I hope this is OK. */
-	i = 1 << cachep->gfporder;
-	page = virt_to_page(objp);
-	do {
-		SET_PAGE_CACHE(page, cachep);
-		SET_PAGE_SLAB(page, slabp);
-		page++;
-	} while (--i);
+	set_slab_attr(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp, ctor_flags);
 
@@ -2239,6 +2269,81 @@ out:
 }
 
 /**
+ * kmem_cache_alloc_node - Allocate an object on the specified node
+ * @cachep: The cache to allocate from.
+ * @nodeid: node number of the target node.
+ *
+ * Identical to kmem_cache_alloc, except that this function is slow
+ * and can sleep: it always allocates with GFP_KERNEL and bypasses
+ * the per-cpu caches.  It allocates memory on the given node, which
+ * can improve the performance of cpu-bound structures.
+ */
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
+{
+	size_t offset;
+	void *objp;
+	struct slab *slabp;
+	kmem_bufctl_t next;
+
+	/* The main algorithms are not node aware, thus we have to cheat:
+	 * We bypass all caches and allocate a new slab.
+	 * The following code is a streamlined copy of cache_grow().
+	 */
+
+	/* Get colour for the slab, and update the next value. */
+	spin_lock_irq(&cachep->spinlock);
+	offset = cachep->colour_next;
+	cachep->colour_next++;
+	if (cachep->colour_next >= cachep->colour)
+		cachep->colour_next = 0;
+	offset *= cachep->colour_off;
+	spin_unlock_irq(&cachep->spinlock);
+
+	/* Get mem for the objs. */
+	if (!(objp = kmem_getpages(cachep, GFP_KERNEL, nodeid)))
+		goto failed;
+
+	/* Get slab management. */
+	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, GFP_KERNEL)))
+		goto opps1;
+
+	set_slab_attr(cachep, slabp, objp);
+	cache_init_objs(cachep, slabp, SLAB_CTOR_CONSTRUCTOR);
+
+	/* The first object is ours: */
+	objp = slabp->s_mem + slabp->free*cachep->objsize;
+	slabp->inuse++;
+	next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+	slabp->free = next;
+
+	/* add the remaining objects into the cache */
+	spin_lock_irq(&cachep->spinlock);
+	check_slabp(cachep, slabp);
+	STATS_INC_GROWN(cachep);
+	/* Make slab active. */
+	if (slabp->free == BUFCTL_END) {
+		list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_full));
+	} else {
+		list_add_tail(&slabp->list,
+				&(list3_data(cachep)->slabs_partial));
+		list3_data(cachep)->free_objects += cachep->num-1;
+	}
+	spin_unlock_irq(&cachep->spinlock);
+	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
+					__builtin_return_address(0));
+	return objp;
+opps1:
+	kmem_freepages(cachep, objp);
+failed:
+	return NULL;
+
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+/**
  * kmalloc - allocate memory
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
@@ -2302,7 +2407,10 @@ void *__alloc_percpu(size_t size, size_t
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_possible(i))
 			continue;
-		pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+		pdata->ptrs[i] = kmem_cache_alloc_node(
+				kmem_find_general_cachep(size, GFP_KERNEL),
+				cpu_to_node(i));
+
 		if (!pdata->ptrs[i])
 			goto unwind_oom;
 		memset(pdata->ptrs[i], 0, size);
@@ -2441,19 +2549,15 @@ static int do_tune_cpucache (kmem_cache_
 
 	memset(&new.new,0,sizeof(new.new));
 	for (i = 0; i < NR_CPUS; i++) {
-		struct array_cache *ccnew;
-
-		ccnew = kmalloc(sizeof(void*)*limit+
-				sizeof(struct array_cache), GFP_KERNEL);
-		if (!ccnew) {
-			for (i--; i >= 0; i--) kfree(new.new[i]);
-			return -ENOMEM;
-		}
-		ccnew->avail = 0;
-		ccnew->limit = limit;
-		ccnew->batchcount = batchcount;
-		ccnew->touched = 0;
-		new.new[i] = ccnew;
+		if (cpu_online(i)) {
+			new.new[i] = alloc_arraycache(i, limit, batchcount);
+			if (!new.new[i]) {
+				for (i--; i >= 0; i--) kfree(new.new[i]);
+				return -ENOMEM;
+			}
+		} else {
+			new.new[i] = NULL;
+		}
 	}
 	new.cachep = cachep;
 
@@ -2475,14 +2579,9 @@ static int do_tune_cpucache (kmem_cache_
 		spin_unlock_irq(&cachep->spinlock);
 		kfree(ccold);
 	}
-	new_shared = kmalloc(sizeof(void*)*batchcount*shared+
-				sizeof(struct array_cache), GFP_KERNEL);
+	new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
 	if (new_shared) {
 		struct array_cache *old;
-		new_shared->avail = 0;
-		new_shared->limit = batchcount*shared;
-		new_shared->batchcount = 0xbaadf00d;
-		new_shared->touched = 0;
 
 		spin_lock_irq(&cachep->spinlock);
 		old = cachep->lists.shared;

_
