
From: Oleg Nesterov <oleg@tv-sign.ru>

There are problems with del_timer_sync().

- Scalability.  Every CPU must be scanned to determine whether the timer is
  currently running on it.

- It is racy.  The timer can fire again after del_timer_sync() has checked
  all CPUs but before it rechecks timer_pending() (see the timeline sketched
  below).
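
Schematically, assuming the pre-patch del_timer_sync() structure (delete the
timer, scan every CPU's base for a running handler, then recheck
timer_pending()), the race looks like this:

	/*
	 * CPU0: del_timer_sync()		CPU1
	 * ----------------------		----
	 * del_timer(timer)
	 * scan all bases: handler not
	 * running anywhere
	 *					timer is re-armed and fires;
	 *					the handler starts running
	 * timer_pending() == 0, return
	 *					... handler still running
	 */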

This patch adds a 'pending' flag to timer_list.  The flag is encoded in the
least significant bit of timer->base, so no space is wasted and base+pending
can be read/written atomically.
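
As an illustration only (not code from this patch, whose helpers are still
trivial), the encoding described above could look like this in terms of the
__get_base()/__set_base()/__timer_pending() helpers; it relies on
tvec_base_t being at least two-byte aligned, so bit 0 of the pointer is
always free:

	/* Sketch of the intended encoding: bit 0 of ->_base holds the
	 * pending flag, the remaining bits hold the real base pointer.
	 */
	static inline int __timer_pending(struct tvec_t_base_s *base)
	{
		return (unsigned long)base & 1;
	}

	static inline tvec_base_t *__get_base(struct timer_list *timer)
	{
		return (tvec_base_t *)((unsigned long)timer->_base & ~1UL);
	}

	static inline void __set_base(struct timer_list *timer,
					tvec_base_t *base, int pending)
	{
		timer->_base = (tvec_base_t *)
				((unsigned long)base | !!pending);
	}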

__run_timers() and del_timer() no longer set ->base = NULL; they only clear
the pending flag, so that del_timer_sync() can wait while
timer->base->running_timer == timer.

This works only if recurring timers do not use add_timer_on().
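
For context, one possible shape of del_timer_sync() under this scheme (a
rough sketch, not part of this patch; the memory barriers needed around the
lock-free checks are omitted):

	int del_timer_sync(struct timer_list *timer)
	{
		int ret = 0;

		for (;;) {
			tvec_base_t *base;

			ret += del_timer(timer);  /* clears the pending flag */
			base = __get_base(timer);
			if (!base)		  /* timer was never activated */
				break;
			if (base->running_timer != timer && !timer_pending(timer))
				break;		  /* not running and not re-armed */
			cpu_relax();
		}
		return ret;
	}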

The patch below renames ->base to ->_base.  Within kernel/timer.c the field
is then accessed directly only via the __get_base()/__set_base() helpers.

No changes in the generated kernel/timer.o.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/timer.h |   13 +++++++++----
 25-akpm/kernel/timer.c        |   36 +++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 15 deletions(-)

diff -puN include/linux/timer.h~timers-prepare-for-del_timer_sync-changes include/linux/timer.h
--- 25/include/linux/timer.h~timers-prepare-for-del_timer_sync-changes	2005-03-19 14:19:49.000000000 -0800
+++ 25-akpm/include/linux/timer.h	2005-03-19 14:19:49.000000000 -0800
@@ -18,16 +18,21 @@ struct timer_list {
 	void (*function)(unsigned long);
 	unsigned long data;
 
-	struct tvec_t_base_s *base;
+	struct tvec_t_base_s *_base;
 };
 
+static inline int __timer_pending(struct tvec_t_base_s *base)
+{
+	return base != NULL;
+}
+
 #define TIMER_MAGIC	0x4b87ad6e
 
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = NULL,					\
+		._base = NULL,					\
 		.magic = TIMER_MAGIC,				\
 		.lock = SPIN_LOCK_UNLOCKED,			\
 	}
@@ -41,7 +46,7 @@ struct timer_list {
  */
 static inline void init_timer(struct timer_list * timer)
 {
-	timer->base = NULL;
+	timer->_base = NULL;
 	timer->magic = TIMER_MAGIC;
 	spin_lock_init(&timer->lock);
 }
@@ -58,7 +63,7 @@ static inline void init_timer(struct tim
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->base != NULL;
+	return __timer_pending(timer->_base);
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
diff -puN kernel/timer.c~timers-prepare-for-del_timer_sync-changes kernel/timer.c
--- 25/kernel/timer.c~timers-prepare-for-del_timer_sync-changes	2005-03-19 14:19:49.000000000 -0800
+++ 25-akpm/kernel/timer.c	2005-03-19 14:19:49.000000000 -0800
@@ -86,6 +86,20 @@ static inline void set_running_timer(tve
 #endif
 }
 
+static inline tvec_base_t *__get_base(struct timer_list *timer)
+{
+	return timer->_base;
+}
+
+static inline void __set_base(struct timer_list *timer,
+				tvec_base_t *base, int pending)
+{
+	if (pending)
+		timer->_base = base;
+	else
+		timer->_base = NULL;
+}
+
 /* Fake initialization */
 static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
 
@@ -169,7 +183,7 @@ int __mod_timer(struct timer_list *timer
 	spin_lock_irqsave(&timer->lock, flags);
 	new_base = &__get_cpu_var(tvec_bases);
 repeat:
-	old_base = timer->base;
+	old_base = __get_base(timer);
 
 	/*
 	 * Prevent deadlocks via ordering by old_base < new_base.
@@ -186,14 +200,14 @@ repeat:
 		 * The timer base might have been cancelled while we were
 		 * trying to take the lock(s):
 		 */
-		if (timer->base != old_base) {
+		if (__get_base(timer) != old_base) {
 			spin_unlock(&new_base->lock);
 			spin_unlock(&old_base->lock);
 			goto repeat;
 		}
 	} else {
 		spin_lock(&new_base->lock);
-		if (timer->base != old_base) {
+		if (__get_base(timer) != old_base) {
 			spin_unlock(&new_base->lock);
 			goto repeat;
 		}
@@ -209,7 +223,7 @@ repeat:
 	}
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	timer->base = new_base;
+	__set_base(timer, new_base, 1);
 
 	if (old_base && (new_base != old_base))
 		spin_unlock(&old_base->lock);
@@ -239,7 +253,7 @@ void add_timer_on(struct timer_list *tim
 
 	spin_lock_irqsave(&base->lock, flags);
 	internal_add_timer(base, timer);
-	timer->base = base;
+	__set_base(timer, base, 1);
 	spin_unlock_irqrestore(&base->lock, flags);
 }
 
@@ -301,18 +315,18 @@ int del_timer(struct timer_list *timer)
 	check_timer(timer);
 
 repeat:
- 	base = timer->base;
+ 	base = __get_base(timer);
 	if (!base)
 		return 0;
 	spin_lock_irqsave(&base->lock, flags);
-	if (base != timer->base) {
+	if (base != __get_base(timer)) {
 		spin_unlock_irqrestore(&base->lock, flags);
 		goto repeat;
 	}
 	list_del(&timer->entry);
 	/* Need to make sure that anybody who sees a NULL base also sees the list ops */
 	smp_wmb();
-	timer->base = NULL;
+	__set_base(timer, base, 0);
 	spin_unlock_irqrestore(&base->lock, flags);
 
 	return 1;
@@ -415,7 +429,7 @@ static int cascade(tvec_base_t *base, tv
 		struct timer_list *tmp;
 
 		tmp = list_entry(curr, struct timer_list, entry);
-		BUG_ON(tmp->base != base);
+		BUG_ON(__get_base(tmp) != base);
 		curr = curr->next;
 		internal_add_timer(base, tmp);
 	}
@@ -465,7 +479,7 @@ repeat:
 			list_del(&timer->entry);
 			set_running_timer(base, timer);
 			smp_wmb();
-			timer->base = NULL;
+			__set_base(timer, base, 0);
 			spin_unlock_irq(&base->lock);
 			{
 				u32 preempt_count = preempt_count();
@@ -1315,7 +1329,7 @@ static int migrate_timer_list(tvec_base_
 			return 0;
 		list_del(&timer->entry);
 		internal_add_timer(new_base, timer);
-		timer->base = new_base;
+		__set_base(timer, new_base, 1);
 		spin_unlock(&timer->lock);
 	}
 	return 1;
_
