Makefile                            |    2 +-
 arch/sparc64/Kconfig                |    2 +
 arch/sparc64/kernel/smp.c           |    1 +
 arch/sparc64/kernel/systbls.S       |   20 +-
 arch/x86/Kconfig                    |    2 +
 arch/x86/kernel/Makefile_32         |    3 +
 arch/x86/kernel/ft_event.c          |  104 +++++
 arch/x86/kernel/smp_32.c            |    1 +
 arch/x86/kernel/syscall_table_32.S  |   16 +
 fs/exec.c                           |    3 +
 fs/inode.c                          |    2 +
 include/asm-sparc64/feather_trace.h |   22 +
 include/asm-sparc64/unistd.h        |    6 +-
 include/asm-x86/feather_trace.h     |  104 +++++
 include/asm-x86/unistd_32.h         |    6 +-
 include/linux/completion.h          |    2 +-
 include/linux/fs.h                  |    5 +
 include/linux/sched.h               |   11 +
 include/linux/tick.h                |    3 +
 include/linux/uaccess.h             |   16 +
 include/litmus/edf_common.h         |   25 +
 include/litmus/fdso.h               |   69 +++
 include/litmus/feather_buffer.h     |   94 ++++
 include/litmus/feather_trace.h      |   37 ++
 include/litmus/heap.h               |  301 +++++++++++++
 include/litmus/jobs.h               |    9 +
 include/litmus/litmus.h             |  205 +++++++++
 include/litmus/norqlock.h           |   26 ++
 include/litmus/rt_domain.h          |  127 ++++++
 include/litmus/rt_param.h           |  170 +++++++
 include/litmus/sched_plugin.h       |  144 ++++++
 include/litmus/sched_trace.h        |   31 ++
 include/litmus/trace.h              |  107 +++++
 include/litmus/unistd.h             |   20 +
 kernel/exit.c                       |    4 +
 kernel/fork.c                       |    8 +
 kernel/printk.c                     |   10 +-
 kernel/sched.c                      |   93 ++++-
 kernel/sched_fair.c                 |    2 +-
 kernel/sched_rt.c                   |    2 +-
 kernel/time/tick-sched.c            |   37 ++-
 litmus/Kconfig                      |   78 ++++
 litmus/Makefile                     |   14 +
 litmus/edf_common.c                 |   94 ++++
 litmus/fdso.c                       |  282 ++++++++++++
 litmus/fmlp.c                       |  262 +++++++++++
 litmus/ft_event.c                   |   43 ++
 litmus/jobs.c                       |   43 ++
 litmus/litmus.c                     |  826 +++++++++++++++++++++++++++++++++++
 litmus/norqlock.c                   |   56 +++
 litmus/rt_domain.c                  |  138 ++++++
 litmus/sched_cedf.c                 |  717 ++++++++++++++++++++++++++++++
 litmus/sched_gsn_edf.c              |  731 +++++++++++++++++++++++++++++++
 litmus/sched_litmus.c               |  228 ++++++++++
 litmus/sched_pfair.c                |  785 +++++++++++++++++++++++++++++++++
 litmus/sched_plugin.c               |  185 ++++++++
 litmus/sched_psn_edf.c              |  454 +++++++++++++++++++
 litmus/sched_trace.c                |  569 ++++++++++++++++++++++++
 litmus/srp.c                        |  318 ++++++++++++++
 litmus/sync.c                       |   86 ++++
 litmus/trace.c                      |  335 ++++++++++++++
 61 files changed, 8075 insertions(+), 21 deletions(-)

diff --git a/Makefile b/Makefile
index 189d8ef..d9e4495 100644
--- a/Makefile
+++ b/Makefile
@@ -597,7 +597,7 @@ export mod_strip_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 10b212a..8d90b5a 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -471,3 +471,5 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
+
+source "litmus/Kconfig"
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index c399449..cd2bc7e 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1033,6 +1033,7 @@ void smp_receive_signal(int cpu)
 void smp_receive_signal_client(int irq, struct pt_regs *regs)
 {
 	clear_softint(1 << irq);
+	set_tsk_need_resched(current);
 }
 
 void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index 06d1090..7fc7615 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -82,6 +82,13 @@ sys_call_table32:
 	.word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait
 /*310*/	.word compat_sys_utimensat, compat_sys_signalfd, compat_sys_timerfd, sys_eventfd, compat_sys_fallocate
 
+/*LITMUS, 315*/
+	.word sys_set_rt_task_param, sys_get_rt_task_param, sys_complete_job, sys_register_np_flag, sys_exit_np
+/*320*/
+	.word sys_od_open, sys_od_close, sys_fmlp_down, sys_fmlp_up, sys_srp_down
+/*325*/ .word sys_srp_up, sys_query_job_no, sys_wait_for_job_release, sys_wait_for_ts_release, sys_release_ts
+
+
 #endif /* CONFIG_COMPAT */
 
 	/* Now the 64-bit native Linux syscall table. */
@@ -154,6 +161,12 @@ sys_call_table:
 	.word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait
 /*310*/	.word sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate
 
+/*LITMUS, 315*/
+	.word sys_set_rt_task_param, sys_get_rt_task_param, sys_complete_job, sys_register_np_flag, sys_exit_np
+/*320*/
+	.word sys_od_open, sys_od_close, sys_fmlp_down, sys_fmlp_up, sys_srp_down
+/*325*/ .word sys_srp_up, sys_query_job_no, sys_wait_for_job_release, sys_wait_for_ts_release, sys_release_ts
+	
 #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \
     defined(CONFIG_SOLARIS_EMUL_MODULE)
 	/* Now the 32-bit SunOS syscall table. */
@@ -271,6 +284,11 @@ sunos_sys_table:
 	.word sunos_nosys, sunos_nosys, sunos_nosys
 	.word sunos_nosys
 /*310*/	.word sunos_nosys, sunos_nosys, sunos_nosys
-	.word sunos_nosys, sunos_nosys
+	.word sunos_nosys, sunos_nosys, sunos_nosys
+	.word sunos_nosys, sunos_nosys, sunos_nosys
+	.word sunos_nosys
+/*320*/ .word sunos_nosys, sunos_nosys, sunos_nosys
+	.word sunos_nosys, sunos_nosys, sunos_nosys
+	.word sunos_nosys, sunos_nosys, sunos_nosys
 
 #endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 80b7ba4..f99330f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1620,3 +1620,5 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
+
+source "litmus/Kconfig"
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
index a7bc93c..5f87f32 100644
--- a/arch/x86/kernel/Makefile_32
+++ b/arch/x86/kernel/Makefile_32
@@ -49,6 +49,9 @@ obj-y				+= pcspeaker.o
 
 obj-$(CONFIG_SCx200)		+= scx200_32.o
 
+obj-$(CONFIG_FEATHER_TRACE)	+= ft_event.o
+
+
 # vsyscall_32.o contains the vsyscall DSO images as __initdata.
 # We must build both images before we can assemble it.
 # Note: kbuild does not track this dependency due to usage of .incbin
diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c
new file mode 100644
index 0000000..b1d80c5
--- /dev/null
+++ b/arch/x86/kernel/ft_event.c
@@ -0,0 +1,104 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+/* the feather trace management functions assume
+ * exclusive access to the event table
+ */
+
+
+#define BYTE_JUMP      0xeb
+#define BYTE_JUMP_LEN  0x02
+
+/* for each event, there is an entry in the event table */
+struct trace_event {
+	long 	id;
+	long	count;
+	long	start_addr;
+	long	end_addr;
+};
+
+extern struct trace_event  __start___event_table[];
+extern struct trace_event  __stop___event_table[];
+
+int ft_enable_event(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->id == id && ++te->count == 1) {
+			instr  = (unsigned char*) te->start_addr;
+			/* make sure we don't clobber something wrong */
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr) + 1);
+				*delta = 0;
+			}
+		}
+		if (te->id == id)
+			count++;
+		te++;
+	}
+	return count;
+}
+
+int ft_disable_event(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->id == id && --te->count == 0) {
+			instr  = (unsigned char*) te->start_addr;
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr) + 1);
+				*delta = te->end_addr - te->start_addr -
+					BYTE_JUMP_LEN;
+			}
+		}
+		if (te->id == id)
+			count++;
+		te++;
+	}
+	return count;
+}
+
+int ft_disable_all_events(void)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->count) {
+			instr  = (unsigned char*) te->start_addr;
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr)
+					  + 1);
+				*delta = te->end_addr - te->start_addr -
+					BYTE_JUMP_LEN;
+				te->count = 0;
+				count++;
+			}
+		}
+		te++;
+	}
+	return count;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+
+	while (te < __stop___event_table) {
+		if (te->id == id)
+			return te->count;
+		te++;
+	}
+	return 0;
+}
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index fcaa026..1063dfc 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -641,6 +641,7 @@ static void native_smp_send_stop(void)
 fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
+	set_tsk_need_resched(current);
 	__get_cpu_var(irq_stat).irq_resched_count++;
 }
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8344c70..f6fdb0a 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -324,3 +324,19 @@ ENTRY(sys_call_table)
 	.long sys_timerfd
 	.long sys_eventfd
 	.long sys_fallocate
+	/* LITMUS */
+	.long sys_set_rt_task_param	/* 325 */
+	.long sys_get_rt_task_param
+	.long sys_complete_job
+	.long sys_register_np_flag
+	.long sys_exit_np
+	.long sys_od_open		/* 330 */
+	.long sys_od_close
+	.long sys_fmlp_down
+	.long sys_fmlp_up
+	.long sys_srp_down
+	.long sys_srp_up		/* 335 */
+	.long sys_query_job_no
+	.long sys_wait_for_job_release
+	.long sys_wait_for_ts_release
+	.long sys_release_ts		/* 339 */
diff --git a/fs/exec.c b/fs/exec.c
index 282240a..6f47786 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,8 @@
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
 
+#include <litmus/litmus.h>
+
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
@@ -1309,6 +1311,7 @@ int do_execve(char * filename,
 		goto out_kfree;
 
 	sched_exec();
+	litmus_exec();
 
 	bprm->file = file;
 	bprm->filename = filename;
diff --git a/fs/inode.c b/fs/inode.c
index ed35383..ef71ea0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -220,6 +220,8 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+	INIT_LIST_HEAD(&inode->i_obj_list);
+	mutex_init(&inode->i_obj_mutex);
 }
 
 EXPORT_SYMBOL(inode_init_once);
diff --git a/include/asm-sparc64/feather_trace.h b/include/asm-sparc64/feather_trace.h
new file mode 100644
index 0000000..35ec70f
--- /dev/null
+++ b/include/asm-sparc64/feather_trace.h
@@ -0,0 +1,22 @@
+#ifndef _ARCH_FEATHER_TRACE_H
+#define _ARCH_FEATHER_TRACE_H
+
+#include <asm/atomic.h>
+#include <asm/timex.h>
+
+static inline int  fetch_and_inc(int *val)
+{
+	return atomic_add_ret(1, (atomic_t*) val) - 1;
+}
+
+static inline int  fetch_and_dec(int *val)
+{
+	return atomic_sub_ret(1, (atomic_t*) val) + 1;
+}
+
+static inline unsigned long long ft_timestamp(void)
+{
+	return get_cycles();
+}
+
+#endif
diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h
index cb751b4..ebebde6 100644
--- a/include/asm-sparc64/unistd.h
+++ b/include/asm-sparc64/unistd.h
@@ -333,7 +333,11 @@
 #define __NR_eventfd		313
 #define __NR_fallocate		314
 
-#define NR_SYSCALLS		315
+#define __NR_LITMUS		315
+
+#include "litmus/unistd.h"
+
+#define NR_SYSCALLS		315 + NR_litmus_syscalls
 
 #ifdef __KERNEL__
 /* sysconf options, for SunOS compatibility */
diff --git a/include/asm-x86/feather_trace.h b/include/asm-x86/feather_trace.h
new file mode 100644
index 0000000..253067e
--- /dev/null
+++ b/include/asm-x86/feather_trace.h
@@ -0,0 +1,104 @@
+#ifndef _ARCH_FEATHER_TRACE_H
+#define _ARCH_FEATHER_TRACE_H
+
+static inline int  fetch_and_inc(int *val)
+{
+	int ret = 1;
+	__asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
+	return ret;
+}
+
+static inline int  fetch_and_dec(int *val)
+{
+	int ret = -1;
+	__asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
+	return ret;
+}
+
+#define feather_callback __attribute__((regparm(0)))
+
+/* make the compiler reload any register that is not saved in
+ * a cdecl function call
+ */
+#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
+
+#define ft_event(id, callback)                                  \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " call " #callback "                          \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+        : : : CLOBBER_LIST)
+
+#define ft_event0(id, callback)                                 \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " subl $4, %%esp                              \n\t" \
+            " movl $" #id  ", (%%esp)                     \n\t" \
+	    " call " #callback "                          \n\t" \
+	    " addl $4, %%esp                              \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+        : :  : CLOBBER_LIST)
+
+#define ft_event1(id, callback, param)                          \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " subl $8, %%esp                              \n\t" \
+	    " movl %0, 4(%%esp)                           \n\t" \
+            " movl $" #id  ", (%%esp)                     \n\t" \
+	    " call " #callback "                          \n\t" \
+	    " addl $8, %%esp                              \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+        : : "r" (param)  : CLOBBER_LIST)
+
+#define ft_event2(id, callback, param, param2)                  \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " subl $12, %%esp                             \n\t" \
+	    " movl %1, 8(%%esp)                           \n\t" \
+	    " movl %0, 4(%%esp)                           \n\t" \
+            " movl $" #id  ", (%%esp)                     \n\t" \
+	    " call " #callback "                          \n\t" \
+	    " addl $12, %%esp                             \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+        : : "r" (param), "r" (param2)  : CLOBBER_LIST)
+
+
+#define ft_event3(id, callback, p, p2, p3)                      \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " subl $16, %%esp                             \n\t" \
+	    " movl %1, 12(%%esp)                          \n\t" \
+	    " movl %1, 8(%%esp)                           \n\t" \
+	    " movl %0, 4(%%esp)                           \n\t" \
+            " movl $" #id  ", (%%esp)                     \n\t" \
+	    " call " #callback "                          \n\t" \
+	    " addl $16, %%esp                             \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+        : : "r" (p), "r" (p2), "r" (p3)  : CLOBBER_LIST)
+
+
+static inline unsigned long long ft_timestamp(void)
+{
+	unsigned long long ret;
+	__asm__ __volatile__("rdtsc" : "=A" (ret));
+	return ret;
+}
+
+#define __ARCH_HAS_FEATHER_TRACE
+
+#endif
diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h
index 9b15545..36fec84 100644
--- a/include/asm-x86/unistd_32.h
+++ b/include/asm-x86/unistd_32.h
@@ -331,9 +331,13 @@
 #define __NR_eventfd		323
 #define __NR_fallocate		324
 
+#define __NR_LITMUS		325
+
+#include "litmus/unistd.h"
+
 #ifdef __KERNEL__
 
-#define NR_syscalls 325
+#define NR_syscalls 324 + NR_litmus_syscalls
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 33d6aaf..5b55e97 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -51,7 +51,7 @@ extern unsigned long wait_for_completion_interruptible_timeout(
 
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
-
+extern void complete_n(struct completion *, int n);
 #define INIT_COMPLETION(x)	((x).done = 0)
 
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..22f856c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -588,6 +588,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 #define i_size_ordered_init(inode) do { } while (0)
 #endif
 
+struct inode_obj_id_table;
+
 struct inode {
 	struct hlist_node	i_hash;
 	struct list_head	i_list;
@@ -653,6 +655,9 @@ struct inode {
 	void			*i_security;
 #endif
 	void			*i_private; /* fs or device private pointer */
+
+	struct list_head	i_obj_list;
+	struct mutex		i_obj_mutex;
 };
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cc14656..76e28f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -37,6 +37,7 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+#define SCHED_LITMUS		6
 
 #ifdef __KERNEL__
 
@@ -91,6 +92,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#include <litmus/rt_param.h>
+
 struct exec_domain;
 struct futex_pi_state;
 struct bio;
@@ -914,6 +917,8 @@ struct sched_entity {
 #endif
 };
 
+struct od_table_entry;
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1178,6 +1183,12 @@ struct task_struct {
 	int make_it_fail;
 #endif
 	struct prop_local_single dirties;
+
+	/* litmus parameters and state */
+	struct rt_param rt_param;
+
+	/* references to PI semaphores, etc. */
+	struct od_table_entry* od_table;
 };
 
 /*
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f4a1395..7eae358 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -64,6 +64,9 @@ extern int tick_is_oneshot_available(void);
 extern struct tick_device *tick_get_device(int cpu);
 
 # ifdef CONFIG_HIGH_RES_TIMERS
+#define LINUX_DEFAULT_TICKS	0
+#define LITMUS_ALIGNED_TICKS	1
+#define LITMUS_STAGGERED_TICKS	2
 extern int tick_init_highres(void);
 extern int tick_program_event(ktime_t expires, int force);
 extern void tick_setup_sched_timer(void);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 975c963..6ae0ff9 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
 		ret;					\
 	})
 
+/* This is a naive attempt at a write version of the above native Linux macro.
+ */
+#define poke_kernel_address(val, addr)			\
+	({						\
+		long ret;				\
+		mm_segment_t old_fs = get_fs();		\
+							\
+		set_fs(KERNEL_DS);			\
+		pagefault_disable();			\
+		ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
+		pagefault_enable();			\
+		set_fs(old_fs);				\
+		ret;					\
+	})
+
+
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 0000000..4dff77a
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,25 @@
+/* EDF common data structures and utility functions shared by all EDF
+ * based scheduler plugins
+ */
+
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_EDF_COMMON_H__
+#define __UNC_EDF_COMMON_H__
+
+#include <litmus/rt_domain.h>
+
+
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, 
+		     release_job_t release);
+
+int edf_higher_prio(struct task_struct* first,
+		    struct task_struct* second);
+
+int  edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
+
+int edf_set_hp_task(struct pi_semaphore *sem);
+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
+#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 0000000..286e10f
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,69 @@
+/* fdso.h - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ */
+
+#ifndef _LINUX_FDSO_H_
+#define _LINUX_FDSO_H_
+
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+#include <linux/fs.h>
+
+#define MAX_OBJECT_DESCRIPTORS 32
+
+typedef enum  {
+	MIN_OBJ_TYPE 	= 0,
+
+	FMLP_SEM	= 0,
+	SRP_SEM		= 1,
+
+	MAX_OBJ_TYPE	= 1
+} obj_type_t;
+
+struct inode_obj_id {
+	struct list_head	list;
+	atomic_t		count;
+	struct inode*		inode;
+
+	obj_type_t 		type;
+	void*			obj;
+	unsigned int		id;
+};
+
+
+struct od_table_entry {
+	unsigned int		used;
+
+	struct inode_obj_id*	obj;
+	void*			extra;
+};
+
+struct fdso_ops {
+	void* (*create)	(void);
+	void  (*destroy)(void*);
+	int   (*open)	(struct od_table_entry*, void* __user);
+	int   (*close)	(struct od_table_entry*);
+};
+
+/* translate a userspace supplied od into the raw table entry
+ * returns NULL if od is invalid
+ */
+struct od_table_entry* __od_lookup(int od);
+
+/* translate a userspace supplied od into the associated object
+ * returns NULL if od is invalid
+ */
+static inline void* od_lookup(int od, obj_type_t type)
+{
+	struct od_table_entry* e = __od_lookup(od);
+	return e && e->obj->type == type ? e->obj->obj : NULL;
+}
+
+#define lookup_fmlp_sem(od)((struct pi_semaphore*)  od_lookup(od, FMLP_SEM))
+#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
+#define lookup_ics(od)     ((struct ics*)           od_lookup(od, ICS_ID))
+
+
+#endif
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 0000000..6c18277
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,94 @@
+#ifndef _FEATHER_BUFFER_H_
+#define _FEATHER_BUFFER_H_
+
+/* requires UINT_MAX and memcpy */
+
+#define SLOT_FREE	0
+#define	SLOT_BUSY 	1
+#define	SLOT_READY	2
+
+struct ft_buffer {
+	unsigned int	slot_count;
+	unsigned int	slot_size;
+
+	int 		free_count;
+	unsigned int 	write_idx;
+	unsigned int 	read_idx;
+
+	char*		slots;
+	void*		buffer_mem;
+	unsigned int	failed_writes;
+};
+
+static inline int init_ft_buffer(struct ft_buffer*	buf,
+				 unsigned int 		slot_count,
+				 unsigned int 		slot_size,
+				 char*			slots,
+				 void* 			buffer_mem)
+{
+	int i = 0;
+	if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
+		/* The slot count must divide UNIT_MAX + 1 so that when it
+		 * wraps around the index correctly points to 0.
+		 */
+		return 0;
+	} else {
+		buf->slot_count    = slot_count;
+		buf->slot_size     = slot_size;
+		buf->slots         = slots;
+		buf->buffer_mem    = buffer_mem;
+		buf->free_count    = slot_count;
+		buf->write_idx     = 0;
+		buf->read_idx      = 0;
+		buf->failed_writes = 0;
+		for (i = 0; i < slot_count; i++)
+			buf->slots[i] = SLOT_FREE;
+		return 1;
+	}
+}
+
+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
+{
+	int free = fetch_and_dec(&buf->free_count);
+	unsigned int idx;
+	if (free <= 0) {
+		fetch_and_inc(&buf->free_count);
+		*ptr = 0;
+		fetch_and_inc(&buf->failed_writes);
+		return 0;
+	} else {
+		idx  = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
+		buf->slots[idx] = SLOT_BUSY;
+		*ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
+		return 1;
+	}
+}
+
+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
+{
+	unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
+	buf->slots[idx]  = SLOT_READY;
+}
+
+
+/* exclusive reader access is assumed */
+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
+{
+	unsigned int idx;
+	if (buf->free_count == buf->slot_count)
+		/* nothing available */
+		return 0;
+	idx = buf->read_idx % buf->slot_count;
+	if (buf->slots[idx] == SLOT_READY) {
+		memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
+		       buf->slot_size);
+		buf->slots[idx] = SLOT_FREE;
+		buf->read_idx++;
+		fetch_and_inc(&buf->free_count);
+		return 1;
+	} else
+		return 0;
+}
+
+
+#endif
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 0000000..f8fb7ba
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,37 @@
+#ifndef _FEATHER_TRACE_H_
+#define _FEATHER_TRACE_H_
+
+#include <asm/feather_trace.h>
+
+int ft_enable_event(unsigned long id);
+int ft_disable_event(unsigned long id);
+int ft_is_event_enabled(unsigned long id);
+int ft_disable_all_events(void);
+
+#ifndef __ARCH_HAS_FEATHER_TRACE
+/* provide default implementation */
+
+#define feather_callback
+
+#define MAX_EVENTS 1024
+
+extern int ft_events[MAX_EVENTS];
+
+#define ft_event(id, callback) \
+	if (ft_events[id]) callback();
+
+#define ft_event0(id, callback) \
+	if (ft_events[id]) callback(id);
+
+#define ft_event1(id, callback, param) \
+	if (ft_events[id]) callback(id, param);
+
+#define ft_event2(id, callback, param, param2) \
+	if (ft_events[id]) callback(id, param, param2);
+
+#define ft_event3(id, callback, p, p2, p3) \
+	if (ft_events[id]) callback(id, p, p2, p3);
+#endif
+
+
+#endif
diff --git a/include/litmus/heap.h b/include/litmus/heap.h
new file mode 100644
index 0000000..b26804f
--- /dev/null
+++ b/include/litmus/heap.h
@@ -0,0 +1,301 @@
+/* heaps.h -- Binomial Heaps
+ *
+ * (c) 2008 Bjoern Brandenburg
+ */
+
+#ifndef HEAP_H
+#define HEAP_H
+
+#define NOT_IN_HEAP UINT_MAX
+
+struct heap_node {
+	struct heap_node* 	parent;
+	struct heap_node* 	next;
+	struct heap_node* 	child;
+
+	unsigned int 		degree;
+	void*			value;
+	struct heap_node**	ref;
+};
+
+struct heap {
+	struct heap_node* 	head;
+	/* We cache the minimum of the heap.
+	 * This speeds up repeated peek operations.
+	 */
+	struct heap_node*	min;
+};
+
+typedef int (*heap_prio_t)(struct heap_node* a, struct heap_node* b);
+
+static inline void heap_init(struct heap* heap)
+{
+	heap->head = NULL;
+	heap->head = NULL;
+}
+
+static inline void heap_node_init(struct heap_node** _h, void* value)
+{
+	struct heap_node* h = *_h;
+	h->parent = NULL;
+	h->next   = NULL;
+	h->child  = NULL;
+	h->degree = NOT_IN_HEAP;
+	h->value  = value;
+	h->ref    = _h;
+}
+
+static inline int heap_node_in_heap(struct heap_node* h)
+{
+	return h->degree != NOT_IN_HEAP;
+}
+
+static inline int heap_empty(struct heap* heap)
+{
+	return heap->head == NULL && heap->min == NULL;
+}
+
+/* make child a subtree of root */
+static inline void __heap_link(struct heap_node* root,
+			       struct heap_node* child)
+{
+	child->parent = root;
+	child->next   = root->child;
+	root->child   = child;
+	root->degree++;
+}
+
+/* merge root lists */
+static inline struct heap_node* __heap_merge(struct heap_node* a,
+					     struct heap_node* b)
+{
+	struct heap_node* head = NULL;
+	struct heap_node** pos = &head;
+
+	while (a && b) {
+		if (a->degree < b->degree) {
+			*pos = a;
+			a = a->next;
+		} else {
+			*pos = b;
+			b = b->next;
+		}
+		pos = &(*pos)->next;
+	}
+	if (a)
+		*pos = a;
+	else
+		*pos = b;
+	return head;
+}
+
+/* reverse a linked list of nodes. also clears parent pointer */
+static inline struct heap_node* __heap_reverse(struct heap_node* h)
+{
+	struct heap_node* tail = NULL;
+	struct heap_node* next;
+
+	if (!h)
+		return h;
+
+	h->parent = NULL;
+	while (h->next) {
+		next    = h->next;
+		h->next = tail;
+		tail    = h;
+		h       = next;
+		h->parent = NULL;
+	}
+	h->next = tail;
+	return h;
+}
+
+static inline void __heap_min(heap_prio_t higher_prio, struct heap* heap,
+			      struct heap_node** prev, struct heap_node** node)
+{
+	struct heap_node *_prev, *cur;
+	*prev = NULL;
+
+	if (!heap->head) {
+		*node = NULL;
+		return;
+	}
+
+	*node = heap->head;
+	_prev = heap->head;
+	cur   = heap->head->next;
+	while (cur) {
+		if (higher_prio(cur, *node)) {
+			*node = cur;
+			*prev = _prev;
+		}
+		_prev = cur;
+		cur   = cur->next;
+	}
+}
+
+static inline void __heap_union(heap_prio_t higher_prio, struct heap* heap,
+				struct heap_node* h2)
+{
+	struct heap_node* h1;
+	struct heap_node *prev, *x, *next;
+	if (!h2)
+		return;
+	h1 = heap->head;
+	if (!h1) {
+		heap->head = h2;
+		return;
+	}
+	h1 = __heap_merge(h1, h2);
+	prev = NULL;
+	x    = h1;
+	next = x->next;
+	while (next) {
+		if (x->degree != next->degree ||
+		    (next->next && next->next->degree == x->degree)) {
+			/* nothing to do, advance */
+			prev = x;
+			x    = next;
+		} else if (higher_prio(x, next)) {
+			/* x becomes the root of next */
+			x->next = next->next;
+			__heap_link(x, next);
+		} else {
+			/* next becomes the root of x */
+			if (prev)
+				prev->next = next;
+			else
+				h1 = next;
+			__heap_link(next, x);
+			x = next;
+		}
+		next = x->next;
+	}
+	heap->head = h1;
+}
+
+static inline struct heap_node* __heap_extract_min(heap_prio_t higher_prio,
+						   struct heap* heap)
+{
+	struct heap_node *prev, *node;
+	__heap_min(higher_prio, heap, &prev, &node);
+	if (!node)
+		return NULL;
+	if (prev)
+		prev->next = node->next;
+	else
+		heap->head = node->next;
+	__heap_union(higher_prio, heap, __heap_reverse(node->child));
+	return node;
+}
+
+/* insert (and reinitialize) a node into the heap */
+static inline void heap_insert(heap_prio_t higher_prio, struct heap* heap,
+			       struct heap_node* node)
+{
+	struct heap_node *min;
+	node->child  = NULL;
+	node->parent = NULL;
+	node->next   = NULL;
+	node->degree = 0;
+	if (heap->min && higher_prio(node, heap->min)) {
+		/* swap min cache */
+		min = heap->min;
+		min->child  = NULL;
+		min->parent = NULL;
+		min->next   = NULL;
+		min->degree = 0;
+		__heap_union(higher_prio, heap, min);
+		heap->min   = node;
+	} else
+		__heap_union(higher_prio, heap, node);
+}
+
+static inline void __uncache_min(heap_prio_t higher_prio, struct heap* heap)
+{
+	struct heap_node* min;
+	if (heap->min) {
+		min = heap->min;
+		heap->min = NULL;
+		heap_insert(higher_prio, heap, min);
+	}
+}
+
+/* merge addition into target */
+static inline void heap_union(heap_prio_t higher_prio,
+			      struct heap* target, struct heap* addition)
+{
+	/* first insert any cached minima, if necessary */
+	__uncache_min(higher_prio, target);
+	__uncache_min(higher_prio, addition);
+	__heap_union(higher_prio, target, addition->head);
+	/* this is a destructive merge */
+	addition->head = NULL;
+}
+
+static inline struct heap_node* heap_peek(heap_prio_t higher_prio,
+					  struct heap* heap)
+{
+	if (!heap->min)
+		heap->min = __heap_extract_min(higher_prio, heap);
+	return heap->min;
+}
+
+static inline struct heap_node* heap_take(heap_prio_t higher_prio,
+					  struct heap* heap)
+{
+	struct heap_node *node;
+	if (!heap->min)
+		heap->min = __heap_extract_min(higher_prio, heap);
+	node = heap->min;
+	heap->min = NULL;
+	if (node)
+		node->degree = NOT_IN_HEAP;
+	return node;
+}
+
+static inline void heap_delete(heap_prio_t higher_prio, struct heap* heap,
+			       struct heap_node* node)
+{
+	struct heap_node *parent, *prev, *pos;
+	struct heap_node** tmp_ref;
+	void* tmp;
+
+	if (heap->min != node) {
+		/* bubble up */
+		parent = node->parent;
+		while (parent) {
+			/* swap parent and node */
+			tmp           = parent->value;
+			parent->value = node->value;
+			node->value   = tmp;
+			/* swap references */
+			*(parent->ref) = node;
+			*(node->ref)   = parent;
+			tmp_ref        = parent->ref;
+			parent->ref    = node->ref;
+			node->ref      = tmp_ref;
+			/* step up */
+			node   = parent;
+			parent = node->parent;
+		}
+		/* now delete:
+		 * first find prev */
+		prev = NULL;
+		pos  = heap->head;
+		while (pos != node) {
+			prev = pos;
+			pos  = pos->next;
+		}
+		/* we have prev, now remove node */
+		if (prev)
+			prev->next = node->next;
+		else
+			heap->head = node->next;
+		__heap_union(higher_prio, heap, __heap_reverse(node->child));
+	} else
+		heap->min = NULL;
+	node->degree = NOT_IN_HEAP;
+}
+
+#endif
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 0000000..9bd361e
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,9 @@
+#ifndef __LITMUS_JOBS_H__
+#define __LITMUS_JOBS_H__
+
+void prepare_for_next_period(struct task_struct *t);
+void release_at(struct task_struct *t, lt_t start);
+long complete_job(void);
+
+#endif
+
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 0000000..de2a3c2
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,205 @@
+/*
+ * Constant definitions related to
+ * scheduling policy.
+ */
+
+#ifndef _LINUX_LITMUS_H_
+#define _LINUX_LITMUS_H_
+
+#include <linux/jiffies.h>
+#include <litmus/sched_trace.h>
+
+/*	RT mode start time	*/
+extern volatile unsigned long rt_start_time;
+
+extern atomic_t __log_seq_no;
+
+#define TRACE(fmt, args...) \
+	sched_trace_log_message("%d P%d: " fmt, atomic_add_return(1, &__log_seq_no), \
+				raw_smp_processor_id(), ## args)
+
+#define TRACE_TASK(t, fmt, args...) \
+	TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
+
+#define TRACE_CUR(fmt, args...) \
+	TRACE_TASK(current, fmt, ## args)
+
+#define TRACE_BUG_ON(cond) \
+	do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
+			     "called from %p current=%s/%d state=%d " \
+			     "flags=%x partition=%d cpu=%d rtflags=%d"\
+			     " job=%u knp=%d timeslice=%u\n",		\
+	#cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
+	current->pid, current->state, current->flags,  \
+	get_partition(current), smp_processor_id(), get_rt_flags(current), \
+	current->rt_param.job_params.job_no, current->rt_param.kernel_np, \
+	current->time_slice\
+	); } while(0);
+
+
+/* in_list - is a given list_head queued on some list?
+ */
+static inline int in_list(struct list_head* list)
+{
+	return !(  /* case 1: deleted */
+		   (list->next == LIST_POISON1 &&
+		    list->prev == LIST_POISON2)
+		 ||
+		   /* case 2: initialized */
+		   (list->next == list &&
+		    list->prev == list)
+		);
+}
+
+typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
+
+static inline unsigned int list_insert(struct list_head* new,
+				       struct list_head* head,
+				       list_cmp_t order_before)
+{
+	struct list_head *pos;
+	unsigned int passed = 0;
+
+	BUG_ON(!new);
+
+	/* find a spot where the new entry is less than the next */
+	list_for_each(pos, head) {
+		if (unlikely(order_before(new, pos))) {
+			/* pos is not less than new, thus insert here */
+			__list_add(new, pos->prev, pos);
+			goto out;
+		}
+		passed++;
+	}
+	/* if we get to this point either the list is empty or every entry
+	 * queued element is less than new.
+	 * Let's add new to the end. */
+	list_add_tail(new, head);
+ out:
+	return passed;
+}
+
+void list_qsort(struct list_head* list, list_cmp_t less_than);
+
+
+#define RT_PREEMPTIVE 		0x2050 /* = NP */
+#define RT_NON_PREEMPTIVE 	0x4e50 /* =  P */
+#define RT_EXIT_NP_REQUESTED	0x5251 /* = RQ */
+
+
+/* kill naughty tasks
+ */
+void scheduler_signal(struct task_struct *t, unsigned int signal);
+void send_scheduler_signals(void);
+void np_mem_kill(struct task_struct *t);
+
+void litmus_fork(struct task_struct *tsk);
+void litmus_exec(void);
+/* clean up real-time state of a task */
+void exit_litmus(struct task_struct *dead_tsk);
+
+long litmus_admit_task(struct task_struct *tsk);
+void litmus_exit_task(struct task_struct *tsk);
+
+#define is_realtime(t) 		((t)->policy == SCHED_LITMUS)
+#define rt_transition_pending(t) \
+	((t)->rt_param.transition_pending)
+
+#define tsk_rt(t)		(&(t)->rt_param)
+
+/*	Realtime utility macros */
+#define get_rt_flags(t)		(tsk_rt(t)->flags)
+#define set_rt_flags(t,f) 	(tsk_rt(t)->flags=(f))
+#define get_exec_cost(t)  	(tsk_rt(t)->task_params.exec_cost)
+#define get_exec_time(t)	(tsk_rt(t)->job_params.exec_time)
+#define get_rt_period(t)	(tsk_rt(t)->task_params.period)
+#define get_partition(t) 	(tsk_rt(t)->task_params.cpu)
+#define get_deadline(t)		(tsk_rt(t)->job_params.deadline)
+#define get_class(t)		(tsk_rt(t)->task_params.cls)
+
+inline static int budget_exhausted(struct task_struct* t)
+{
+	return get_exec_time(t) >= get_exec_cost(t);
+}
+
+
+#define is_hrt(t)     		\
+	(tsk_rt(t)->task_params.class == RT_CLASS_HARD)
+#define is_srt(t)     		\
+	(tsk_rt(t)->task_params.class == RT_CLASS_SOFT)
+#define is_be(t)      		\
+	(tsk_rt(t)->task_params.class == RT_CLASS_BEST_EFFORT)
+
+#define get_release(t) (tsk_rt(t)->job_params.release)
+
+/* Our notion of time within LITMUS: kernel monotonic time. */
+static inline lt_t litmus_clock(void)
+{
+	return ktime_to_ns(ktime_get());
+}
+
+/* A macro to convert from nanoseconds to ktime_t. */
+#define ns_to_ktime(t)		ktime_add_ns(ktime_set(0, 0), t)
+
+/* The high-resolution release timer for a task. */
+#define release_timer(t) (tsk_rt(t)->release_timer)
+
+/* The high-resolution release timer for a task. */
+#define get_domain(t) (tsk_rt(t)->domain)
+
+/* Honor the flag in the preempt_count variable that is set
+ * when scheduling is in progress.
+ */
+#define is_running(t) 			\
+	((t)->state == TASK_RUNNING || 	\
+	 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
+
+#define is_blocked(t)       \
+	(!is_running(t))
+#define is_released(t, now)	\
+	(lt_before_eq(get_release(t), now))
+#define is_tardy(t, now)    \
+	(lt_before_eq(tsk_rt(t)->job_params.deadline, now))
+
+/* real-time comparison macros */
+#define earlier_deadline(a, b) (lt_before(\
+	(a)->rt_param.job_params.deadline,\
+	(b)->rt_param.job_params.deadline))
+#define earlier_release(a, b)  (lt_before(\
+	(a)->rt_param.job_params.release,\
+	(b)->rt_param.job_params.release))
+
+#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
+#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
+
+#ifdef CONFIG_SRP
+void srp_ceiling_block(void);
+#else
+#define srp_ceiling_block() /* nothing */
+#endif
+
+#define heap2task(hn) ((struct task_struct*) hn->value)
+
+
+#ifdef CONFIG_NP_SECTION
+/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE
+ */
+int is_np(struct task_struct *t);
+
+/* request that the task should call sys_exit_np()
+ */
+void request_exit_np(struct task_struct *t);
+
+#else
+
+static inline int is_np(struct task_struct *t)
+{
+	return tsk_rt(t)->kernel_np;
+}
+
+#define  request_exit_np(t)
+
+#endif
+
+
+#endif
diff --git a/include/litmus/norqlock.h b/include/litmus/norqlock.h
new file mode 100644
index 0000000..e4c1d06
--- /dev/null
+++ b/include/litmus/norqlock.h
@@ -0,0 +1,26 @@
+#ifndef NORQLOCK_H
+#define NORQLOCK_H
+
+typedef void (*work_t)(unsigned long arg);
+
+struct no_rqlock_work {
+	int			active;
+	work_t			work;
+	unsigned long		arg;
+	struct no_rqlock_work*  next;
+};
+
+void init_no_rqlock_work(struct no_rqlock_work* w, work_t work,
+			 unsigned long arg);
+
+void __do_without_rqlock(struct no_rqlock_work *work);
+
+static inline void do_without_rqlock(struct no_rqlock_work *work)
+{
+	if (!test_and_set_bit(0, (void*)&work->active))
+		__do_without_rqlock(work);
+}
+
+void tick_no_rqlock(void);
+
+#endif
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 0000000..47cd123
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,127 @@
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_RT_DOMAIN_H__
+#define __UNC_RT_DOMAIN_H__
+
+#include <litmus/norqlock.h>
+#include <litmus/heap.h>
+
+struct _rt_domain;
+
+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
+typedef void (*release_job_t)(struct task_struct *t, struct _rt_domain *rt);
+
+typedef struct _rt_domain {
+	struct no_rqlock_work		arm_timers;
+
+	/* runnable rt tasks are in here */
+	spinlock_t 			ready_lock;
+	struct heap	 		ready_queue;
+
+	/* real-time tasks waiting for release are in here */
+	spinlock_t 			release_lock;
+	struct list_head 		release_queue;
+
+	/* how do we check if we need to kick another CPU? */
+	check_resched_needed_t		check_resched;
+
+	/* how do we release a job? */
+	release_job_t			release_job;
+
+	/* how are tasks ordered in the ready queue? */
+	heap_prio_t			order;
+} rt_domain_t;
+
+static inline struct task_struct* __next_ready(rt_domain_t* rt)
+{
+	struct heap_node *hn = heap_peek(rt->order, &rt->ready_queue);
+	if (hn)
+		return heap2task(hn);
+	else
+		return NULL;
+}
+
+void rt_domain_init(rt_domain_t *rt, heap_prio_t order,
+		    check_resched_needed_t check,
+		    release_job_t relase);
+
+void __add_ready(rt_domain_t* rt, struct task_struct *new);
+void __add_release(rt_domain_t* rt, struct task_struct *task);
+
+static inline struct task_struct* __take_ready(rt_domain_t* rt)
+{
+	struct heap_node* hn = heap_take(rt->order, &rt->ready_queue);
+	if (hn)
+		return heap2task(hn);
+	else
+		return NULL;
+}
+
+static inline struct task_struct* __peek_ready(rt_domain_t* rt)
+{
+	struct heap_node* hn = heap_peek(rt->order, &rt->ready_queue);
+	if (hn)
+		return heap2task(hn);
+	else
+		return NULL;
+}
+
+static inline int  is_queued(struct task_struct *t)
+{
+	return heap_node_in_heap(tsk_rt(t)->heap_node);
+}
+
+static inline void remove(rt_domain_t* rt, struct task_struct *t)
+{
+	heap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
+}
+
+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+	unsigned long flags;
+	/* first we need the write lock for rt_ready_queue */
+	spin_lock_irqsave(&rt->ready_lock, flags);
+	__add_ready(rt, new);
+	spin_unlock_irqrestore(&rt->ready_lock, flags);
+}
+
+static inline struct task_struct* take_ready(rt_domain_t* rt)
+{
+	unsigned long flags;
+	struct task_struct* ret;
+	/* first we need the write lock for rt_ready_queue */
+	spin_lock_irqsave(&rt->ready_lock, flags);
+	ret = __take_ready(rt);
+	spin_unlock_irqrestore(&rt->ready_lock, flags);
+	return ret;
+}
+
+
+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
+{
+	unsigned long flags;
+	/* first we need the write lock for rt_ready_queue */
+	spin_lock_irqsave(&rt->release_lock, flags);
+	__add_release(rt, task);
+	spin_unlock_irqrestore(&rt->release_lock, flags);
+}
+
+static inline int __jobs_pending(rt_domain_t* rt)
+{
+	return !heap_empty(&rt->ready_queue);
+}
+
+static inline int jobs_pending(rt_domain_t* rt)
+{
+	unsigned long flags;
+	int ret;
+	/* first we need the write lock for rt_ready_queue */
+	spin_lock_irqsave(&rt->ready_lock, flags);
+	ret = !heap_empty(&rt->ready_queue);
+	spin_unlock_irqrestore(&rt->ready_lock, flags);
+	return ret;
+}
+
+#endif
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 0000000..7bb5684
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,170 @@
+/*
+ * Definition of the scheduler plugin interface.
+ *
+ */
+#ifndef _LINUX_RT_PARAM_H_
+#define _LINUX_RT_PARAM_H_
+
+/* Litmus time type. */
+typedef unsigned long long lt_t;
+
+static inline int lt_after(lt_t a, lt_t b)
+{
+	return ((long long) b) - ((long long) a) < 0;
+}
+#define lt_before(a, b) lt_after(b, a)
+
+static inline int lt_after_eq(lt_t a, lt_t b)
+{
+	return ((long long) a) - ((long long) b) >= 0;
+}
+#define lt_before_eq(a, b) lt_after_eq(b, a)
+
+/* different types of clients */
+typedef enum {
+	RT_CLASS_HARD,
+	RT_CLASS_SOFT,
+	RT_CLASS_BEST_EFFORT
+} task_class_t;
+
+struct rt_task {
+	lt_t 		exec_cost;
+	lt_t 		period;
+	lt_t		phase;
+	unsigned int  	cpu;
+	task_class_t  	cls;
+};
+
+/* don't export internal data structures to user space (liblitmus) */
+#ifdef __KERNEL__
+
+struct _rt_domain;
+struct heap_node;
+
+struct rt_job {
+	/* Time instant the the job was or will be released.  */
+	lt_t	release;
+	/* What is the current deadline? */
+	lt_t   	deadline;
+
+	/* How much service has this job received so far? */
+	lt_t	exec_time;
+
+	/* Which job is this. This is used to let user space
+	 * specify which job to wait for, which is important if jobs
+	 * overrun. If we just call sys_sleep_next_period() then we
+	 * will unintentionally miss jobs after an overrun.
+	 *
+	 * Increase this sequence number when a job is released.
+	 */
+	unsigned int    job_no;
+
+	/* when did this job start executing? */
+	lt_t	exec_start;
+};
+
+
+struct pfair_param;
+
+/*	RT task parameters for scheduling extensions
+ *	These parameters are inherited during clone and therefore must
+ *	be explicitly set up before the task set is launched.
+ */
+struct rt_param {
+	/* is the task sleeping? */
+	unsigned int 		flags:8;
+
+	/* do we need to check for srp blocking? */
+	unsigned int		srp_non_recurse:1;
+
+	/* user controlled parameters */
+	struct rt_task 		task_params;
+
+	/* timing parameters */
+	struct rt_job 		job_params;
+
+	/* task representing the current "inherited" task
+	 * priority, assigned by inherit_priority and
+	 * return priority in the scheduler plugins.
+	 * could point to self if PI does not result in
+	 * an increased task priority.
+	 */
+	 struct task_struct*	inh_task;
+
+	/* Don't just dereference this pointer in kernel space!
+	 * It might very well point to junk or nothing at all.
+	 * NULL indicates that the task has not requested any non-preemptable
+	 * section support.
+	 * Not inherited upon fork.
+	 */
+	short* 			np_flag;
+
+	/* For the FMLP under PSN-EDF, it is required to make the task
+	 * non-preemptive from kernel space. In order not to interfere with
+	 * user space, this counter indicates the kernel space np setting.
+	 * kernel_np > 0 => task is non-preemptive
+	 */
+	unsigned int 		kernel_np;
+
+	/* This field can be used by plugins to store where the task
+	 * is currently scheduled. It is the responsibility of the
+	 * plugin to avoid race conditions.
+	 *
+	 * This used by GSN-EDF and PFAIR.
+	 */
+	volatile int		scheduled_on;
+
+	/* Is the stack of the task currently in use? This is updated by
+	 * the LITMUS core.
+	 *
+	 * Be careful to avoid deadlocks!
+	 */
+	volatile int		stack_in_use;
+
+	/* This field can be used by plugins to store where the task
+	 * is currently linked. It is the responsibility of the plugin
+	 * to avoid race conditions.
+	 *
+	 * Used by GSN-EDF.
+	 */
+	volatile int		linked_on;
+
+	/* PFAIR/PD^2 state. Allocated on demand. */
+	struct pfair_param*	pfair;
+
+	/* Fields saved before BE->RT transition.
+	 */
+	int old_policy;
+	int old_prio;
+
+	/* The high-resolution timer used to control its release. */
+	struct hrtimer release_timer;
+
+	/* ready queue for this task */
+	struct _rt_domain* domain;
+
+	/* heap element for this task
+	 *
+	 * Warning: Don't statically allocate this node. The heap
+	 *          implementation swaps these between tasks, thus after
+	 *          dequeuing from a heap you may end up with a different node
+	 *          then the one you had when enqueuing the task.  For the same
+	 *          reason, don't obtain and store references to this node
+	 *          other than this pointer (which is updated by the heap
+	 *          implementation).
+	 */
+	struct heap_node*	heap_node;
+
+	/* Used by rt_domain to queue task in release list.
+	 */
+	struct list_head list;
+};
+
+/*	Possible RT flags	*/
+#define RT_F_RUNNING		0x00000000
+#define RT_F_SLEEP		0x00000001
+#define RT_F_EXIT_SEM		0x00000008
+
+#endif
+
+#endif
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 0000000..aba7522
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,144 @@
+/*
+ * Definition of the scheduler plugin interface.
+ *
+ */
+#ifndef _LINUX_SCHED_PLUGIN_H_
+#define _LINUX_SCHED_PLUGIN_H_
+
+#include <linux/sched.h>
+
+/* struct for semaphore with priority inheritance */
+struct pi_semaphore {
+	atomic_t count;
+	int sleepers;
+	wait_queue_head_t wait;
+	union {
+		/* highest-prio holder/waiter */
+		struct task_struct *task;
+		struct task_struct* cpu_task[NR_CPUS];
+	} hp;
+	/* current lock holder */
+	struct task_struct *holder;
+};
+
+
+/********************* scheduler invocation ******************/
+
+/*  Plugin-specific realtime tick handler */
+typedef void (*scheduler_tick_t) (struct task_struct *cur);
+/* Novell make sched decision function */
+typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
+/* Clean up after the task switch has occured.
+ * This function is called after every (even non-rt) task switch.
+ */
+typedef void (*finish_switch_t)(struct task_struct *prev);
+
+
+/********************* task state changes ********************/
+
+/* Called to setup a new real-time task.
+ * Release the first job, enqueue, etc.
+ * Task may already be running.
+ */
+typedef void (*task_new_t) (struct task_struct *task,
+			    int on_rq,
+			    int running);
+
+/* Called to re-introduce a task after blocking.
+ * Can potentially be called multiple times.
+ */
+typedef void (*task_wake_up_t) (struct task_struct *task);
+/* called to notify the plugin of a blocking real-time task
+ * it will only be called for real-time tasks and before schedule is called */
+typedef void (*task_block_t)  (struct task_struct *task);
+/* Called when a real-time task exits or changes to a different scheduling
+ * class.
+ * Free any allocated resources
+ */
+typedef void (*task_exit_t)    (struct task_struct *);
+
+/* Called when the new_owner is released from the wait queue
+ * it should now inherit the priority from sem, _before_ it gets readded
+ * to any queue
+ */
+typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
+				    struct task_struct *new_owner);
+
+/* Called when the current task releases a semahpore where it might have
+ * inherited a piority from
+ */
+typedef long (*return_priority_t) (struct pi_semaphore *sem);
+
+/* Called when a task tries to acquire a semaphore and fails. Check if its
+ * priority is higher than that of the current holder.
+ */
+typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
+
+
+/********************* sys call backends  ********************/
+/* This function causes the caller to sleep until the next release */
+typedef long (*complete_job_t) (void);
+
+typedef long (*admit_task_t)(struct task_struct* tsk);
+
+typedef void (*release_at_t)(struct task_struct *t, lt_t start);
+
+struct sched_plugin {
+	struct list_head	list;
+	/* 	basic info 		*/
+	char 			*plugin_name;
+#ifdef CONFIG_SRP
+	unsigned int		srp_active;
+#endif
+
+	/* 	scheduler invocation 	*/
+	scheduler_tick_t        tick;
+	schedule_t 		schedule;
+	finish_switch_t 	finish_switch;
+
+	/*	syscall backend 	*/
+	complete_job_t 		complete_job;
+	release_at_t		release_at;
+
+	/*	task state changes 	*/
+	admit_task_t		admit_task;
+
+        task_new_t 		task_new;
+	task_wake_up_t		task_wake_up;
+	task_block_t		task_block;
+	task_exit_t 		task_exit;
+
+#ifdef CONFIG_FMLP
+	/*     priority inheritance 	*/
+	unsigned int		fmlp_active;
+	inherit_priority_t	inherit_priority;
+	return_priority_t	return_priority;
+	pi_block_t		pi_block;
+#endif
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+
+extern struct sched_plugin *litmus;
+
+int register_sched_plugin(struct sched_plugin* plugin);
+struct sched_plugin* find_sched_plugin(const char* name);
+int print_sched_plugins(char* buf, int max);
+
+static inline int srp_active(void)
+{
+#ifdef CONFIG_SRP
+	return litmus->srp_active;
+#else
+	return 0;
+#endif
+}
+static inline int fmlp_active(void)
+{
+#ifdef CONFIG_FMLP
+	return litmus->fmlp_active;
+#else
+	return 0;
+#endif
+}
+
+#endif
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 0000000..60dcbfb
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,31 @@
+/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
+ */
+#ifndef _LINUX_SCHED_TRACE_H_
+#define _LINUX_SCHED_TRACE_H_
+
+#include <linux/sched.h>
+
+/* dummies, need to be re-implemented */
+
+/* used in sched.c */
+#define  sched_trace_task_arrival(t)
+#define sched_trace_task_departure(t)
+#define sched_trace_task_preemption(t, by)
+#define sched_trace_task_scheduled(t)
+
+/* used in scheduler plugins */
+#define sched_trace_job_release(t)
+#define sched_trace_job_completion(t)
+
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+void sched_trace_log_message(const char* fmt, ...);
+
+#else
+
+#define sched_trace_log_message(fmt, ...)
+
+#endif
+
+
+#endif
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 0000000..2c8e141
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,107 @@
+#ifndef _SYS_TRACE_H_
+#define	_SYS_TRACE_H_
+
+#ifdef CONFIG_FEATHER_TRACE
+
+#include <litmus/feather_trace.h>
+#include <litmus/feather_buffer.h>
+
+
+/*********************** TIMESTAMPS ************************/
+
+enum task_type_marker {
+	TSK_BE,
+	TSK_RT,
+	TSK_UNKNOWN
+};
+
+struct timestamp {
+	uint64_t		timestamp;
+	uint32_t		seq_no;
+	uint8_t			cpu;
+	uint8_t			event;
+	uint8_t			task_type;
+};
+
+
+/* buffer holding time stamps - will be provided by driver */
+extern struct ft_buffer* trace_ts_buf;
+
+/* tracing callbacks */
+feather_callback void save_timestamp(unsigned long event);
+feather_callback void save_timestamp_def(unsigned long event, unsigned long type);
+feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
+
+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
+
+#define DTIMESTAMP(id, def)  ft_event1(id, save_timestamp_def, def)
+
+#define TTIMESTAMP(id, task) ft_event1(id, save_timestamp_task, (unsigned long) task)
+
+#else /* !CONFIG_FEATHER_TRACE */
+
+#define TIMESTAMP(id)        /* no tracing */
+
+#define DTIMESTAMP(id, def)  /* no tracing */
+
+#define TTIMESTAMP(id, task) /* no tracing */
+
+#endif
+
+
+/* Convention for timestamps
+ * =========================
+ *
+ * In order to process the trace files with a common tool, we use the following
+ * convention to measure execution times: The end time id of a code segment is
+ * always the next number after the start time event id.
+ */
+
+#define TS_SCHED_START			DTIMESTAMP(100, TSK_UNKNOWN) /* we only
+								      * care
+								      * about
+								      * next */
+#define TS_SCHED_END(t)			TTIMESTAMP(101, t)
+#define TS_SCHED2_START(t) 		TTIMESTAMP(102, t)
+#define TS_SCHED2_END(t)       		TTIMESTAMP(103, t)
+
+#define TS_CXS_START(t)			TTIMESTAMP(104, t)
+#define TS_CXS_END(t)			TTIMESTAMP(105, t)
+
+#define TS_RELEASE_START		DTIMESTAMP(106, TSK_RT)
+#define TS_RELEASE_END			DTIMESTAMP(107, TSK_RT)
+
+#define TS_TICK_START(t)		TTIMESTAMP(110, t)
+#define TS_TICK_END(t) 			TTIMESTAMP(111, t)
+
+
+#define TS_PLUGIN_SCHED_START		/* TIMESTAMP(120) */  /* currently unused */
+#define TS_PLUGIN_SCHED_END		/* TIMESTAMP(121) */
+
+#define TS_PLUGIN_TICK_START		/* TIMESTAMP(130) */
+#define TS_PLUGIN_TICK_END		/* TIMESTAMP(131) */
+
+#define TS_ENTER_NP_START		TIMESTAMP(140)
+#define TS_ENTER_NP_END			TIMESTAMP(141)
+
+#define TS_EXIT_NP_START		TIMESTAMP(150)
+#define TS_EXIT_NP_END			TIMESTAMP(151)
+
+#define TS_SRP_UP_START			TIMESTAMP(160)
+#define TS_SRP_UP_END			TIMESTAMP(161)
+#define TS_SRP_DOWN_START		TIMESTAMP(162)
+#define TS_SRP_DOWN_END			TIMESTAMP(163)
+
+#define TS_PI_UP_START			TIMESTAMP(170)
+#define TS_PI_UP_END			TIMESTAMP(171)
+#define TS_PI_DOWN_START		TIMESTAMP(172)
+#define TS_PI_DOWN_END			TIMESTAMP(173)
+
+#define TS_FIFO_UP_START		TIMESTAMP(180)
+#define TS_FIFO_UP_END			TIMESTAMP(181)
+#define TS_FIFO_DOWN_START		TIMESTAMP(182)
+#define TS_FIFO_DOWN_END		TIMESTAMP(183)
+
+
+
+#endif /* !_SYS_TRACE_H_ */
diff --git a/include/litmus/unistd.h b/include/litmus/unistd.h
new file mode 100644
index 0000000..8224235
--- /dev/null
+++ b/include/litmus/unistd.h
@@ -0,0 +1,20 @@
+
+#define __LSC(x) (__NR_LITMUS + x)
+
+#define __NR_set_rt_task_param	__LSC(0)
+#define __NR_get_rt_task_param	__LSC(1)
+#define __NR_sleep_next_period  __LSC(2)
+#define __NR_register_np_flag   __LSC(3)
+#define __NR_exit_np            __LSC(4)
+#define __NR_od_open		__LSC(5)
+#define __NR_od_close		__LSC(6)
+#define __NR_fmlp_down		__LSC(7)
+#define __NR_fmlp_up		__LSC(8)
+#define __NR_srp_down		__LSC(9)
+#define __NR_srp_up		__LSC(10)
+#define __NR_query_job_no	__LSC(11)
+#define __NR_wait_for_job_release __LSC(12)
+#define __NR_wait_for_ts_release __LSC(13)
+#define __NR_release_ts		__LSC(14)
+
+#define NR_litmus_syscalls 15
diff --git a/kernel/exit.c b/kernel/exit.c
index 549c055..bc313b7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,8 @@
 
 extern void sem_exit (void);
 
+extern void exit_od_table(struct task_struct* t);
+
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
@@ -987,6 +989,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
 
+	exit_od_table(tsk);
+
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 8dd8ff2..4c322d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,6 +59,9 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -121,6 +124,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	exit_litmus(tsk);
+
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -182,6 +187,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	*tsk = *orig;
 	tsk->stack = ti;
 
+	/* Don't let the new task be a real-time task. */
+	memset(&tsk->rt_param, 0, sizeof(struct rt_task));
+
 	err = prop_local_init_single(&tsk->dirties);
 	if (err) {
 		free_thread_info(ti);
diff --git a/kernel/printk.c b/kernel/printk.c
index 89011bf..9eb2dc5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -54,6 +54,12 @@ int console_printk[4] = {
 	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
 };
 
+/* divert printk() messages when we have a LITMUS^RT
+ * debug listener
+ */
+#include <litmus/litmus.h>
+int trace_override = 0;
+
 /*
  * Low level drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
@@ -652,6 +658,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 
 	/* Emit the output into the temporary buffer */
 	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+	if (trace_override)
+		TRACE("%s", printk_buf);
 
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
@@ -932,7 +940,7 @@ int is_console_locked(void)
 
 void wake_up_klogd(void)
 {
-	if (!oops_in_progress && waitqueue_active(&log_wait))
+	if (!trace_override && !oops_in_progress && waitqueue_active(&log_wait))
 		wake_up_interruptible(&log_wait);
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e76b11c..9ee07ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -67,6 +67,10 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
+#include <litmus/trace.h>
+
+#include <litmus/norqlock.h>
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  * This is default implementation.
@@ -324,6 +328,8 @@ struct rq {
 
 	atomic_t nr_iowait;
 
+	struct task_struct* litmus_next;
+
 #ifdef CONFIG_SMP
 	struct sched_domain *sd;
 
@@ -875,11 +881,12 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "../litmus/sched_litmus.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 
 /*
  * Update delta_exec, delta_fair fields for rq.
@@ -1516,6 +1523,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	int new_cpu;
 #endif
 
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up()\n");
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!(old_state & state))
@@ -1529,7 +1538,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
+	if (unlikely(task_running(rq, p) || is_realtime(p)))
 		goto out_activate;
 
 	new_cpu = cpu;
@@ -1650,8 +1659,10 @@ out_activate:
 out_running:
 	p->state = TASK_RUNNING;
 out:
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() done, p->state=%d\n", p->state);
 	task_rq_unlock(rq, &flags);
-
+	tick_no_rqlock();
 	return success;
 }
 
@@ -1890,6 +1901,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	litmus->finish_switch(prev);
+	prev->rt_param.stack_in_use = NO_CPU;
 	finish_lock_switch(rq, prev);
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
@@ -3480,6 +3493,7 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	u64 next_tick = rq->tick_timestamp + TICK_NSEC;
 
+	TS_TICK_START(current);
 	spin_lock(&rq->lock);
 	__update_rq_clock(rq);
 	/*
@@ -3491,12 +3505,17 @@ void scheduler_tick(void)
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
 		curr->sched_class->task_tick(rq, curr);
+	TS_PLUGIN_TICK_START;
+	litmus_tick(rq, curr);
+	TS_PLUGIN_TICK_END;
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	if (!is_realtime(current))
+		trigger_load_balance(rq, cpu);
 #endif
+	TS_TICK_END(current);
 }
 
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
@@ -3594,11 +3613,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.nr_running)) {
+	/* Don't do that for LITMUS.
+	  if (likely(rq->nr_running == rq->cfs.nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
 	}
+	*/
 
 	class = sched_class_highest;
 	for ( ; ; ) {
@@ -3633,6 +3654,7 @@ need_resched:
 
 	release_kernel_lock(prev);
 need_resched_nonpreemptible:
+	TS_SCHED_START;
 
 	schedule_debug(prev);
 
@@ -3643,6 +3665,9 @@ need_resched_nonpreemptible:
 	__update_rq_clock(rq);
 	spin_lock(&rq->lock);
 	clear_tsk_need_resched(prev);
+	TS_PLUGIN_SCHED_START;
+	litmus_schedule(rq, prev);
+	TS_PLUGIN_SCHED_END;
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3667,18 +3692,32 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;
 
+		TS_SCHED_END(next);
+		TS_CXS_START(next);
 		context_switch(rq, prev, next); /* unlocks the rq */
-	} else
+		TS_CXS_END(current);
+	} else {
+		TS_SCHED_END(prev);
 		spin_unlock_irq(&rq->lock);
+	}
+	TS_SCHED2_START(current);
+
+	tick_no_rqlock();
 
 	if (unlikely(reacquire_kernel_lock(current) < 0)) {
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
+		TS_SCHED2_END(current);
 		goto need_resched_nonpreemptible;
 	}
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) {
+		TS_SCHED2_END(current);
 		goto need_resched;
+	}
+	TS_SCHED2_END(current);
+	if (srp_active())
+		srp_ceiling_block();
 }
 EXPORT_SYMBOL(schedule);
 
@@ -3886,6 +3925,18 @@ void complete_all(struct completion *x)
 }
 EXPORT_SYMBOL(complete_all);
 
+void complete_n(struct completion *x, int n)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += n;
+	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			 n, 0, NULL);
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_n);
+
 static inline long __sched
 do_wait_for_common(struct completion *x, long timeout, int state)
 {
@@ -4236,6 +4287,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	case SCHED_RR:
 		p->sched_class = &rt_sched_class;
 		break;
+	case SCHED_LITMUS:
+		p->sched_class = &litmus_sched_class;
+		break;
 	}
 
 	p->rt_priority = prio;
@@ -4268,7 +4322,7 @@ recheck:
 		policy = oldpolicy = p->policy;
 	else if (policy != SCHED_FIFO && policy != SCHED_RR &&
 			policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-			policy != SCHED_IDLE)
+			policy != SCHED_IDLE && policy != SCHED_LITMUS)
 		return -EINVAL;
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
@@ -4282,6 +4336,9 @@ recheck:
 	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
+	if (policy == SCHED_LITMUS && policy == p->policy)
+		return -EINVAL;
+
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
@@ -4316,6 +4373,12 @@ recheck:
 			return -EPERM;
 	}
 
+	if (policy == SCHED_LITMUS) {
+		retval = litmus_admit_task(p);
+		if (retval)
+			return retval;
+	}
+
 	retval = security_task_setscheduler(p, policy, param);
 	if (retval)
 		return retval;
@@ -4345,9 +4408,17 @@ recheck:
 			p->sched_class->put_prev_task(rq, p);
 	}
 
+	if (p->policy == SCHED_LITMUS)
+		litmus_exit_task(p);
+
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (policy == SCHED_LITMUS) {
+		p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+		litmus->task_new(p, on_rq, running);
+	}
+
 	if (on_rq) {
 		if (running)
 			p->sched_class->set_curr_task(rq);
@@ -4364,6 +4435,7 @@ recheck:
 			check_preempt_curr(rq, p);
 		}
 	}
+
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -4494,10 +4566,11 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 	read_lock(&tasklist_lock);
 
 	p = find_process_by_pid(pid);
-	if (!p) {
+	if (!p || is_realtime(p)) {
+		/* LITMUS tasks don't get to do this, transition to BE first */
 		read_unlock(&tasklist_lock);
 		mutex_unlock(&sched_hotcpu_mutex);
-		return -ESRCH;
+		return p ? -EPERM : -ESRCH;
 	}
 
 	/*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index da7c061..de30496 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -845,7 +845,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	unsigned long gran;
 
-	if (unlikely(rt_prio(p->prio))) {
+	if (unlikely(rt_prio(p->prio) || p->policy == SCHED_LITMUS)) {
 		update_rq_clock(rq);
 		update_curr(cfs_rq);
 		resched_task(curr);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9ba3daa..c7c938c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -70,7 +70,7 @@ yield_task_rt(struct rq *rq)
  */
 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 {
-	if (p->prio < rq->curr->prio)
+	if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS)
 		resched_task(rq->curr);
 }
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb89fa8..3b1936f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -568,6 +568,22 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 }
 
 /**
+ * tick_set_quanta_type - get the quanta type as a boot option
+ * Default is standard setup with ticks staggered over first
+ * half of tick period.
+ */
+int quanta_type = LINUX_DEFAULT_TICKS;
+static int __init tick_set_quanta_type(char *str)
+{
+	if (strcmp("aligned", str) == 0)
+		quanta_type = LITMUS_ALIGNED_TICKS;
+	else if (strcmp("staggered", str) == 0)
+		quanta_type = LITMUS_STAGGERED_TICKS;
+	return 1;
+}
+__setup("quanta=", tick_set_quanta_type);
+
+/**
  * tick_setup_sched_timer - setup the tick emulation timer
  */
 void tick_setup_sched_timer(void)
@@ -585,9 +601,24 @@ void tick_setup_sched_timer(void)
 
 	/* Get the next period (per cpu) */
 	ts->sched_timer.expires = tick_init_jiffy_update();
-	offset = ktime_to_ns(tick_period) >> 1;
-	do_div(offset, num_possible_cpus());
-	offset *= smp_processor_id();
+
+	/* Offset must be set correctly to achieve desired quanta type. */
+	switch (quanta_type) {
+		case LITMUS_ALIGNED_TICKS:
+			offset = 0;
+			break;
+		case LITMUS_STAGGERED_TICKS:
+			offset = ktime_to_ns(tick_period);
+			do_div(offset, num_possible_cpus());
+			offset *= smp_processor_id();
+			break;
+		default:
+			offset = ktime_to_ns(tick_period) >> 1;
+			do_div(offset, num_possible_cpus());
+			offset *= smp_processor_id();
+	}
+
+	/* Add correct offset to expiration time. */
 	ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset);
 
 	for (;;) {
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 0000000..9a2ab90
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,78 @@
+menu "LITMUS^RT"
+
+menu "Real-Time Synchronization"
+
+config NP_SECTION
+	bool "Non-preemptive section support"
+	depends on !SPARC64
+	default n
+	help
+	  Include support for flag-based non-preemptive section signaling
+	  from userspace.
+
+	  (currently broken on SPARC64)
+
+	  Say Yes if you want FMLP short critical section  synchronization support.
+
+
+config SRP
+	bool "Stack Resource Policy (SRP)"
+	default n
+	help
+	  Include support for Baker's Stack Resource Policy. 
+
+	  Say Yes if you want FMLP local long  critical section synchronization support.
+
+config FMLP
+	bool "FMLP support"
+	depends on NP_SECTION
+	default n
+	help
+	  Include support for deterministic multiprocessor real-time
+	  synchronization support.
+
+	  Say Yes if you want FMLP long critical section synchronization support.
+
+endmenu
+
+menu "Tracing"
+
+config SCHED_TASK_TRACE
+	bool "Trace real-time tasks"
+	default y
+	help
+	  Include support for the sched_trace_XXX() tracing functions. This
+          allows the collection of real-time task events such as job
+	  completions, job releases, early completions, etc. This results in  a
+	  small overhead in the scheduling code. Disable if the overhead is not
+	  acceptable (e.g., benchmarking).
+
+	  Say Yes for debugging.
+	  Say No for overhead tracing.
+
+config SCHED_DEBUG_TRACE
+	bool "TRACE() debugging"
+	default y
+	help
+	  Include support for sched_trace_log_messageg(), which is used to
+	  implement TRACE(). If disabled, no TRACE() messages will be included
+	  in the kernel, and no overheads due to debugging statements will be
+	  incurred by the scheduler. Disable if the overhead is not acceptable
+	  (e.g. benchmarking).
+
+	  Say Yes for debugging.
+	  Say No for overhead tracing.
+
+config FEATHER_TRACE
+	bool "Feather-Trace Instrumentation Support"
+	default y
+	help
+	  Include Feather-Trace trace points. Currently not supported on
+	  sparc64.
+
+	  Say Yes for overhead tracing.
+
+
+endmenu
+
+endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 0000000..5452038
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for LITMUS^RT
+#
+
+obj-y     = sched_plugin.o litmus.o sched_trace.o \
+	    edf_common.o jobs.o \
+	    rt_domain.o fdso.o sync.o \
+	    fmlp.o srp.o norqlock.o \
+            sched_gsn_edf.o \
+	    sched_psn_edf.o \
+            sched_cedf.o \
+            sched_pfair.o
+
+obj-$(CONFIG_FEATHER_TRACE) += trace.o ft_event.o
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 0000000..d7567ac
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,94 @@
+/*
+ * kernel/edf_common.c
+ *
+ * Common functions for EDF based scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+
+#include <litmus/edf_common.h>
+
+/* edf_higher_prio -  returns true if first has a higher EDF priority
+ *                    than second. Deadline ties are broken by PID.
+ *
+ * first first must not be NULL and a real-time task.
+ * second may be NULL or a non-rt task.
+ */
+int edf_higher_prio(struct task_struct* first,
+		    struct task_struct* second)
+{
+	struct task_struct *first_task = first;
+	struct task_struct *second_task = second;
+
+	/* Check for inherited priorities. Change task
+	 * used for comparison in such a case.
+	 */
+	if (first && first->rt_param.inh_task)
+		first_task = first->rt_param.inh_task;
+	if (second && second->rt_param.inh_task)
+		second_task = second->rt_param.inh_task;
+
+	return
+		/* does the second task exist and is it a real-time task?  If
+		 * not, the first task (which is a RT task) has higher
+		 * priority.
+		 */
+		!second_task || !is_realtime(second_task)  ||
+
+		/* is the deadline of the first task earlier?
+		 * Then it has higher priority.
+		 */
+		earlier_deadline(first_task, second_task) ||
+
+		/* Do we have a deadline tie?
+		 * Then break by PID.
+		 */
+		(get_deadline(first_task) == get_deadline(second_task) &&
+	        (first_task->pid < second_task->pid ||
+
+		/* If the PIDs are the same then the task with the inherited
+		 * priority wins.
+		 */
+		(first_task->pid == second_task->pid &&
+		 !second->rt_param.inh_task)));
+}
+
+int edf_ready_order(struct heap_node* a, struct heap_node* b)
+{
+	return edf_higher_prio(heap2task(a), heap2task(b));
+}
+
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+		      release_job_t release)
+{
+	rt_domain_init(rt,  edf_ready_order, resched, release);
+}
+
+/* need_to_preempt - check whether the task t needs to be preempted
+ *                   call only with irqs disabled and with  ready_lock acquired
+ *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
+ */
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
+{
+	/* we need the read lock for edf_ready_queue */
+	/* no need to preempt if there is nothing pending */
+	if (!__jobs_pending(rt))
+		return 0;
+	/* we need to reschedule if t doesn't exist */
+	if (!t)
+		return 1;
+
+	/* NOTE: We cannot check for non-preemptibility since we
+	 *       don't know what address space we're currently in.
+	 */
+
+	/* make sure to get non-rt stuff out of the way */
+	return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
+}
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 0000000..81ab0af
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,282 @@
+/* fdso.c - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ *
+ * Notes:
+ *   - objects descriptor (OD) tables are not cloned during a fork.
+ *   - objects are created on-demand, and freed after the last reference
+ *     is dropped.
+ *   - for now, object types are hard coded.
+ *   - As long as we have live objects, we keep a reference to the inode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+
+#include <litmus/fdso.h>
+
+extern struct fdso_ops fmlp_sem_ops;
+extern struct fdso_ops srp_sem_ops;
+
+static const struct fdso_ops* fdso_ops[] = {
+	&fmlp_sem_ops,
+	&srp_sem_ops,
+};
+
+static void* fdso_create(obj_type_t type)
+{
+	if (fdso_ops[type]->create)
+		return fdso_ops[type]->create();
+	else
+		return NULL;
+}
+
+static void fdso_destroy(obj_type_t type, void* obj)
+{
+	fdso_ops[type]->destroy(obj);
+}
+
+static int fdso_open(struct od_table_entry* entry, void* __user config)
+{
+	if (fdso_ops[entry->obj->type]->open)
+		return fdso_ops[entry->obj->type]->open(entry, config);
+	else
+		return 0;
+}
+
+static int fdso_close(struct od_table_entry* entry)
+{
+	if (fdso_ops[entry->obj->type]->close)
+		return fdso_ops[entry->obj->type]->close(entry);
+	else
+		return 0;
+}
+
+/* inode must be locked already */
+static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
+					    obj_type_t type,
+					    unsigned int id)
+{
+	struct inode_obj_id* obj;
+	void* raw_obj;
+
+	raw_obj = fdso_create(type);
+	if (!raw_obj)
+		return NULL;
+
+	obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL);
+	if (!obj)
+		return NULL;
+	INIT_LIST_HEAD(&obj->list);
+	atomic_set(&obj->count, 1);
+	obj->type  = type;
+	obj->id    = id;
+	obj->obj   = raw_obj;
+	obj->inode = inode;
+
+	list_add(&obj->list, &inode->i_obj_list);
+	atomic_inc(&inode->i_count);
+
+	printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
+	return obj;
+}
+
+/* inode must be locked already */
+static struct inode_obj_id* get_inode_obj(struct inode* inode,
+					  obj_type_t type,
+					  unsigned int id)
+{
+	struct list_head* pos;
+	struct inode_obj_id* obj = NULL;
+
+	list_for_each(pos, &inode->i_obj_list) {
+		obj = list_entry(pos, struct inode_obj_id, list);
+		if (obj->id == id && obj->type == type) {
+			atomic_inc(&obj->count);
+			return obj;
+		}
+	}
+	printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
+	return NULL;
+}
+
+
+static void put_inode_obj(struct inode_obj_id* obj)
+{
+	struct inode* inode;
+	int let_go = 0;
+
+	inode = obj->inode;
+	if (atomic_dec_and_test(&obj->count)) {
+
+		mutex_lock(&inode->i_obj_mutex);
+		/* no new references can be obtained */
+		if (!atomic_read(&obj->count)) {
+			list_del(&obj->list);
+			fdso_destroy(obj->type, obj->obj);
+			kfree(obj);
+			let_go = 1;
+		}
+		mutex_unlock(&inode->i_obj_mutex);
+		if (let_go)
+			iput(inode);
+	}
+}
+
+static struct od_table_entry*  get_od_entry(struct task_struct* t)
+{
+	struct od_table_entry* table;
+	int i;
+
+
+	table = t->od_table;
+	if (!table) {
+		table = (struct od_table_entry*)
+			kzalloc(sizeof(struct  od_table_entry) *
+				MAX_OBJECT_DESCRIPTORS, GFP_KERNEL);
+		t->od_table = table;
+	}
+
+	for (i = 0; table &&  i < MAX_OBJECT_DESCRIPTORS; i++)
+		if (!table[i].used) {
+			table[i].used = 1;
+			return table + i;
+		}
+	return NULL;
+}
+
+static int put_od_entry(struct od_table_entry* od)
+{
+	put_inode_obj(od->obj);
+	od->used = 0;
+	return 0;
+}
+
+void exit_od_table(struct task_struct* t)
+{
+	int i;
+
+	if (t->od_table) {
+		for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
+			if (t->od_table[i].used)
+				put_od_entry(t->od_table + i);
+		kfree(t->od_table);
+		t->od_table = NULL;
+	}
+}
+
+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
+			  void* __user config)
+{
+	int idx = 0, err;
+	struct inode* inode;
+	struct inode_obj_id* obj = NULL;
+	struct od_table_entry* entry;
+
+	inode = file->f_dentry->d_inode;
+
+	entry = get_od_entry(current);
+	if (!entry)
+		return -ENOMEM;
+
+	mutex_lock(&inode->i_obj_mutex);
+	obj = get_inode_obj(inode, type, id);
+	if (!obj)
+		obj = alloc_inode_obj(inode, type, id);
+	if (!obj) {
+		idx = -ENOMEM;
+		entry->used = 0;
+	} else {
+		entry->obj   = obj;
+		entry->extra = NULL;
+		idx = entry - current->od_table;
+	}
+
+	mutex_unlock(&inode->i_obj_mutex);
+
+	err = fdso_open(entry, config);
+	if (err < 0) {
+		/* The class rejected the open call.
+		 * We need to clean up and tell user space.
+		 */
+		put_od_entry(entry);
+		idx = err;
+	}
+
+	return idx;
+}
+
+
+struct od_table_entry* __od_lookup(int od)
+{
+	struct task_struct *t = current;
+
+	if (!t->od_table)
+		return NULL;
+	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+		return NULL;
+	if (!t->od_table[od].used)
+		return NULL;
+	return t->od_table + od;
+}
+
+
+asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config)
+{
+	int ret = 0;
+	struct file*  file;
+
+	/*
+	   1) get file from fd, get inode from file
+	   2) lock inode
+	   3) try to lookup object
+	   4) if not present create and enqueue object, inc inode refcnt
+	   5) increment refcnt of object
+	   6) alloc od_table_entry, setup ptrs
+	   7) unlock inode
+	   8) return offset in od_table as OD
+	 */
+
+	if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	file = fget(fd);
+	if (!file) {
+		ret = -EBADF;
+		goto out;
+	}
+
+	ret = do_sys_od_open(file, type, obj_id, config);
+
+	fput(file);
+
+out:
+	return ret;
+}
+
+
+asmlinkage int sys_od_close(int od)
+{
+	int ret = -EINVAL;
+	struct task_struct *t = current;
+
+	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+		return ret;
+
+	if (!t->od_table || !t->od_table[od].used)
+		return ret;
+
+
+	/* give the class a chance to reject the close
+	 */
+	ret = fdso_close(t->od_table + od);
+	if (ret == 0)
+		ret = put_od_entry(t->od_table + od);
+
+	return ret;
+}
diff --git a/litmus/fmlp.c b/litmus/fmlp.c
new file mode 100644
index 0000000..f34eeea
--- /dev/null
+++ b/litmus/fmlp.c
@@ -0,0 +1,262 @@
+/*
+ * FMLP implementation.
+ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
+ */
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+
+#include <litmus/fdso.h>
+
+#include <litmus/trace.h>
+
+#ifdef CONFIG_FMLP
+
+static  void* create_fmlp_semaphore(void)
+{
+	struct pi_semaphore* sem;
+	int i;
+
+	sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+	atomic_set(&sem->count, 1);
+	sem->sleepers = 0;
+	init_waitqueue_head(&sem->wait);
+	sem->hp.task = NULL;
+	sem->holder = NULL;
+	for (i = 0; i < NR_CPUS; i++)
+		sem->hp.cpu_task[i] = NULL;
+	return sem;
+}
+
+static int open_fmlp_semaphore(struct od_table_entry* entry, void* __user arg)
+{
+	if (!fmlp_active())
+		return -EBUSY;
+	return 0;
+}
+
+static void destroy_fmlp_semaphore(void* sem)
+{
+	/* XXX assert invariants */
+	kfree(sem);
+}
+
+struct fdso_ops fmlp_sem_ops = {
+	.create  = create_fmlp_semaphore,
+	.open    = open_fmlp_semaphore,
+	.destroy = destroy_fmlp_semaphore
+};
+
+struct wq_pair {
+	struct task_struct*  tsk;
+	struct pi_semaphore* sem;
+};
+
+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+			   void *key)
+{
+	struct wq_pair* wqp   = (struct wq_pair*) wait->private;
+	set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
+	litmus->inherit_priority(wqp->sem, wqp->tsk);
+	TRACE_TASK(wqp->tsk,
+		   "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
+	/* point to task for default_wake_function() */
+	wait->private = wqp->tsk;
+	default_wake_function(wait, mode, sync, key);
+
+	/* Always return true since we know that if we encountered a task
+	 * that was already running the wake_up raced with the schedule in
+	 * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
+	 * immediately and own the lock. We must not wake up another task in
+	 * any case.
+	 */
+	return 1;
+}
+
+/* caller is responsible for locking */
+int edf_set_hp_task(struct pi_semaphore *sem)
+{
+	struct list_head	*tmp, *next;
+	struct task_struct 	*queued;
+	int ret = 0;
+
+	sem->hp.task = NULL;
+	list_for_each_safe(tmp, next, &sem->wait.task_list) {
+		queued  = ((struct wq_pair*)
+			list_entry(tmp, wait_queue_t,
+				   task_list)->private)->tsk;
+
+		/* Compare task prios, find high prio task. */
+		if (edf_higher_prio(queued, sem->hp.task)) {
+			sem->hp.task = queued;
+			ret = 1;
+		}
+	}
+	return ret;
+}
+
+/* caller is responsible for locking */
+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
+{
+	struct list_head	*tmp, *next;
+	struct task_struct 	*queued;
+	int ret = 0;
+
+	sem->hp.cpu_task[cpu] = NULL;
+	list_for_each_safe(tmp, next, &sem->wait.task_list) {
+		queued  = ((struct wq_pair*)
+			list_entry(tmp, wait_queue_t,
+				   task_list)->private)->tsk;
+
+		/* Compare task prios, find high prio task. */
+		if (get_partition(queued) == cpu &&
+		    edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
+			sem->hp.cpu_task[cpu] = queued;
+			ret = 1;
+		}
+	}
+	return ret;
+}
+
+static int do_fmlp_down(struct pi_semaphore* sem)
+{
+	unsigned long flags;
+	struct task_struct *tsk = current;
+	struct wq_pair pair;
+	int suspended = 1;
+	wait_queue_t wait = {
+		.private = &pair,
+		.func    = rt_pi_wake_up,
+		.task_list = {NULL, NULL}
+	};
+
+	pair.tsk = tsk;
+	pair.sem = sem;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (atomic_dec_return(&sem->count) < 0 ||
+	    waitqueue_active(&sem->wait)) {
+		/* we need to suspend */
+		tsk->state = TASK_UNINTERRUPTIBLE;
+		add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+		TRACE_CUR("suspends on PI lock %p\n", sem);
+		litmus->pi_block(sem, tsk);
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		TS_PI_DOWN_END;
+		preempt_enable_no_resched();
+
+
+		/* we depend on the FIFO order
+		 * Thus, we don't need to recheck when we wake up, we
+		 * are guaranteed to have the lock since there is only one
+		 * wake up per release
+		 */
+		schedule();
+
+		TRACE_CUR("woke up, now owns PI lock %p\n", sem);
+
+		/* try_to_wake_up() set our state to TASK_RUNNING,
+		 * all we need to do is to remove our wait queue entry
+		 */
+		remove_wait_queue(&sem->wait, &wait);
+	} else {
+		/* no priority inheritance necessary, since there are no queued
+		 * tasks.
+		 */
+		suspended = 0;
+		TRACE_CUR("acquired PI lock %p, no contention\n", sem);
+		sem->holder  = tsk;
+		sem->hp.task = tsk;
+		litmus->inherit_priority(sem, tsk);
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+	return suspended;
+}
+
+static void do_fmlp_up(struct pi_semaphore* sem)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	TRACE_CUR("releases PI lock %p\n", sem);
+	litmus->return_priority(sem);
+	sem->holder = NULL;
+	if (atomic_inc_return(&sem->count) < 1)
+		/* there is a task queued */
+		wake_up_locked(&sem->wait);
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+}
+
+asmlinkage long sys_fmlp_down(int sem_od)
+{
+	long ret = 0;
+	struct pi_semaphore * sem;
+	int suspended = 0;
+
+	preempt_disable();
+	TS_PI_DOWN_START;
+
+	sem = lookup_fmlp_sem(sem_od);
+	if (sem)
+		suspended = do_fmlp_down(sem);
+	else
+		ret = -EINVAL;
+
+	if (!suspended) {
+		TS_PI_DOWN_END;
+		preempt_enable();
+	}
+
+	return ret;
+}
+
+asmlinkage long sys_fmlp_up(int sem_od)
+{
+	long ret = 0;
+	struct pi_semaphore * sem;
+
+	preempt_disable();
+	TS_PI_UP_START;
+
+	sem = lookup_fmlp_sem(sem_od);
+	if (sem)
+		do_fmlp_up(sem);
+	else
+		ret = -EINVAL;
+
+
+	TS_PI_UP_END;
+	preempt_enable();
+
+	return ret;
+}
+
+#else
+
+struct fdso_ops fmlp_sem_ops = {};
+
+asmlinkage long sys_fmlp_down(int sem_od)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_fmlp_up(int sem_od)
+{
+	return -ENOSYS;
+}
+
+#endif
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 0000000..6084b6d
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+#ifndef __ARCH_HAS_FEATHER_TRACE
+/* provide dummy implementation */
+
+int ft_events[MAX_EVENTS];
+
+int ft_enable_event(unsigned long id)
+{
+	if (id < MAX_EVENTS) {
+		ft_events[id]++;
+		return 1;
+	} else
+		return 0;
+}
+
+int ft_disable_event(unsigned long id)
+{
+	if (id < MAX_EVENTS && ft_events[id]) {
+		ft_events[id]--;
+		return 1;
+	} else
+		return 0;
+}
+
+int ft_disable_all_events(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_EVENTS; i++)
+		ft_events[i] = 0;
+
+	return MAX_EVENTS;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+	return 	id < MAX_EVENTS && ft_events[id];
+}
+
+#endif
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 0000000..e294bc5
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,43 @@
+/* litmus/jobs.c - common job control code
+ */
+
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+
+void prepare_for_next_period(struct task_struct *t)
+{
+	BUG_ON(!t);
+	/* prepare next release */
+	t->rt_param.job_params.release   = t->rt_param.job_params.deadline;
+	t->rt_param.job_params.deadline += get_rt_period(t);
+	t->rt_param.job_params.exec_time = 0;
+	/* update job sequence number */
+	t->rt_param.job_params.job_no++;
+
+	/* don't confuse Linux */
+	t->time_slice = 1;
+}
+
+void release_at(struct task_struct *t, lt_t start)
+{
+	t->rt_param.job_params.deadline = start;
+	prepare_for_next_period(t);
+	set_rt_flags(t, RT_F_RUNNING);
+}
+
+
+/*
+ *	Deactivate current task until the beginning of the next period.
+ */
+long complete_job(void)
+{
+	/* Mark that we do not excute anymore */
+	set_rt_flags(current, RT_F_SLEEP);
+	/* call schedule, this will return when a new job arrives
+	 * it also takes care of preparing for the next release
+	 */
+	schedule();
+	return 0;
+}
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 0000000..979985e
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,826 @@
+/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization code,
+ *             and the procfs interface..
+ */
+#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/sysrq.h>
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <linux/sched.h>
+#include <litmus/sched_plugin.h>
+
+#include <litmus/heap.h>
+
+#include <litmus/trace.h>
+
+/* Number of RT tasks that exist in the system */
+atomic_t rt_task_count 		= ATOMIC_INIT(0);
+static DEFINE_SPINLOCK(task_transition_lock);
+
+/* Give log messages sequential IDs. */
+atomic_t __log_seq_no = ATOMIC_INIT(0);
+
+/* To send signals from the scheduler
+ * Must drop locks first.
+ */
+static LIST_HEAD(sched_sig_list);
+static DEFINE_SPINLOCK(sched_sig_list_lock);
+
+static struct kmem_cache * heap_node_cache;
+
+/*
+ * sys_set_task_rt_param
+ * @pid: Pid of the task which scheduling parameters must be changed
+ * @param: New real-time extension parameters such as the execution cost and
+ *         period
+ * Syscall for manipulating with task rt extension params
+ * Returns EFAULT  if param is NULL.
+ *         ESRCH   if pid is not corrsponding
+ *	           to a valid task.
+ *	   EINVAL  if either period or execution cost is <=0
+ *	   EPERM   if pid is a real-time task
+ *	   0       if success
+ *
+ * Only non-real-time tasks may be configured with this system call
+ * to avoid races with the scheduler. In practice, this means that a
+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
+ */
+asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+	struct rt_task tp;
+	struct task_struct *target;
+	int retval = -EINVAL;
+
+	printk("Setting up rt task parameters for process %d.\n", pid);
+
+	if (pid < 0 || param == 0) {
+		goto out;
+	}
+	if (copy_from_user(&tp, param, sizeof(tp))) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/*      Task search and manipulation must be protected */
+	read_lock_irq(&tasklist_lock);
+	if (!(target = find_task_by_pid(pid))) {
+		retval = -ESRCH;
+		goto out_unlock;
+	}
+
+	if (is_realtime(target)) {
+		/* The task is already a real-time task.
+		 * We cannot not allow parameter changes at this point.
+		 */
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+
+	if (tp.exec_cost <= 0)
+		goto out_unlock;
+	if (tp.period <= 0)
+		goto out_unlock;
+	if (!cpu_online(tp.cpu))
+		goto out_unlock;
+	if (tp.period < tp.exec_cost)
+	{
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+		       "because wcet > period\n", pid);
+		goto out_unlock;
+	}
+
+	target->rt_param.task_params = tp;
+
+	retval = 0;
+      out_unlock:
+	read_unlock_irq(&tasklist_lock);
+      out:
+	return retval;
+}
+
+/*	Getter of task's RT params
+ *	returns EINVAL if param or pid is NULL
+ *	returns ESRCH  if pid does not correspond to a valid task
+ *	returns EFAULT if copying of parameters has failed.
+ */
+asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+	int retval = -EINVAL;
+	struct task_struct *source;
+	struct rt_task lp;
+	if (param == 0 || pid < 0)
+		goto out;
+	read_lock(&tasklist_lock);
+	if (!(source = find_task_by_pid(pid))) {
+		retval = -ESRCH;
+		goto out_unlock;
+	}
+	lp = source->rt_param.task_params;
+	read_unlock(&tasklist_lock);
+	/* Do copying outside the lock */
+	retval =
+	    copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
+	return retval;
+      out_unlock:
+	read_unlock(&tasklist_lock);
+      out:
+	return retval;
+
+}
+
+/*
+ *	This is the crucial function for periodic task implementation,
+ *	It checks if a task is periodic, checks if such kind of sleep
+ *	is permitted and calls plugin-specific sleep, which puts the
+ *	task into a wait array.
+ *	returns 0 on successful wakeup
+ *	returns EPERM if current conditions do not permit such sleep
+ *	returns EINVAL if current task is not able to go to sleep
+ */
+asmlinkage long sys_complete_job(void)
+{
+	int retval = -EPERM;
+	if (!is_realtime(current)) {
+		retval = -EINVAL;
+		goto out;
+	}
+	/* Task with negative or zero period cannot sleep */
+	if (get_rt_period(current) <= 0) {
+		retval = -EINVAL;
+		goto out;
+	}
+	/* The plugin has to put the task into an
+	 * appropriate queue and call schedule
+	 */
+	retval = litmus->complete_job();
+      out:
+	return retval;
+}
+
+/*	This is an "improved" version of sys_complete_job that
+ *      addresses the problem of unintentionally missing a job after
+ *      an overrun.
+ *
+ *	returns 0 on successful wakeup
+ *	returns EPERM if current conditions do not permit such sleep
+ *	returns EINVAL if current task is not able to go to sleep
+ */
+asmlinkage long sys_wait_for_job_release(unsigned int job)
+{
+	int retval = -EPERM;
+	if (!is_realtime(current)) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	/* Task with negative or zero period cannot sleep */
+	if (get_rt_period(current) <= 0) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	retval = 0;
+
+	/* first wait until we have "reached" the desired job
+	 *
+	 * This implementation has at least two problems:
+	 *
+	 * 1) It doesn't gracefully handle the wrap around of
+	 *    job_no. Since LITMUS is a prototype, this is not much
+	 *    of a problem right now.
+	 *
+	 * 2) It is theoretically racy if a job release occurs
+	 *    between checking job_no and calling sleep_next_period().
+	 *    A proper solution would requiring adding another callback
+	 *    in the plugin structure and testing the condition with
+	 *    interrupts disabled.
+	 *
+	 * FIXME: At least problem 2 should be taken care of eventually.
+	 */
+	while (!retval && job > current->rt_param.job_params.job_no)
+		/* If the last job overran then job <= job_no and we
+		 * don't send the task to sleep.
+		 */
+		retval = litmus->complete_job();
+      out:
+	return retval;
+}
+
+/*	This is a helper syscall to query the current job sequence number.
+ *
+ *	returns 0 on successful query
+ *	returns EPERM if task is not a real-time task.
+ *      returns EFAULT if &job is not a valid pointer.
+ */
+asmlinkage long sys_query_job_no(unsigned int __user *job)
+{
+	int retval = -EPERM;
+	if (is_realtime(current))
+		retval = put_user(current->rt_param.job_params.job_no, job);
+
+	return retval;
+}
+
+struct sched_sig {
+	struct list_head 	list;
+	struct task_struct*	task;
+	unsigned int		signal:31;
+	int			force:1;
+};
+
+static void __scheduler_signal(struct task_struct *t, unsigned int signo,
+			       int force)
+{
+	struct sched_sig* sig;
+
+	sig = kmalloc(GFP_ATOMIC, sizeof(struct sched_sig));
+	if (!sig) {
+		TRACE_TASK(t, "dropping signal: %u\n", t);
+		return;
+	}
+
+	spin_lock(&sched_sig_list_lock);
+
+	sig->signal = signo;
+	sig->force  = force;
+	sig->task   = t;
+	get_task_struct(t);
+	list_add(&sig->list, &sched_sig_list);
+
+	spin_unlock(&sched_sig_list_lock);
+}
+
+void scheduler_signal(struct task_struct *t, unsigned int signo)
+{
+	__scheduler_signal(t, signo, 0);
+}
+
+void force_scheduler_signal(struct task_struct *t, unsigned int signo)
+{
+	__scheduler_signal(t, signo, 1);
+}
+
+/* FIXME: get rid of the locking and do this on a per-processor basis */
+void send_scheduler_signals(void)
+{
+	unsigned long flags;
+	struct list_head *p, *extra;
+	struct siginfo info;
+	struct sched_sig* sig;
+	struct task_struct* t;
+	struct list_head claimed;
+
+	if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
+		if (list_empty(&sched_sig_list))
+			p = NULL;
+		else {
+			p = sched_sig_list.next;
+			list_del(&sched_sig_list);
+			INIT_LIST_HEAD(&sched_sig_list);
+		}
+		spin_unlock_irqrestore(&sched_sig_list_lock, flags);
+
+		/* abort if there are no signals */
+		if (!p)
+			return;
+
+		/* take signal list we just obtained */
+		list_add(&claimed, p);
+
+		list_for_each_safe(p, extra, &claimed) {
+			list_del(p);
+			sig = list_entry(p, struct sched_sig, list);
+			t = sig->task;
+			info.si_signo = sig->signal;
+			info.si_errno = 0;
+			info.si_code  = SI_KERNEL;
+			info.si_pid   = 1;
+			info.si_uid   = 0;
+			TRACE("sending signal %d to %d\n", info.si_signo,
+			      t->pid);
+			if (sig->force)
+				force_sig_info(sig->signal, &info, t);
+			else
+				send_sig_info(sig->signal, &info, t);
+			put_task_struct(t);
+			kfree(sig);
+		}
+	}
+
+}
+
+#ifdef CONFIG_NP_SECTION
+
+static inline void np_mem_error(struct task_struct* t, const char* reason)
+{
+	if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
+		TRACE("np section: %s => %s/%d killed\n",
+		      reason, t->comm, t->pid);
+		force_scheduler_signal(t, SIGKILL);
+	}
+}
+
+/*	sys_register_np_flag() allows real-time tasks to register an
+ *	np section indicator.
+ *	returns 0      if the flag was successfully registered
+ *	returns EINVAL if current task is not a real-time task
+ *	returns EFAULT if *flag couldn't be written
+ */
+asmlinkage long sys_register_np_flag(short __user *flag)
+{
+	int retval = -EINVAL;
+	short test_val = RT_PREEMPTIVE;
+
+	/* avoid races with the scheduler */
+	preempt_disable();
+	TRACE("reg_np_flag(%p) for %s/%d\n", flag,
+	      current->comm, current->pid);
+
+	/* Let's first try to write to the address.
+	 * That way it is initialized and any bugs
+	 * involving dangling pointers will caught
+	 * early.
+	 * NULL indicates disabling np section support
+	 * and should not be tested.
+	 */
+	if (flag)
+	  retval = poke_kernel_address(test_val, flag);
+	else
+	  retval = 0;
+	TRACE("reg_np_flag: retval=%d\n", retval);
+	if (unlikely(0 != retval))
+		np_mem_error(current, "np flag: not writable");
+	else
+	  /* the pointer is ok */
+	  current->rt_param.np_flag = flag;
+
+	preempt_enable();
+	return retval;
+}
+
+
+void request_exit_np(struct task_struct *t)
+{
+	int ret;
+	short flag;
+
+	/* We can only do this if t is actually currently scheduled on this CPU
+	 * because otherwise we are in the wrong address space. Thus make sure
+	 * to check.
+	 */
+        BUG_ON(t != current);
+
+	if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
+		TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
+		return;
+	}
+
+	flag = RT_EXIT_NP_REQUESTED;
+	ret  = poke_kernel_address(flag, t->rt_param.np_flag + 1);
+	TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
+	if (unlikely(0 != ret))
+		np_mem_error(current, "request_exit_np(): flag not writable");
+
+}
+
+
+int is_np(struct task_struct* t)
+{
+	int ret;
+	unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/
+
+        BUG_ON(t != current);
+
+	if (unlikely(t->rt_param.kernel_np))
+		return 1;
+	else if (unlikely(t->rt_param.np_flag == NULL) ||
+		 t->flags & PF_EXITING ||
+		 t->state == TASK_DEAD)
+		return 0;
+	else {
+		/* This is the tricky part. The process has registered a
+		 * non-preemptive section marker. We now need to check whether
+		 * it is set to to NON_PREEMPTIVE. Along the way we could
+		 * discover that the pointer points to an unmapped region (=>
+		 * kill the task) or that the location contains some garbage
+		 * value (=> also kill the task). Killing the task in any case
+		 * forces userspace to play nicely. Any bugs will be discovered
+		 * immediately.
+		 */
+		ret = probe_kernel_address(t->rt_param.np_flag, flag);
+		if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
+				 flag == RT_PREEMPTIVE))
+		return flag != RT_PREEMPTIVE;
+		else {
+			/* either we could not read from the address or
+			 * it contained garbage => kill the process
+			 * FIXME: Should we cause a SEGFAULT instead?
+			 */
+			TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
+			      flag & 0xff, (flag >> 8) & 0xff, flag);
+			np_mem_error(t, "is_np() could not read");
+			return 0;
+		}
+	}
+}
+
+/*
+ *	sys_exit_np() allows real-time tasks to signal that it left a
+ *      non-preemptable section. It will be called after the kernel requested a
+ *      callback in the preemption indicator flag.
+ *	returns 0      if the signal was valid and processed.
+ *	returns EINVAL if current task is not a real-time task
+ */
+asmlinkage long sys_exit_np(void)
+{
+	int retval = -EINVAL;
+
+	TS_EXIT_NP_START;
+
+	if (!is_realtime(current))
+		goto out;
+
+	TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
+	/* force rescheduling so that we can be preempted */
+	set_tsk_need_resched(current);
+	retval = 0;
+      out:
+
+	TS_EXIT_NP_END;
+	return retval;
+}
+
+#else /* !CONFIG_NP_SECTION */
+
+asmlinkage long sys_register_np_flag(short __user *flag)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_exit_np(void)
+{
+	return -ENOSYS;
+}
+
+#endif /* CONFIG_NP_SECTION */
+
+
+/* p is a real-time task. Re-init its state as a best-effort task. */
+static void reinit_litmus_state(struct task_struct* p, int restore)
+{
+	struct rt_task  user_config = {};
+	__user short   *np_flag     = NULL;
+
+	if (restore) {
+		/* Safe user-space provided configuration data. */
+		user_config = p->rt_param.task_params;
+		np_flag     = p->rt_param.np_flag;
+	}
+
+	/* We probably should not be inheriting any task's priority
+	 * at this point in time.
+	 */
+	WARN_ON(p->rt_param.inh_task);
+
+	/* We need to restore the priority of the task. */
+//	__setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
+
+	/* Cleanup everything else. */
+	memset(&p->rt_param, 0, sizeof(struct rt_task));
+
+	/* Restore preserved fields. */
+	if (restore) {
+		p->rt_param.task_params = user_config;
+		p->rt_param.np_flag      = np_flag;
+	}
+}
+
+long litmus_admit_task(struct task_struct* tsk)
+{
+	long retval;
+	long flags;
+
+	BUG_ON(is_realtime(tsk));
+
+	if (get_rt_period(tsk) == 0 ||
+	    get_exec_cost(tsk) > get_rt_period(tsk)) {
+		TRACE_TASK(tsk, "litmus admit: invalid task parameters "
+			   "(%lu, %lu)\n",
+		       get_exec_cost(tsk), get_rt_period(tsk));
+		return -EINVAL;
+	}
+
+	if (!cpu_online(get_partition(tsk)))
+	{
+		TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
+			   get_partition(tsk));
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&tsk_rt(tsk)->list);
+
+	/* avoid scheduler plugin changing underneath us */
+	spin_lock_irqsave(&task_transition_lock, flags);
+	retval = litmus->admit_task(tsk);
+
+	/* allocate heap node for this task */
+	tsk_rt(tsk)->heap_node = kmem_cache_alloc(heap_node_cache, GFP_ATOMIC);
+	if (!tsk_rt(tsk)->heap_node)
+		retval = -ENOMEM;
+	else
+		heap_node_init(&tsk_rt(tsk)->heap_node, tsk);
+
+	if (!retval)
+		atomic_inc(&rt_task_count);
+
+	spin_unlock_irqrestore(&task_transition_lock, flags);
+
+	return retval;
+
+}
+
+void litmus_exit_task(struct task_struct* tsk)
+{
+	if (is_realtime(tsk)) {
+		litmus->task_exit(tsk);
+		BUG_ON(heap_node_in_heap(tsk_rt(tsk)->heap_node));
+		kmem_cache_free(heap_node_cache, tsk_rt(tsk)->heap_node);
+		atomic_dec(&rt_task_count);
+		reinit_litmus_state(tsk, 1);
+	}
+}
+
+/* Switching a plugin in use is tricky.
+ * We must watch out that no real-time tasks exists
+ * (and that none is created in parallel) and that the plugin is not
+ * currently in use on any processor (in theory).
+ *
+ * For now, we don't enforce the second part since it is unlikely to cause
+ * any trouble by itself as long as we don't unload modules.
+ */
+int switch_sched_plugin(struct sched_plugin* plugin)
+{
+	long flags;
+	int ret = 0;
+
+	BUG_ON(!plugin);
+
+	/* stop task transitions */
+	spin_lock_irqsave(&task_transition_lock, flags);
+
+	/* don't switch if there are active real-time tasks */
+	if (atomic_read(&rt_task_count) == 0) {
+		printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
+		litmus = plugin;
+	} else
+		ret = -EBUSY;
+
+	spin_unlock_irqrestore(&task_transition_lock, flags);
+	return ret;
+}
+
+/* Called upon fork.
+ * p is the newly forked task.
+ */
+void litmus_fork(struct task_struct* p)
+{
+	if (is_realtime(p))
+		/* clean out any litmus related state, don't preserve anything*/
+		reinit_litmus_state(p, 0);
+}
+
+/* Called upon execve().
+ * current is doing the exec.
+ * Don't let address space specific stuff leak.
+ */
+void litmus_exec(void)
+{
+	struct task_struct* p = current;
+
+	if (is_realtime(p)) {
+		WARN_ON(p->rt_param.inh_task);
+		p->rt_param.np_flag = NULL;
+	}
+}
+
+void exit_litmus(struct task_struct *dead_tsk)
+{
+	if (is_realtime(dead_tsk))
+		litmus_exit_task(dead_tsk);
+}
+
+
+void list_qsort(struct list_head* list, list_cmp_t less_than)
+{
+	struct list_head lt;
+	struct list_head geq;
+	struct list_head *pos, *extra, *pivot;
+	int n_lt = 0, n_geq = 0;
+	BUG_ON(!list);
+
+	if (list->next == list)
+		return;
+
+	INIT_LIST_HEAD(&lt);
+	INIT_LIST_HEAD(&geq);
+
+	pivot = list->next;
+	list_del(pivot);
+	list_for_each_safe(pos, extra, list) {
+		list_del(pos);
+		if (less_than(pos, pivot)) {
+			list_add(pos, &lt);
+			n_lt++;
+		} else {
+			list_add(pos, &geq);
+			n_geq++;
+		}
+	}
+	if (n_lt < n_geq) {
+		list_qsort(&lt, less_than);
+		list_qsort(&geq, less_than);
+	} else {
+		list_qsort(&geq, less_than);
+		list_qsort(&lt, less_than);
+	}
+	list_splice(&geq, list);
+	list_add(pivot, list);
+	list_splice(&lt, list);
+}
+
+#ifdef CONFIG_MAGIC_SYSRQ
+int sys_kill(int pid, int sig);
+
+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
+{
+	struct task_struct *t;
+	read_lock(&tasklist_lock);
+	for_each_process(t) {
+		if (is_realtime(t)) {
+			sys_kill(t->pid, SIGKILL);
+		}
+	}
+	read_unlock(&tasklist_lock);
+}
+
+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
+	.handler	= sysrq_handle_kill_rt_tasks,
+	.help_msg	= "Quit-rt-tasks",
+	.action_msg	= "sent SIGKILL to all real-time tasks",
+};
+#endif
+
+static int proc_read_stats(char *page, char **start,
+			   off_t off, int count,
+			   int *eof, void *data)
+{
+	int len;
+
+	len = snprintf(page, PAGE_SIZE,
+		       "real-time task count = %d\n",
+		       atomic_read(&rt_task_count));
+	return len;
+}
+
+static int proc_read_plugins(char *page, char **start,
+			   off_t off, int count,
+			   int *eof, void *data)
+{
+	int len;
+
+	len = print_sched_plugins(page, PAGE_SIZE);
+	return len;
+}
+
+static int proc_read_curr(char *page, char **start,
+			  off_t off, int count,
+			  int *eof, void *data)
+{
+	int len;
+
+	len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
+	return len;
+}
+
+static int proc_write_curr(struct file *file,
+			   const char *buffer,
+			   unsigned long count,
+			   void *data)
+{
+	int len, ret;
+	char name[65];
+	struct sched_plugin* found;
+
+	if(count > 64)
+		len = 64;
+	else
+		len = count;
+
+	if(copy_from_user(name, buffer, len))
+		return -EFAULT;
+
+	name[len] = '\0';
+	/* chomp name */
+	if (len > 1 && name[len - 1] == '\n')
+		name[len - 1] = '\0';
+
+	found = find_sched_plugin(name);
+
+	if (found) {
+		ret = switch_sched_plugin(found);
+		if (ret != 0)
+			printk(KERN_INFO "Could not switch plugin: %d\n", ret);
+	} else
+		printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
+
+	return len;
+}
+
+
+static struct proc_dir_entry *litmus_dir = NULL,
+	*curr_file = NULL,
+	*stat_file = NULL,
+	*plugs_file = NULL;
+
+static int __init init_litmus_proc(void)
+{
+	litmus_dir = proc_mkdir("litmus", NULL);
+	if (!litmus_dir) {
+		printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
+		return -ENOMEM;
+	}
+	litmus_dir->owner = THIS_MODULE;
+
+	curr_file = create_proc_entry("active_plugin",
+				      0644, litmus_dir);
+	if (!curr_file) {
+		printk(KERN_ERR "Could not allocate active_plugin "
+		       "procfs entry.\n");
+		return -ENOMEM;
+	}
+	curr_file->owner = THIS_MODULE;
+	curr_file->read_proc  = proc_read_curr;
+	curr_file->write_proc = proc_write_curr;
+
+	stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
+					   proc_read_stats, NULL);
+
+	plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
+					   proc_read_plugins, NULL);
+
+	return 0;
+}
+
+static void exit_litmus_proc(void)
+{
+	if (plugs_file)
+		remove_proc_entry("plugins", litmus_dir);
+	if (stat_file)
+		remove_proc_entry("stats", litmus_dir);
+	if (curr_file)
+		remove_proc_entry("active_plugin", litmus_dir);
+	if (litmus_dir)
+		remove_proc_entry("litmus", NULL);
+}
+
+extern struct sched_plugin linux_sched_plugin;
+
+static int __init _init_litmus(void)
+{
+	/*      Common initializers,
+	 *      mode change lock is used to enforce single mode change
+	 *      operation.
+	 */
+	printk("Starting LITMUS^RT kernel\n");
+
+	register_sched_plugin(&linux_sched_plugin);
+
+	heap_node_cache = KMEM_CACHE(heap_node, 0);
+	if (!heap_node_cache)
+		return -ENOMEM;
+
+#ifdef CONFIG_MAGIC_SYSRQ
+	/* offer some debugging help */
+	if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
+		printk("Registered kill rt tasks magic sysrq.\n");
+	else
+		printk("Could not register kill rt tasks magic sysrq.\n");
+#endif
+
+	init_litmus_proc();
+
+	return 0;
+}
+
+static void _exit_litmus(void)
+{
+	exit_litmus_proc();
+	kmem_cache_destroy(heap_node_cache);
+}
+
+module_init(_init_litmus);
+module_exit(_exit_litmus);
diff --git a/litmus/norqlock.c b/litmus/norqlock.c
new file mode 100644
index 0000000..11f85d3
--- /dev/null
+++ b/litmus/norqlock.c
@@ -0,0 +1,56 @@
+#include <linux/list.h>
+#include <linux/bitops.h>
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+
+#include <litmus/norqlock.h>
+
+struct worklist {
+	struct no_rqlock_work* next;
+};
+
+static DEFINE_PER_CPU(struct worklist, norq_worklist) = {NULL};
+
+void init_no_rqlock_work(struct no_rqlock_work* w, work_t work,
+			 unsigned long arg)
+{
+	w->active = 0;
+	w->work   = work;
+	w->arg    = arg;
+	w->next   = NULL;
+}
+
+void __do_without_rqlock(struct no_rqlock_work *work)
+{
+	long flags;
+	struct worklist* wl;
+
+	local_irq_save(flags);
+	wl = &__get_cpu_var(norq_worklist);
+	work->next = wl->next;
+	wl->next   = work;
+	local_irq_restore(flags);
+}
+
+void tick_no_rqlock(void)
+{
+	long flags;
+	struct no_rqlock_work *todo, *next;
+
+	local_irq_save(flags);
+
+	next = __get_cpu_var(norq_worklist).next;
+	__get_cpu_var(norq_worklist).next = NULL;
+
+	while (next) {
+		todo = next;
+		next = next->next;
+		todo->next = NULL;
+		smp_mb__before_clear_bit();
+		clear_bit(0, (void*) &todo->active);
+		todo->work(todo->arg);
+	}
+
+	local_irq_restore(flags);
+}
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 0000000..2880308
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,138 @@
+/*
+ * kernel/rt_domain.c
+ *
+ * LITMUS real-time infrastructure. This file contains the
+ * functions that manipulate RT domains. RT domains are an abstraction
+ * of a ready queue and a release queue.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/rt_domain.h>
+
+#include <litmus/trace.h>
+
+static int dummy_resched(rt_domain_t *rt)
+{
+	return 0;
+}
+
+static int dummy_order(struct heap_node* a, struct heap_node* b)
+{
+	return 0;
+}
+
+/* default implementation: use default lock */
+static void default_release_job(struct task_struct* t, rt_domain_t* rt)
+{
+	add_ready(rt, t);
+}
+
+static enum hrtimer_restart release_job_timer(struct hrtimer *timer)
+{
+	struct task_struct *t;
+
+	TS_RELEASE_START;
+
+	t = container_of(timer, struct task_struct, 
+			 rt_param.release_timer);
+
+	get_domain(t)->release_job(t, get_domain(t));
+
+	TS_RELEASE_END;
+
+	return HRTIMER_NORESTART;
+}
+
+static void setup_job_release_timer(struct task_struct *task)
+{
+        hrtimer_init(&release_timer(task), CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+        release_timer(task).function = release_job_timer;
+#ifdef CONFIG_HIGH_RES_TIMERS
+        release_timer(task).cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+#endif
+        /* Expiration time of timer is release time of task. */
+	release_timer(task).expires = ns_to_ktime(get_release(task));
+
+	TRACE_TASK(task, "arming release timer rel=%llu at %llu\n",
+		   get_release(task), litmus_clock());
+
+	hrtimer_start(&release_timer(task), release_timer(task).expires,
+		      HRTIMER_MODE_ABS);
+}
+
+static void arm_release_timers(unsigned long _rt)
+{
+	rt_domain_t *rt = (rt_domain_t*) _rt;
+	unsigned long flags;
+	struct list_head alt;
+	struct list_head *pos, *safe;
+	struct task_struct* t;
+
+	spin_lock_irqsave(&rt->release_lock, flags);
+	list_replace_init(&rt->release_queue, &alt);
+	spin_unlock_irqrestore(&rt->release_lock, flags);
+	
+	list_for_each_safe(pos, safe, &alt) {
+		t = list_entry(pos, struct task_struct, rt_param.list);
+		list_del(pos);
+		setup_job_release_timer(t);
+	}
+}
+
+
+void rt_domain_init(rt_domain_t *rt,
+		    heap_prio_t order,
+		    check_resched_needed_t check,
+		    release_job_t release
+		   )
+{
+	BUG_ON(!rt);
+	if (!check)
+		check = dummy_resched;
+	if (!release)
+		release = default_release_job;
+	if (!order)
+		order = dummy_order;
+	heap_init(&rt->ready_queue);
+	INIT_LIST_HEAD(&rt->release_queue);
+	spin_lock_init(&rt->ready_lock);
+	spin_lock_init(&rt->release_lock);
+	rt->check_resched 	= check;
+	rt->release_job		= release;
+	rt->order		= order;
+	init_no_rqlock_work(&rt->arm_timers, arm_release_timers, (unsigned long) rt);
+}
+
+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
+ * @new:      the newly released task
+ */
+void __add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+	TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n",
+	      new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
+	      get_release(new), litmus_clock());
+
+	BUG_ON(heap_node_in_heap(tsk_rt(new)->heap_node));
+
+	heap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
+	rt->check_resched(rt);
+}
+
+/* add_release - add a real-time task to the rt release queue.
+ * @task:        the sleeping task
+ */
+void __add_release(rt_domain_t* rt, struct task_struct *task)
+{
+	TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
+	list_add(&tsk_rt(task)->list, &rt->release_queue);
+	task->rt_param.domain = rt;
+	do_without_rqlock(&rt->arm_timers);
+}
+
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100755
index 0000000..2ac14cd
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,717 @@
+/*
+ * kernel/sched_cedf.c
+ *
+ * Implementation of the Clustered EDF (C-EDF) scheduling algorithm.
+ * Linking is included so that support for synchronization (e.g., through
+ * the implementation of a "CSN-EDF" algorithm) can be added later if desired.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+
+#include <linux/module.h>
+
+/* Overview of C-EDF operations.
+ *
+ * link_task_to_cpu(T, cpu) 	- Low-level operation to update the linkage
+ *                                structure (NOT the actually scheduled
+ *                                task). If there is another linked task To
+ *                                already it will set To->linked_on = NO_CPU
+ *                                (thereby removing its association with this
+ *                                CPU). However, it will not requeue the
+ *                                previously linked task (if any). It will set
+ *                                T's state to RT_F_RUNNING and check whether
+ *                                it is already running somewhere else. If T
+ *                                is scheduled somewhere else it will link
+ *                                it to that CPU instead (and pull the linked
+ *                                task to cpu). T may be NULL.
+ *
+ * unlink(T)			- Unlink removes T from all scheduler data
+ *                                structures. If it is linked to some CPU it
+ *                                will link NULL to that CPU. If it is
+ *                                currently queued in the cedf queue for
+ *                                a partition, it will be removed from
+ *                                the rt_domain. It is safe to call
+ *                                unlink(T) if T is not linked. T may not
+ *                                be NULL.
+ *
+ * requeue(T)			- Requeue will insert T into the appropriate
+ *                                queue. If the system is in real-time mode and
+ *                                the T is released already, it will go into the
+ *                                ready queue. If the system is not in
+ *                                real-time mode is T, then T will go into the
+ *                                release queue. If T's release time is in the
+ *                                future, it will go into the release
+ *                                queue. That means that T's release time/job
+ *                                no/etc. has to be updated before requeue(T) is
+ *                                called. It is not safe to call requeue(T)
+ *                                when T is already queued. T may not be NULL.
+ *
+ * cedf_job_arrival(T)		- This is the catch-all function when T enters
+ *                                the system after either a suspension or at a
+ *                                job release. It will queue T (which means it
+ *                                is not safe to call cedf_job_arrival(T) if
+ *                                T is already queued) and then check whether a
+ *                                preemption is necessary. If a preemption is
+ *                                necessary it will update the linkage
+ *                                accordingly and cause scheduled to be called
+ *                                (either with an IPI or need_resched). It is
+ *                                safe to call cedf_job_arrival(T) if T's
+ *                                next job has not been actually released yet
+ *                                (release time in the future). T will be put
+ *                                on the release queue in that case.
+ *
+ * job_completion(T)		- Take care of everything that needs to be done
+ *                                to prepare T for its next release and place
+ *                                it in the right queue with
+ *                                cedf_job_arrival().
+ *
+ *
+ * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
+ * the functions will automatically propagate pending task from the ready queue
+ * to a linked task. This is the job of the calling function ( by means of
+ * __take_ready).
+ */
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ */
+typedef struct  {
+	int 			cpu;
+	struct task_struct*	linked;		/* only RT tasks */
+	struct task_struct*	scheduled;	/* only RT tasks */
+	struct list_head	list;
+	atomic_t		will_schedule;	/* prevent unneeded IPIs */
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
+
+cpu_entry_t* cedf_cpu_entries_array[NR_CPUS];
+
+#define set_will_schedule() \
+	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+	(atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
+
+#define NO_CPU 0xffffffff
+
+/* Cluster size -- currently four. This is a variable to allow for
+ * the possibility of changing the cluster size online in the future.
+ */
+int cluster_size = 4;
+
+typedef struct {
+	rt_domain_t 		domain;
+	int          		first_cpu;
+	int			last_cpu;
+
+	/* the cpus queue themselves according to priority in here */
+	struct list_head	cedf_cpu_queue;
+
+	/* per-partition spinlock: protects the domain and
+	 * serializes scheduling decisions
+	 */
+#define slock domain.ready_lock
+} cedf_domain_t;
+
+DEFINE_PER_CPU(cedf_domain_t*, cedf_domains) = NULL;
+
+cedf_domain_t* cedf_domains_array[NR_CPUS];
+
+
+/* These are defined similarly to partitioning, except that a
+ * tasks partition is any cpu of the cluster to which it
+ * is assigned, typically the lowest-numbered cpu.
+ */
+#define local_edf		(&__get_cpu_var(cedf_domains)->domain)
+#define local_cedf		__get_cpu_var(cedf_domains)
+#define remote_edf(cpu)		(&per_cpu(cedf_domains, cpu)->domain)
+#define remote_cedf(cpu)	per_cpu(cedf_domains, cpu)
+#define task_edf(task)		remote_edf(get_partition(task))
+#define task_cedf(task)		remote_cedf(get_partition(task))
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ *                       order in the cpu queue. Caller must hold cedf lock.
+ *
+ *						This really should be a heap.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+	cpu_entry_t *other;
+	struct list_head *cedf_cpu_queue =
+		&(remote_cedf(entry->cpu))->cedf_cpu_queue;
+	struct list_head *pos;
+
+	BUG_ON(!cedf_cpu_queue);
+
+	if (likely(in_list(&entry->list)))
+		list_del(&entry->list);
+	/* if we do not execute real-time jobs we just move
+	 * to the end of the queue
+	 */
+	if (entry->linked) {
+		list_for_each(pos, cedf_cpu_queue) {
+			other = list_entry(pos, cpu_entry_t, list);
+			if (edf_higher_prio(entry->linked, other->linked)) {
+				__list_add(&entry->list, pos->prev, pos);
+				return;
+			}
+		}
+	}
+	/* if we get this far we have the lowest priority job */
+	list_add_tail(&entry->list, cedf_cpu_queue);
+}
+
+/* link_task_to_cpu - Update the link of a CPU.
+ *                    Handles the case where the to-be-linked task is already
+ *                    scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+				      cpu_entry_t *entry)
+{
+	cpu_entry_t *sched;
+	struct task_struct* tmp;
+	int on_cpu;
+
+	BUG_ON(linked && !is_realtime(linked));
+
+	/* Cannot link task to a CPU that doesn't belong to its partition... */
+	BUG_ON(linked && remote_cedf(entry->cpu) != task_cedf(linked));
+
+	/* Currently linked task is set to be unlinked. */
+	if (entry->linked) {
+		entry->linked->rt_param.linked_on = NO_CPU;
+	}
+
+	/* Link new task to CPU. */
+	if (linked) {
+		set_rt_flags(linked, RT_F_RUNNING);
+		/* handle task is already scheduled somewhere! */
+		on_cpu = linked->rt_param.scheduled_on;
+		if (on_cpu != NO_CPU) {
+			sched = &per_cpu(cedf_cpu_entries, on_cpu);
+			/* this should only happen if not linked already */
+			BUG_ON(sched->linked == linked);
+
+			/* If we are already scheduled on the CPU to which we
+			 * wanted to link, we don't need to do the swap --
+			 * we just link ourselves to the CPU and depend on
+			 * the caller to get things right.
+			 */
+			if (entry != sched) {
+				tmp = sched->linked;
+				linked->rt_param.linked_on = sched->cpu;
+				sched->linked = linked;
+				update_cpu_position(sched);
+				linked = tmp;
+			}
+		}
+		if (linked) /* might be NULL due to swap */
+			linked->rt_param.linked_on = entry->cpu;
+	}
+	entry->linked = linked;
+
+	if (entry->linked)
+		TRACE_TASK(entry->linked, "linked to CPU %d, state:%d\n",
+			   entry->cpu, entry->linked->state);
+	else
+		TRACE("NULL linked to CPU %d\n", entry->cpu);
+
+	update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ *          where it was linked before. Must hold cedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+    	cpu_entry_t *entry;
+
+	if (unlikely(!t)) {
+		TRACE_BUG_ON(!t);
+		return;
+	}
+
+	if (t->rt_param.linked_on != NO_CPU) {
+		/* unlink */
+		entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
+		t->rt_param.linked_on = NO_CPU;
+		link_task_to_cpu(NULL, entry);
+	} else if (is_queued(t)) {
+		/* This is an interesting situation: t is scheduled,
+		 * but was just recently unlinked.  It cannot be
+		 * linked anywhere else (because then it would have
+		 * been relinked to this CPU), thus it must be in some
+		 * queue. We must remove it from the list in this
+		 * case.
+		 */
+		remove(task_edf(t), t);
+	}
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static noinline void preempt(cpu_entry_t *entry)
+{
+	/* We cannot make the is_np() decision here if it is a remote CPU
+	 * because requesting exit_np() requires that we currently use the
+	 * address space of the task. Thus, in the remote case we just send
+	 * the IPI and let schedule() handle the problem.
+	 */
+
+	if (smp_processor_id() == entry->cpu) {
+		if (entry->scheduled && is_np(entry->scheduled))
+			request_exit_np(entry->scheduled);
+		else
+			set_tsk_need_resched(current);
+	} else
+		/* in case that it is a remote CPU we have to defer the
+		 * the decision to the remote CPU
+		 * FIXME: We could save a few IPI's here if we leave the flag
+		 * set when we are waiting for a np_exit().
+		 */
+		if (!test_will_schedule(entry->cpu))
+			smp_send_reschedule(entry->cpu);
+}
+
+/* requeue - Put an unlinked task into c-edf domain.
+ *           Caller must hold cedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+	cedf_domain_t* cedf;
+	rt_domain_t* edf;
+
+	BUG_ON(!task);
+	/* sanity check rt_list before insertion */
+	BUG_ON(is_queued(task));
+
+	/* Get correct real-time domain. */
+	cedf = task_cedf(task);
+	edf = &cedf->domain;
+
+	if (get_rt_flags(task) == RT_F_SLEEP) {
+		/* this task has expired
+		 * _schedule has already taken care of updating
+		 * the release and
+		 * deadline. We just must check if it has been released.
+		 */
+		if (is_released(task, litmus_clock()))
+			__add_ready(edf, task);
+		else {
+			/* it has got to wait */
+			add_release(edf, task);
+		}
+
+	} else
+		/* this is a forced preemption
+		 * thus the task stays in the ready_queue
+		 * we only must make it available to others
+		 */
+		__add_ready(edf, task);
+}
+
+/* cedf_job_arrival: task is either resumed or released */
+static noinline void cedf_job_arrival(struct task_struct* task)
+{
+	cpu_entry_t* last;
+	cedf_domain_t* cedf;
+	rt_domain_t* edf;
+	struct list_head *cedf_cpu_queue;
+
+	BUG_ON(!task);
+
+	/* Get correct real-time domain. */
+	cedf = task_cedf(task);
+	edf = &cedf->domain;
+	cedf_cpu_queue = &cedf->cedf_cpu_queue;
+
+	BUG_ON(!cedf);
+	BUG_ON(!edf);
+	BUG_ON(!cedf_cpu_queue);
+	BUG_ON(list_empty(cedf_cpu_queue));
+
+	/* first queue arriving job */
+	requeue(task);
+
+	/* then check for any necessary preemptions */
+	last = list_entry(cedf_cpu_queue->prev, cpu_entry_t, list);
+	if (edf_preemption_needed(edf, last->linked)) {
+		/* preemption necessary */
+		task = __take_ready(edf);
+		TRACE("job_arrival: task %d linked to %d, state:%d\n",
+		      task->pid, last->cpu, task->state);
+		if (last->linked)
+			requeue(last->linked);
+
+		link_task_to_cpu(task, last);
+		preempt(last);
+	}
+}
+
+/* check for current job releases */
+static void cedf_job_release(struct task_struct* t, rt_domain_t* _)
+{
+        cedf_domain_t*          cedf = task_cedf(t);
+	unsigned long 		flags;
+
+	BUG_ON(!t);
+	BUG_ON(!cedf);
+
+	spin_lock_irqsave(&cedf->slock, flags);
+	sched_trace_job_release(queued);
+	cedf_job_arrival(t);
+	spin_unlock_irqrestore(&cedf->slock, flags);
+}
+
+/* cedf_tick - this function is called for every local timer
+ *                         interrupt.
+ *
+ *                   checks whether the current task has expired and checks
+ *                   whether we need to preempt it if it has not expired
+ */
+static void cedf_tick(struct task_struct* t)
+{
+	BUG_ON(!t);
+
+	if (is_realtime(t) && budget_exhausted(t)) {
+		if (!is_np(t)) {
+			/* np tasks will be preempted when they become
+			 * preemptable again
+			 */
+			set_tsk_need_resched(t);
+			set_will_schedule();
+			TRACE("cedf_scheduler_tick: "
+			      "%d is preemptable (state:%d) "
+			      " => FORCE_RESCHED\n", t->pid, t->state);
+		} else {
+			TRACE("cedf_scheduler_tick: "
+			      "%d is non-preemptable (state:%d), "
+			      "preemption delayed.\n", t->pid, t->state);
+			request_exit_np(t);
+		}
+	}
+}
+
+/* caller holds cedf_lock */
+static noinline void job_completion(struct task_struct *t)
+{
+	BUG_ON(!t);
+
+	sched_trace_job_completion(t);
+
+	TRACE_TASK(t, "job_completion(). [state:%d]\n", t->state);
+
+	/* set flags */
+	set_rt_flags(t, RT_F_SLEEP);
+	/* prepare for next period */
+	prepare_for_next_period(t);
+	/* unlink */
+	unlink(t);
+	/* requeue
+	 * But don't requeue a blocking task. */
+	if (is_running(t))
+		cedf_job_arrival(t);
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *	- scheduled->timeslice == 0	// the job completed (forcefully)
+ *	- get_rt_flag() == RT_F_SLEEP	// the job completed (by syscall)
+ * 	- linked != scheduled		// we need to reschedule (for any reason)
+ * 	- is_np(scheduled)		// rescheduling must be delayed,
+ *					   sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* cedf_schedule(struct task_struct * prev)
+{
+	cedf_domain_t* 		cedf = local_cedf;
+	rt_domain_t*		edf  = &cedf->domain;
+	cpu_entry_t* 		entry = &__get_cpu_var(cedf_cpu_entries);
+	int 			out_of_time, sleep, preempt, np,
+				exists, blocks;
+	struct task_struct* 	next = NULL;
+
+	BUG_ON(!prev);
+	BUG_ON(!cedf);
+	BUG_ON(!edf);
+	BUG_ON(!entry);
+	BUG_ON(cedf != remote_cedf(entry->cpu));
+	BUG_ON(is_realtime(prev) && cedf != task_cedf(prev));
+
+	/* Will be released in finish_switch. */
+	spin_lock(&cedf->slock);
+	clear_will_schedule();
+
+	/* sanity checking */
+	BUG_ON(entry->scheduled && entry->scheduled != prev);
+	BUG_ON(entry->scheduled && !is_realtime(prev));
+	BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+	/* (0) Determine state */
+	exists      = entry->scheduled != NULL;
+	blocks      = exists && !is_running(entry->scheduled);
+	out_of_time = exists && budget_exhausted(entry->scheduled);
+	np 	    = exists && is_np(entry->scheduled);
+	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
+	preempt     = entry->scheduled != entry->linked;
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		unlink(entry->scheduled);
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * We need to make sure to update the link structure anyway in case
+	 * that we are still linked. Multiple calls to request_exit_np() don't
+	 * hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep)) {
+		unlink(entry->scheduled);
+		request_exit_np(entry->scheduled);
+	}
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this. Don't do a job completion if blocks (can't have timers
+	 * running for blocked jobs). Preemption go first for the same reason.
+	 */
+	if (!np && (out_of_time || sleep) && !blocks && !preempt)
+		job_completion(entry->scheduled);
+
+	/* Link pending task if we became unlinked.
+	 */
+	if (!entry->linked)
+		link_task_to_cpu(__take_ready(edf), entry);
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * If linked different from scheduled select linked as next.
+	 */
+	if ((!np || blocks) &&
+	    entry->linked != entry->scheduled) {
+		/* Schedule a linked job? */
+		if (entry->linked) {
+			entry->linked->rt_param.scheduled_on = entry->cpu;
+			next = entry->linked;
+		}
+                if (entry->scheduled) {
+			/* not gonna be scheduled soon */
+			entry->scheduled->rt_param.scheduled_on = NO_CPU;
+			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+		}
+	} else
+		/* Only override Linux scheduler if we have real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	spin_unlock(&cedf->slock);
+
+	return next;
+}
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void cedf_finish_switch(struct task_struct *prev)
+{
+	cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
+
+	BUG_ON(!prev);
+	BUG_ON(!entry);
+
+	entry->scheduled = is_realtime(current) ? current : NULL;
+}
+
+/*	Prepare a task for running in RT mode
+ */
+static void cedf_task_new(struct task_struct *t, int on_rq, int running)
+{
+	unsigned long 		flags;
+	cedf_domain_t*	 	cedf = task_cedf(t);
+	cpu_entry_t* 		entry;
+
+	BUG_ON(!cedf);
+
+	spin_lock_irqsave(&cedf->slock, flags);
+	if (running) {
+		entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
+		BUG_ON(!entry);
+		BUG_ON(entry->scheduled);
+		entry->scheduled = t;
+		t->rt_param.scheduled_on = task_cpu(t);
+	} else
+		t->rt_param.scheduled_on = NO_CPU;
+	t->rt_param.linked_on = NO_CPU;
+
+	/* setup job params */
+	release_at(t, litmus_clock());
+
+	cedf_job_arrival(t);
+	spin_unlock_irqrestore(&cedf->slock, flags);
+}
+
+
+static void cedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long		flags;
+	cedf_domain_t*	 	cedf;
+	lt_t 			now;
+
+	BUG_ON(!task);
+
+	cedf = task_cedf(task);
+	BUG_ON(!cedf);
+
+	spin_lock_irqsave(&cedf->slock, flags);
+	/* We need to take suspensions because of semaphores into
+	 * account! If a job resumes after being suspended due to acquiring
+	 * a semaphore, it should never be treated as a new job release.
+	 */
+	if (get_rt_flags(task) == RT_F_EXIT_SEM) {
+		set_rt_flags(task, RT_F_RUNNING);
+	} else {
+		now = litmus_clock();
+		if (is_tardy(task, now)) {
+			/* new sporadic release */
+			release_at(task, now);
+			sched_trace_job_release(task);
+		}
+		else if (task->time_slice)
+			/* came back in time before deadline
+			 */
+			set_rt_flags(task, RT_F_RUNNING);
+	}
+	cedf_job_arrival(task);
+	spin_unlock_irqrestore(&cedf->slock, flags);
+}
+
+
+static void cedf_task_block(struct task_struct *t)
+{
+	unsigned long flags;
+
+	BUG_ON(!t);
+
+	/* unlink if necessary */
+	spin_lock_irqsave(&task_cedf(t)->slock, flags);
+	unlink(t);
+	spin_unlock_irqrestore(&task_cedf(t)->slock, flags);
+
+	BUG_ON(!is_realtime(t));
+}
+
+static void cedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+
+	BUG_ON(!t);
+
+	/* unlink if necessary */
+	spin_lock_irqsave(&task_cedf(t)->slock, flags);
+	unlink(t);
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		cedf_cpu_entries_array[tsk_rt(t)->scheduled_on]->
+			scheduled = NULL;
+		tsk_rt(t)->scheduled_on = NO_CPU;
+	}
+	spin_unlock_irqrestore(&task_cedf(t)->slock, flags);
+
+	BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+
+static long cedf_admit_task(struct task_struct* tsk)
+{
+	return (task_cpu(tsk) >= task_cedf(tsk)->first_cpu &&
+	        task_cpu(tsk) <= task_cedf(tsk)->last_cpu) ? 0 : -EINVAL;
+}
+
+
+/*	Plugin object	*/
+static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "C-EDF",
+	.finish_switch		= cedf_finish_switch,
+	.tick			= cedf_tick,
+	.task_new		= cedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= cedf_task_exit,
+	.schedule		= cedf_schedule,
+	.task_wake_up		= cedf_task_wake_up,
+	.task_block		= cedf_task_block,
+	.admit_task		= cedf_admit_task
+};
+
+static void cedf_domain_init(int first_cpu, int last_cpu)
+{
+	int cpu;
+
+	/* Create new domain for this cluster. */
+	cedf_domain_t *new_cedf_domain = kmalloc(sizeof(cedf_domain_t),
+						     GFP_KERNEL);
+
+	/* Initialize cluster domain. */
+	edf_domain_init(&new_cedf_domain->domain, NULL,
+			cedf_job_release);
+	new_cedf_domain->first_cpu	= first_cpu;
+	new_cedf_domain->last_cpu	= last_cpu;
+	INIT_LIST_HEAD(&new_cedf_domain->cedf_cpu_queue);
+
+	/* Assign all cpus in cluster to point to this domain. */
+	for (cpu = first_cpu; cpu <= last_cpu; cpu++) {
+		remote_cedf(cpu) = new_cedf_domain;
+		cedf_domains_array[cpu] = new_cedf_domain;
+	}
+}
+
+static int __init init_cedf(void)
+{
+	int cpu;
+	cpu_entry_t *entry;
+
+	/* initialize CPU state */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		entry = &per_cpu(cedf_cpu_entries, cpu);
+		cedf_cpu_entries_array[cpu] = entry;
+		atomic_set(&entry->will_schedule, 0);
+		entry->linked    = NULL;
+		entry->scheduled = NULL;
+		entry->cpu 	 = cpu;
+		INIT_LIST_HEAD(&entry->list);
+	}
+
+	/* initialize all cluster domains */
+	for (cpu = 0; cpu < NR_CPUS; cpu += cluster_size)
+		cedf_domain_init(cpu, cpu+cluster_size-1);
+
+	return register_sched_plugin(&cedf_plugin);
+}
+
+module_init(init_cedf);
+
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 0000000..e32a4e8
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,731 @@
+/*
+ * kernel/sched_gsn_edf.c
+ *
+ * Implementation of the GSN-EDF scheduling algorithm.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+
+#include <linux/module.h>
+
+/* Overview of GSN-EDF operations.
+ *
+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
+ * description only covers how the individual operations are implemented in
+ * LITMUS.
+ *
+ * link_task_to_cpu(T, cpu) 	- Low-level operation to update the linkage
+ *                                structure (NOT the actually scheduled
+ *                                task). If there is another linked task To
+ *                                already it will set To->linked_on = NO_CPU
+ *                                (thereby removing its association with this
+ *                                CPU). However, it will not requeue the
+ *                                previously linked task (if any). It will set
+ *                                T's state to RT_F_RUNNING and check whether
+ *                                it is already running somewhere else. If T
+ *                                is scheduled somewhere else it will link
+ *                                it to that CPU instead (and pull the linked
+ *                                task to cpu). T may be NULL.
+ *
+ * unlink(T)			- Unlink removes T from all scheduler data
+ *                                structures. If it is linked to some CPU it
+ *                                will link NULL to that CPU. If it is
+ *                                currently queued in the gsnedf queue it will
+ *                                be removed from the rt_domain. It is safe to
+ *                                call unlink(T) if T is not linked. T may not
+ *                                be NULL.
+ *
+ * requeue(T)			- Requeue will insert T into the appropriate
+ *                                queue. If the system is in real-time mode and
+ *                                the T is released already, it will go into the
+ *                                ready queue. If the system is not in
+ *                                real-time mode is T, then T will go into the
+ *                                release queue. If T's release time is in the
+ *                                future, it will go into the release
+ *                                queue. That means that T's release time/job
+ *                                no/etc. has to be updated before requeu(T) is
+ *                                called. It is not safe to call requeue(T)
+ *                                when T is already queued. T may not be NULL.
+ *
+ * gsnedf_job_arrival(T)	- This is the catch all function when T enters
+ *                                the system after either a suspension or at a
+ *                                job release. It will queue T (which means it
+ *                                is not safe to call gsnedf_job_arrival(T) if
+ *                                T is already queued) and then check whether a
+ *                                preemption is necessary. If a preemption is
+ *                                necessary it will update the linkage
+ *                                accordingly and cause scheduled to be called
+ *                                (either with an IPI or need_resched). It is
+ *                                safe to call gsnedf_job_arrival(T) if T's
+ *                                next job has not been actually released yet
+ *                                (releast time in the future). T will be put
+ *                                on the release queue in that case.
+ *
+ * job_completion(T)		- Take care of everything that needs to be done
+ *                                to prepare T for its next release and place
+ *                                it in the right queue with
+ *                                gsnedf_job_arrival().
+ *
+ *
+ * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
+ * the functions will automatically propagate pending task from the ready queue
+ * to a linked task. This is the job of the calling function ( by means of
+ * __take_ready).
+ */
+
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ */
+typedef struct  {
+	int 			cpu;
+	struct task_struct*	linked;		/* only RT tasks */
+	struct task_struct*	scheduled;	/* only RT tasks */
+	struct list_head	list;
+	atomic_t		will_schedule;	/* prevent unneeded IPIs */
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
+
+cpu_entry_t* gsnedf_cpus[NR_CPUS];
+
+#define set_will_schedule() \
+	(atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+	(atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+	(atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
+
+
+#define NO_CPU 0xffffffff
+
+/* the cpus queue themselves according to priority in here */
+static LIST_HEAD(gsnedf_cpu_queue);
+
+static rt_domain_t gsnedf;
+#define gsnedf_lock (gsnedf.ready_lock)
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ *                       order in the cpu queue. Caller must hold gsnedf lock.
+ *
+ *						This really should be a heap.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+	cpu_entry_t *other;
+	struct list_head *pos;
+
+	if (likely(in_list(&entry->list)))
+		list_del(&entry->list);
+	/* if we do not execute real-time jobs we just move
+	 * to the end of the queue
+	 */
+	if (entry->linked) {
+		list_for_each(pos, &gsnedf_cpu_queue) {
+			other = list_entry(pos, cpu_entry_t, list);
+			if (edf_higher_prio(entry->linked, other->linked)) {
+				__list_add(&entry->list, pos->prev, pos);
+				return;
+			}
+		}
+	}
+	/* if we get this far we have the lowest priority job */
+	list_add_tail(&entry->list, &gsnedf_cpu_queue);
+}
+
+/* link_task_to_cpu - Update the link of a CPU.
+ *                    Handles the case where the to-be-linked task is already
+ *                    scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+				      cpu_entry_t *entry)
+{
+	cpu_entry_t *sched;
+	struct task_struct* tmp;
+	int on_cpu;
+
+	BUG_ON(linked && !is_realtime(linked));
+
+	/* Currently linked task is set to be unlinked. */
+	if (entry->linked) {
+		entry->linked->rt_param.linked_on = NO_CPU;
+	}
+
+	/* Link new task to CPU. */
+	if (linked) {
+		set_rt_flags(linked, RT_F_RUNNING);
+		/* handle task is already scheduled somewhere! */
+		on_cpu = linked->rt_param.scheduled_on;
+		if (on_cpu != NO_CPU) {
+			sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
+			/* this should only happen if not linked already */
+			BUG_ON(sched->linked == linked);
+
+			/* If we are already scheduled on the CPU to which we
+			 * wanted to link, we don't need to do the swap --
+			 * we just link ourselves to the CPU and depend on
+			 * the caller to get things right.
+			 */
+			if (entry != sched) {
+				TRACE_TASK(linked,
+					   "already scheduled on %d, updating link.\n",
+					   sched->cpu);
+				tmp = sched->linked;
+				linked->rt_param.linked_on = sched->cpu;
+				sched->linked = linked;
+				update_cpu_position(sched);
+				linked = tmp;
+			}
+		}
+		if (linked) /* might be NULL due to swap */
+			linked->rt_param.linked_on = entry->cpu;
+	}
+	entry->linked = linked;
+	if (linked)
+		TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+	else
+		TRACE("NULL linked to %d.\n", entry->cpu);
+	update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ *          where it was linked before. Must hold gsnedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+    	cpu_entry_t *entry;
+
+	if (unlikely(!t)) {
+		TRACE_BUG_ON(!t);
+		return;
+	}
+
+	if (t->rt_param.linked_on != NO_CPU) {
+		/* unlink */
+		entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
+		t->rt_param.linked_on = NO_CPU;
+		link_task_to_cpu(NULL, entry);
+	} else if (is_queued(t)) {
+		/* This is an interesting situation: t is scheduled,
+		 * but was just recently unlinked.  It cannot be
+		 * linked anywhere else (because then it would have
+		 * been relinked to this CPU), thus it must be in some
+		 * queue. We must remove it from the list in this
+		 * case.
+		 */
+		remove(&gsnedf, t);
+	}
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static noinline void preempt(cpu_entry_t *entry)
+{
+	/* We cannot make the is_np() decision here if it is a remote CPU
+	 * because requesting exit_np() requires that we currently use the
+	 * address space of the task. Thus, in the remote case we just send
+	 * the IPI and let schedule() handle the problem.
+	 */
+
+	if (smp_processor_id() == entry->cpu) {
+		if (entry->scheduled && is_np(entry->scheduled))
+			request_exit_np(entry->scheduled);
+		else
+			set_tsk_need_resched(current);
+	} else
+		/* in case that it is a remote CPU we have to defer the
+		 * the decision to the remote CPU
+		 * FIXME: We could save a few IPI's here if we leave the flag
+		 * set when we are waiting for a np_exit().
+		 */
+		if (!test_will_schedule(entry->cpu))
+			smp_send_reschedule(entry->cpu);
+}
+
+/* requeue - Put an unlinked task into gsn-edf domain.
+ *           Caller must hold gsnedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+	BUG_ON(!task);
+	/* sanity check before insertion */
+	BUG_ON(is_queued(task));
+
+	if (get_rt_flags(task) == RT_F_SLEEP) {
+		/* this task has expired
+		 * _schedule has already taken care of updating
+		 * the release and
+		 * deadline. We just must check if it has been released.
+		 */
+		if (is_released(task, litmus_clock()))
+			__add_ready(&gsnedf, task);
+		else {
+			/* it has got to wait */
+			add_release(&gsnedf, task);
+		}
+
+	} else
+		/* this is a forced preemption
+		 * thus the task stays in the ready_queue
+		 * we only must make it available to others
+		 */
+		__add_ready(&gsnedf, task);
+}
+
+/* gsnedf_job_arrival: task is either resumed or released */
+static noinline void gsnedf_job_arrival(struct task_struct* task)
+{
+	cpu_entry_t* last;
+
+	BUG_ON(list_empty(&gsnedf_cpu_queue));
+	BUG_ON(!task);
+
+	/* first queue arriving job */
+	requeue(task);
+
+	/* then check for any necessary preemptions */
+	last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
+	if (edf_preemption_needed(&gsnedf, last->linked)) {
+		/* preemption necessary */
+		task = __take_ready(&gsnedf);
+		TRACE("job_arrival: attempting to link task %d to %d\n",
+		      task->pid, last->cpu);
+		if (last->linked)
+			requeue(last->linked);
+
+		link_task_to_cpu(task, last);
+		preempt(last);
+	}
+}
+
+/* check for current job releases */
+static void gsnedf_job_release(struct task_struct* t, rt_domain_t* _)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&gsnedf_lock, flags);
+
+	sched_trace_job_release(queued);
+	gsnedf_job_arrival(t);
+
+	spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+/* caller holds gsnedf_lock */
+static noinline void job_completion(struct task_struct *t)
+{
+	BUG_ON(!t);
+
+	sched_trace_job_completion(t);
+
+	TRACE_TASK(t, "job_completion().\n");
+
+	/* set flags */
+	set_rt_flags(t, RT_F_SLEEP);
+	/* prepare for next period */
+	prepare_for_next_period(t);
+	/* unlink */
+	unlink(t);
+	/* requeue
+	 * But don't requeue a blocking task. */
+	if (is_running(t))
+		gsnedf_job_arrival(t);
+}
+
+/* gsnedf_tick - this function is called for every local timer
+ *                         interrupt.
+ *
+ *                   checks whether the current task has expired and checks
+ *                   whether we need to preempt it if it has not expired
+ */
+static void gsnedf_tick(struct task_struct* t)
+{
+	if (is_realtime(t) && budget_exhausted(t)) {
+		if (!is_np(t)) {
+			/* np tasks will be preempted when they become
+			 * preemptable again
+			 */
+			set_tsk_need_resched(t);
+			set_will_schedule();
+			TRACE("gsnedf_scheduler_tick: "
+			      "%d is preemptable "
+			      " => FORCE_RESCHED\n", t->pid);
+		} else {
+			TRACE("gsnedf_scheduler_tick: "
+			      "%d is non-preemptable, "
+			      "preemption delayed.\n", t->pid);
+			request_exit_np(t);
+		}
+	}
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *	- scheduled->timeslice == 0	// the job completed (forcefully)
+ *	- get_rt_flag() == RT_F_SLEEP	// the job completed (by syscall)
+ * 	- linked != scheduled		// we need to reschedule (for any reason)
+ * 	- is_np(scheduled)		// rescheduling must be delayed,
+ *					   sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* gsnedf_schedule(struct task_struct * prev)
+{
+	cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+	int out_of_time, sleep, preempt, np, exists, blocks;
+	struct task_struct* next = NULL;
+
+	/* Will be released in finish_switch. */
+	spin_lock(&gsnedf_lock);
+	clear_will_schedule();
+
+	/* sanity checking */
+	BUG_ON(entry->scheduled && entry->scheduled != prev);
+	BUG_ON(entry->scheduled && !is_realtime(prev));
+	BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+	/* (0) Determine state */
+	exists      = entry->scheduled != NULL;
+	blocks      = exists && !is_running(entry->scheduled);
+	out_of_time = exists && budget_exhausted(entry->scheduled);
+	np 	    = exists && is_np(entry->scheduled);
+	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
+	preempt     = entry->scheduled != entry->linked;
+
+	TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
+
+	if (exists)
+		TRACE_TASK(prev,
+			   "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+			   "state:%d sig:%d\n",
+			   blocks, out_of_time, np, sleep, preempt,
+			   prev->state, signal_pending(prev));
+	if (entry->linked && preempt)
+		TRACE_TASK(prev, "will be preempted by %s/%d\n",
+			   entry->linked->comm, entry->linked->pid);
+
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		unlink(entry->scheduled);
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * We need to make sure to update the link structure anyway in case
+	 * that we are still linked. Multiple calls to request_exit_np() don't
+	 * hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep)) {
+		unlink(entry->scheduled);
+		request_exit_np(entry->scheduled);
+	}
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this. Don't do a job completion if we block (can't have timers running
+	 * for blocked jobs). Preemption go first for the same reason.
+	 */
+	if (!np && (out_of_time || sleep) && !blocks && !preempt)
+		job_completion(entry->scheduled);
+
+	/* Link pending task if we became unlinked.
+	 */
+	if (!entry->linked)
+		link_task_to_cpu(__take_ready(&gsnedf), entry);
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * If linked is different from scheduled, then select linked as next.
+	 */
+	if ((!np || blocks) &&
+	    entry->linked != entry->scheduled) {
+		/* Schedule a linked job? */
+		if (entry->linked) {
+			entry->linked->rt_param.scheduled_on = entry->cpu;
+			next = entry->linked;
+		}
+		if (entry->scheduled) {
+			/* not gonna be scheduled soon */
+			entry->scheduled->rt_param.scheduled_on = NO_CPU;
+			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+		}
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	spin_unlock(&gsnedf_lock);
+
+	TRACE("gsnedf_lock released, next=0x%p\n", next);
+
+
+	if (next)
+		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+	else if (exists && !next)
+		TRACE("becomes idle at %llu.\n", litmus_clock());
+
+
+	return next;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void gsnedf_finish_switch(struct task_struct *prev)
+{
+	cpu_entry_t* 	entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+	entry->scheduled = is_realtime(current) ? current : NULL;
+	TRACE_TASK(prev, "switched away from\n");
+}
+
+
+/*	Prepare a task for running in RT mode
+ */
+static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+	unsigned long 		flags;
+	cpu_entry_t* 		entry;
+
+	TRACE("gsn edf: task new %d\n", t->pid);
+
+	spin_lock_irqsave(&gsnedf_lock, flags);
+	if (running) {
+		entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
+		BUG_ON(entry->scheduled);
+		entry->scheduled = t;
+		t->rt_param.scheduled_on = task_cpu(t);
+	} else
+		t->rt_param.scheduled_on = NO_CPU;
+	t->rt_param.linked_on          = NO_CPU;
+
+	/* setup job params */
+	release_at(t, litmus_clock());
+
+	gsnedf_job_arrival(t);
+	spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long flags;
+	lt_t now;
+
+	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+
+	spin_lock_irqsave(&gsnedf_lock, flags);
+	/* We need to take suspensions because of semaphores into
+	 * account! If a job resumes after being suspended due to acquiring
+	 * a semaphore, it should never be treated as a new job release.
+	 */
+	if (get_rt_flags(task) == RT_F_EXIT_SEM) {
+		set_rt_flags(task, RT_F_RUNNING);
+	} else {
+		now = litmus_clock();
+		if (is_tardy(task, now)) {
+			/* new sporadic release */
+			release_at(task, now);
+			sched_trace_job_release(task);
+		}
+		else if (task->time_slice)
+			/* came back in time before deadline
+			 */
+			set_rt_flags(task, RT_F_RUNNING);
+	}
+	gsnedf_job_arrival(task);
+	spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_block(struct task_struct *t)
+{
+	unsigned long flags;
+
+	TRACE_TASK(t, "block at %llu\n", litmus_clock());
+
+	/* unlink if necessary */
+	spin_lock_irqsave(&gsnedf_lock, flags);
+	unlink(t);
+	spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+}
+
+
+static void gsnedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+
+	/* unlink if necessary */
+	spin_lock_irqsave(&gsnedf_lock, flags);
+	unlink(t);
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
+		tsk_rt(t)->scheduled_on = NO_CPU;
+	}
+	spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+
+#ifdef CONFIG_FMLP
+static long gsnedf_pi_block(struct pi_semaphore *sem,
+			    struct task_struct *new_waiter)
+{
+	/* This callback has to handle the situation where a new waiter is
+	 * added to the wait queue of the semaphore.
+	 *
+	 * We must check if has a higher priority than the currently
+	 * highest-priority task, and then potentially reschedule.
+	 */
+
+	BUG_ON(!new_waiter);
+
+	if (edf_higher_prio(new_waiter, sem->hp.task)) {
+		TRACE_TASK(new_waiter, " boosts priority\n");
+		/* called with IRQs disabled */
+		spin_lock(&gsnedf_lock);
+		/* store new highest-priority task */
+		sem->hp.task = new_waiter;
+		if (sem->holder) {
+			/* let holder inherit */
+			sem->holder->rt_param.inh_task = new_waiter;
+			unlink(sem->holder);
+			gsnedf_job_arrival(sem->holder);
+		}
+		spin_unlock(&gsnedf_lock);
+	}
+
+	return 0;
+}
+
+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
+				    struct task_struct *new_owner)
+{
+	/* We don't need to acquire the gsnedf_lock since at the time of this
+	 * call new_owner isn't actually scheduled yet (it's still sleeping)
+	 * and since the calling function already holds sem->wait.lock, which
+	 * prevents concurrent sem->hp.task changes.
+	 */
+
+	if (sem->hp.task && sem->hp.task != new_owner) {
+		new_owner->rt_param.inh_task = sem->hp.task;
+		TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
+			   sem->hp.task->comm, sem->hp.task->pid);
+	} else
+		TRACE_TASK(new_owner,
+			   "cannot inherit priority, "
+			   "no higher priority job waits.\n");
+	return 0;
+}
+
+/* This function is called on a semaphore release, and assumes that
+ * the current task is also the semaphore holder.
+ */
+static long gsnedf_return_priority(struct pi_semaphore *sem)
+{
+	struct task_struct* t = current;
+	int ret = 0;
+
+        /* Find new highest-priority semaphore task
+	 * if holder task is the current hp.task.
+	 *
+	 * Calling function holds sem->wait.lock.
+	 */
+	if (t == sem->hp.task)
+		edf_set_hp_task(sem);
+
+	TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
+
+	if (t->rt_param.inh_task) {
+		/* interrupts already disabled by PI code */
+		spin_lock(&gsnedf_lock);
+
+		/* Reset inh_task to NULL. */
+		t->rt_param.inh_task = NULL;
+
+		/* Check if rescheduling is necessary */
+		unlink(t);
+		gsnedf_job_arrival(t);
+		spin_unlock(&gsnedf_lock);
+	}
+
+	return ret;
+}
+
+#endif
+
+static long gsnedf_admit_task(struct task_struct* tsk)
+{
+	return 0;
+}
+
+
+/*	Plugin object	*/
+static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "GSN-EDF",
+	.finish_switch		= gsnedf_finish_switch,
+	.tick			= gsnedf_tick,
+	.task_new		= gsnedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= gsnedf_task_exit,
+	.schedule		= gsnedf_schedule,
+	.task_wake_up		= gsnedf_task_wake_up,
+	.task_block		= gsnedf_task_block,
+#ifdef CONFIG_FMLP
+	.fmlp_active		= 1,
+	.pi_block		= gsnedf_pi_block,
+	.inherit_priority	= gsnedf_inherit_priority,
+	.return_priority	= gsnedf_return_priority,
+#endif
+	.admit_task		= gsnedf_admit_task
+};
+
+
+static int __init init_gsn_edf(void)
+{
+	int cpu;
+	cpu_entry_t *entry;
+
+	/* initialize CPU state */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		entry = &per_cpu(gsnedf_cpu_entries, cpu);
+		gsnedf_cpus[cpu] = entry;
+		atomic_set(&entry->will_schedule, 0);
+		entry->linked    = NULL;
+		entry->scheduled = NULL;
+		entry->cpu 	 = cpu;
+		INIT_LIST_HEAD(&entry->list);
+	}
+
+	edf_domain_init(&gsnedf, NULL, gsnedf_job_release);
+	return register_sched_plugin(&gsn_edf_plugin);
+}
+
+
+module_init(init_gsn_edf);
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 0000000..1e541e4
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,228 @@
+/* This file is included from kernel/sched.c */
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
+static void update_time_litmus(struct rq *rq, struct task_struct *p)
+{
+	lt_t now = litmus_clock();
+	p->rt_param.job_params.exec_time +=
+		now - p->rt_param.job_params.exec_start;
+	p->rt_param.job_params.exec_start = now;
+}
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
+
+static void litmus_tick(struct rq *rq, struct task_struct *p)
+{
+	if (is_realtime(p))
+		update_time_litmus(rq, p);
+	litmus->tick(p);
+}
+
+#define NO_CPU -1
+
+static void litmus_schedule(struct rq *rq, struct task_struct *prev)
+{
+	struct rq* other_rq;
+	long prev_state;
+	lt_t _maybe_deadlock = 0;
+	/* WARNING: rq is _not_ locked! */
+	if (is_realtime(prev))
+		update_time_litmus(rq, prev);
+
+	/* let the plugin schedule */
+	rq->litmus_next = litmus->schedule(prev);
+
+	/* check if a global plugin pulled a task from a different RQ */
+	if (rq->litmus_next && task_rq(rq->litmus_next) != rq) {
+		/* we need to migrate the task */
+		other_rq = task_rq(rq->litmus_next);
+		TRACE_TASK(rq->litmus_next, "migrate from %d\n", other_rq->cpu);
+
+		/* while we drop the lock, the prev task could change its 
+		 * state
+		 */
+		prev_state = prev->state;
+		mb();
+		spin_unlock(&rq->lock);
+
+		/* Don't race with a concurrent switch.
+		 * This could deadlock in the case of cross or circular migrations.
+		 * It's the job of the plugin to make sure that doesn't happen.
+		 */
+		TRACE_TASK(rq->litmus_next, "stack_in_use=%d\n", 
+			   rq->litmus_next->rt_param.stack_in_use);
+		if (rq->litmus_next->rt_param.stack_in_use != NO_CPU) {
+			TRACE_TASK(rq->litmus_next, "waiting to deschedule\n");
+			_maybe_deadlock = litmus_clock();
+		}
+		while (rq->litmus_next->rt_param.stack_in_use != NO_CPU) {
+			cpu_relax();
+			mb();
+			if (rq->litmus_next->rt_param.stack_in_use == NO_CPU)
+				TRACE_TASK(rq->litmus_next,
+					   "descheduled. Proceeding.\n");
+			if (lt_before(_maybe_deadlock + 10000000, litmus_clock())) {
+				/* We've been spinning for 10ms.
+				 * Something can't be right!
+				 * Let's abandon the task and bail out; at least 
+				 * we will have debug info instead of a hard
+				 * deadlock.
+				 */
+				TRACE_TASK(rq->litmus_next, 
+					   "stack too long in use. Deadlock?\n");
+				rq->litmus_next = NULL;
+				
+				/* bail out */
+				spin_lock(&rq->lock);
+				return;
+			}
+		}
+#ifdef  __ARCH_WANT_UNLOCKED_CTXSW
+		if (rq->litmus_next->oncpu)
+			TRACE_TASK(rq->litmus_next, "waiting for !oncpu");
+		while (rq->litmus_next->oncpu) {
+			cpu_relax();
+			mb();
+		}
+#endif
+		double_rq_lock(rq, other_rq);
+		mb();
+		if (prev->state != prev_state && is_realtime(prev)) {
+			TRACE_TASK(prev, 
+				   "state changed while we dropped"
+				   " the lock: now=%d, old=%d\n",
+				   prev->state, prev_state);
+			if (prev_state && !prev->state) {
+				/* prev task became unblocked
+				 * we need to simulate normal sequence of events 
+				 * to scheduler plugins.
+				 */
+				litmus->task_block(prev);
+				litmus->task_wake_up(prev);
+			}
+		}
+
+		set_task_cpu(rq->litmus_next, smp_processor_id());
+
+		/* DEBUG: now that we have the lock we need to make sure a
+		 *  couple of things still hold:
+		 *  - it is still a real-time task
+		 *  - it is still runnable (could have been stopped)
+		 */
+		if (!is_realtime(rq->litmus_next) ||
+		    !is_running(rq->litmus_next)) {
+			/* BAD BAD BAD */
+			TRACE_TASK(rq->litmus_next, 
+				   "migration invariant FAILED: rt=%d running=%d\n",
+				   is_realtime(rq->litmus_next),			   
+				   is_running(rq->litmus_next));
+			/* drop the task */
+			rq->litmus_next = NULL;
+		}
+		/* release the other CPU's runqueue, but keep ours */
+		spin_unlock(&other_rq->lock);
+	}
+	if (rq->litmus_next)
+		rq->litmus_next->rt_param.stack_in_use = rq->cpu;
+}
+
+static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	if (wakeup)
+		litmus->task_wake_up(p);
+	else
+		TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
+}
+
+static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep)
+{
+	if (sleep)
+		litmus->task_block(p);
+	else
+		TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
+}
+
+static void yield_task_litmus(struct rq *rq)
+{
+	BUG_ON(rq->curr != current);
+	litmus->complete_job();
+}
+
+/* Plugins are responsible for this.
+ */
+static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+
+/* has already been taken care of */
+static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+
+static struct task_struct *pick_next_task_litmus(struct rq *rq)
+{
+	struct task_struct* picked = rq->litmus_next;
+	rq->litmus_next = NULL;
+	if (picked)
+		picked->rt_param.job_params.exec_start = litmus_clock();
+	return picked;
+}
+
+static void task_tick_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+
+/* This is called when a task became a real-time task, either due
+ * to a SCHED_* class transition or due to PI mutex inheritance.\
+ * We don't handle Linux PI mutex inheritance yet. Use LITMUS provided
+ * synchronization primitives instead.
+ */
+static void set_curr_task_litmus(struct rq *rq)
+{
+	rq->curr->rt_param.job_params.exec_start = litmus_clock();
+}
+
+
+#ifdef CONFIG_SMP
+
+/* we don't repartition at runtime */
+
+static unsigned long
+load_balance_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, int *this_best_prio)
+{
+	return 0;
+}
+
+static int
+move_one_task_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		 struct sched_domain *sd, enum cpu_idle_type idle)
+{
+	return 0;
+}
+#endif
+
+const struct sched_class litmus_sched_class = {
+	.next			= &rt_sched_class,
+	.enqueue_task		= enqueue_task_litmus,
+	.dequeue_task		= dequeue_task_litmus,
+	.yield_task		= yield_task_litmus,
+
+	.check_preempt_curr	= check_preempt_curr_litmus,
+
+	.pick_next_task		= pick_next_task_litmus,
+	.put_prev_task		= put_prev_task_litmus,
+
+#ifdef CONFIG_SMP
+	.load_balance		= load_balance_litmus,
+	.move_one_task		= move_one_task_litmus,
+#endif
+
+	.set_curr_task          = set_curr_task_litmus,
+	.task_tick		= task_tick_litmus,
+};
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100755
index 0000000..6f95688
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,785 @@
+/*
+ * kernel/sched_pfair.c
+ *
+ * Implementation of the (global) Pfair scheduling algorithm.
+ *
+ */
+
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/heap.h>
+
+/* Tick period is used to convert ns-specified execution
+ * costs and periods into tick-based equivalents.
+ */
+extern ktime_t tick_period;
+
+/* make the unit explicit */
+typedef unsigned long quanta_t;
+
+struct subtask {
+	/* measured in quanta relative to job release */
+	quanta_t release;
+        quanta_t deadline;
+	quanta_t overlap; /* called "b bit" by PD^2 */
+	quanta_t group_deadline;
+};
+
+struct pfair_param   {
+	quanta_t	quanta;       /* number of subtasks */
+	quanta_t	cur;          /* index of current subtask */
+
+	quanta_t	release;      /* in quanta */
+	quanta_t	period;       /* in quanta */
+
+	quanta_t	last_quantum; /* when scheduled last */
+	int		last_cpu;     /* where scheduled last */
+
+	unsigned int	present;    /* Can the task be scheduled? */
+
+	struct subtask subtasks[0];   /* allocate together with pfair_param */
+};
+
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+
+struct pfair_state {
+	int cpu;
+	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
+				        * the time */
+	volatile quanta_t local_tick;  /* What tick is the local CPU currently
+				        * executing? Updated only by the local
+				        * CPU. In QEMU, this may lag behind the
+					* current tick. In a real system, with
+					* proper timers and aligned quanta,
+					* that should only be the
+					* case for a very short time after the
+					* time advanced. With staggered quanta,
+					* it will lag for the duration of the
+					* offset.
+					*/
+
+	struct task_struct* linked;    /* the task that should be executing */
+	struct task_struct* local;     /* the local copy of linked          */
+	struct task_struct* scheduled; /* what is actually scheduled        */
+
+	unsigned long missed_quanta;
+};
+
+/* Currently, we limit the maximum period of any task to 1000 quanta.
+ * The reason is that it makes the implementation easier since we do not
+ * need to reallocate the release wheel on task arrivals.
+ * In the future
+ */
+#define PFAIR_MAX_PERIOD 1000
+
+/* This is the release queue wheel. It is indexed by pfair_time %
+ * PFAIR_MAX_PERIOD.  Each heap is ordered by PFAIR priority, so that it can be
+ * merged with the ready queue.
+ */
+static struct heap release_queue[PFAIR_MAX_PERIOD];
+
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state*  pstate[NR_CPUS]; /* short cut */
+
+#define NO_CPU 0xffffffff
+
+static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
+static quanta_t merge_time = 0; /* Updated after the release queue has been
+				 * merged. Used by drop_all_references().
+				 */
+
+static rt_domain_t pfair;
+
+/* The pfair_lock is used to serialize all scheduling events.
+ */
+#define pfair_lock pfair.ready_lock
+
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ */
+
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, # args)
+#define PTRACE(f, args...) TRACE(f, # args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+
+/* gcc will inline all of these accessor functions... */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
+}
+
+static quanta_t cur_deadline(struct task_struct* t)
+{
+	return cur_subtask(t)->deadline +  tsk_pfair(t)->release;
+}
+
+static quanta_t cur_release(struct task_struct* t)
+{
+#ifdef EARLY_RELEASE
+	/* only the release of the first subtask counts when we early
+	 * release */
+	return tsk_pfair(t)->release;
+#else
+	return cur_subtask(t)->release +  tsk_pfair(t)->release;
+#endif
+}
+
+static quanta_t cur_sub_release(struct task_struct* t)
+{
+	return cur_subtask(t)->release +  tsk_pfair(t)->release;
+}
+
+static quanta_t cur_overlap(struct task_struct* t)
+{
+	return cur_subtask(t)->overlap;
+}
+
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+	quanta_t gdl = cur_subtask(t)->group_deadline;
+	if (gdl)
+		return gdl + tsk_pfair(t)->release;
+	else
+		return gdl;
+}
+
+enum round {
+	FLOOR,
+	CEIL
+};
+
+static quanta_t time2quanta(lt_t time, enum round round)
+{
+	s64  quantum_length = ktime_to_ns(tick_period);
+
+	if (do_div(time, quantum_length) && round == CEIL)
+		time++;
+	return (quanta_t) time;
+}
+
+static int pfair_higher_prio(struct task_struct* first,
+			     struct task_struct* second)
+{
+	return  /* first task must exist */
+		first && (
+		/* Does the second task exist and is it a real-time task?  If
+		 * not, the first task (which is a RT task) has higher
+		 * priority.
+		 */
+		!second || !is_realtime(second)  ||
+
+		/* Is the (subtask) deadline of the first task earlier?
+		 * Then it has higher priority.
+		 */
+		time_before(cur_deadline(first), cur_deadline(second)) ||
+
+		/* Do we have a deadline tie?
+		 * Then break by B-bit.
+		 */
+		(cur_deadline(first) == cur_deadline(second) &&
+		 cur_overlap(first) > cur_overlap(second)) ||
+
+		/* Do we have a B-bit tie?
+		 * Then break by group deadline.
+		 */
+		(cur_overlap(first) == cur_overlap(second) &&
+		 time_after(cur_group_deadline(first),
+			    cur_group_deadline(second))) ||
+
+		/* Do we have a group deadline tie?
+		 * Then break by PID, which are unique.
+		 */
+		(cur_group_deadline(first) ==
+		 cur_group_deadline(second) &&
+		 first->pid < second->pid));
+}
+
+int pfair_ready_order(struct heap_node* a, struct heap_node* b)
+{
+	return pfair_higher_prio(heap2task(a), heap2task(b));
+}
+
+/* return the proper release queue for time t */
+static struct heap* relq(quanta_t t)
+{
+	struct heap* rq = &release_queue[t % PFAIR_MAX_PERIOD];
+	return rq;
+}
+
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+	tsk_pfair(t)->release    = at;
+	tsk_pfair(t)->cur        = 0;
+}
+
+static void __pfair_add_release(struct task_struct* t, struct heap* queue)
+{
+	heap_insert(pfair_ready_order, queue,
+		    tsk_rt(t)->heap_node);
+}
+
+static void pfair_add_release(struct task_struct* t)
+{
+	BUG_ON(heap_node_in_heap(tsk_rt(t)->heap_node));
+	__pfair_add_release(t, relq(cur_release(t)));
+}
+
+/* pull released tasks from the release queue */
+static void poll_releases(quanta_t time)
+{
+	heap_union(pfair_ready_order, &pfair.ready_queue, relq(time));
+	merge_time = time;
+}
+
+static void check_preempt(struct task_struct* t)
+{
+	int cpu = NO_CPU;
+	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
+	    tsk_pfair(t)->present) {
+		/* the task can be scheduled and
+		 * is not scheduled where it ought to be scheduled
+		 */
+		cpu = tsk_rt(t)->linked_on != NO_CPU ?
+			tsk_rt(t)->linked_on         :
+			tsk_rt(t)->scheduled_on;
+		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+			   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+		/* preempt */
+		if (cpu == smp_processor_id())
+			set_tsk_need_resched(current);
+		else {
+			smp_send_reschedule(cpu);
+		}
+	}
+}
+
+/* returns 1 if the task needs to go the release queue */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+	struct pfair_param* p = tsk_pfair(t);
+
+	p->cur = (p->cur + 1) % p->quanta;
+	TRACE_TASK(t, "on %d advanced to subtask %lu\n",
+		   cpu,
+		   p->cur);
+	if (!p->cur) {
+		/* we start a new job */
+		get_rt_flags(t) = RT_F_RUNNING;
+		prepare_for_next_period(t);
+		p->release += p->period;
+	}
+	return time_after(cur_release(t), time);
+}
+
+static void advance_subtasks(quanta_t time)
+{
+	int cpu, missed;
+	struct task_struct* l;
+	struct pfair_param* p;
+
+	for_each_online_cpu(cpu) {
+		l = pstate[cpu]->linked;
+		missed = pstate[cpu]->linked != pstate[cpu]->local;
+		if (l) {
+			p = tsk_pfair(l);
+			p->last_quantum = time;
+			p->last_cpu     =  cpu;
+			if (advance_subtask(time, l, cpu)) {
+				pstate[cpu]->linked = NULL;
+				pfair_add_release(l);
+			}
+		}
+	}
+}
+
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+	int cpu;
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		/* always observe scheduled_on linkage */
+		default_cpu = tsk_rt(t)->scheduled_on;
+		PTRACE_TASK(t, "forced on %d (scheduled on)\n", default_cpu);
+	} else if (tsk_pfair(t)->last_quantum == time - 1) {
+		/* back2back quanta */
+		/* Only observe last_quantum if no scheduled_on is in the way.
+		 * This should only kick in if a CPU missed quanta, and that
+		 * *should* only happen in QEMU.
+		 */
+		cpu = tsk_pfair(t)->last_cpu;
+		if (!pstate[cpu]->linked ||
+		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
+			default_cpu = cpu;
+			PTRACE_TASK(t, "forced on %d (linked on)\n",
+				    default_cpu);
+		} else {
+			PTRACE_TASK(t, "DID NOT force on %d (linked on)\n",
+				    default_cpu);
+		}
+	}
+	return default_cpu;
+}
+
+/* returns one if linking was redirected */
+static int pfair_link(quanta_t time, int cpu,
+		      struct task_struct* t)
+{
+	int target = target_cpu(time, t, cpu);
+	struct task_struct* prev  = pstate[cpu]->linked;
+	struct task_struct* other;
+
+	PTRACE_TASK(t, "linked to %d for quantum %lu\n", target, time);
+	if (target != cpu) {
+		other = pstate[target]->linked;
+		pstate[target]->linked = t;
+		tsk_rt(t)->linked_on   = target;
+		if (!other)
+			/* linked ok, but reschedule this CPU */
+			return 1;
+		if (target < cpu) {
+			/* link other to cpu instead */
+			tsk_rt(other)->linked_on = cpu;
+			pstate[cpu]->linked      = other;
+			if (prev) {
+				/* prev got pushed back into the ready queue */
+				tsk_rt(prev)->linked_on = NO_CPU;
+				__add_ready(&pfair, prev);
+			}
+			/* we are done with this cpu */
+			return 0;
+		} else {
+			/* re-add other, it's original CPU was not considered yet */
+			tsk_rt(other)->linked_on = NO_CPU;
+			__add_ready(&pfair, other);
+			/* reschedule this CPU */
+			return 1;
+		}
+	} else {
+		pstate[cpu]->linked  = t;
+		tsk_rt(t)->linked_on = cpu;
+		if (prev) {
+			/* prev got pushed back into the ready queue */
+			tsk_rt(prev)->linked_on = NO_CPU;
+			__add_ready(&pfair, prev);
+		}
+		/* we are done with this CPU */
+		return 0;
+	}
+}
+
+static void schedule_subtasks(quanta_t time)
+{
+	int cpu, retry;
+
+	for_each_online_cpu(cpu) {
+		retry = 1;
+		while (retry) {
+			if (pfair_higher_prio(__peek_ready(&pfair),
+					      pstate[cpu]->linked))
+				retry = pfair_link(time, cpu,
+						   __take_ready(&pfair));
+			else
+				retry = 0;
+		}
+	}
+}
+
+static void schedule_next_quantum(quanta_t time)
+{
+	int cpu;
+
+	PTRACE("<<< Q %lu at %llu\n",
+	       time, litmus_clock());
+
+	/* called with interrupts disabled */
+	spin_lock(&pfair_lock);
+
+	advance_subtasks(time);
+	poll_releases(time);
+	schedule_subtasks(time);
+
+	spin_unlock(&pfair_lock);
+
+	/* We are done. Advance time. */
+	mb();
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		pstate[cpu]->cur_tick = pfair_time;
+	PTRACE(">>> Q %lu at %llu\n",
+	       time, litmus_clock());
+}
+
+/* pfair_tick - this function is called for every local timer
+ *                         interrupt.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+	struct pfair_state* state = &__get_cpu_var(pfair_state);
+	quanta_t time, loc, cur;
+
+	/* Attempt to advance time. First CPU to get here 00
+	 * will prepare the next quantum.
+	 */
+	time = cmpxchg(&pfair_time,
+		       state->local_tick,     /* expected */
+		       state->local_tick + 1  /* next     */
+		);
+	if (time == state->local_tick)
+		/* exchange succeeded */
+		schedule_next_quantum(time + 1);
+
+	/* Spin locally until time advances. */
+	while (1) {
+		mb();
+		cur = state->cur_tick;
+		loc = state->local_tick;
+		if (time_before(loc, cur)) {
+			if (loc + 1 != cur) {
+				TRACE("MISSED quantum! loc:%lu -> cur:%lu\n",
+				      loc, cur);
+				state->missed_quanta++;
+			}
+			break;
+		}
+		cpu_relax();
+	}
+
+	/* copy state info */
+	state->local_tick = state->cur_tick;
+	state->local      = state->linked;
+	if (state->local && tsk_pfair(state->local)->present &&
+	    state->local != current)
+		set_tsk_need_resched(current);
+}
+
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+	int where = tsk_rt(t)->scheduled_on;
+	if (where != NO_CPU && where != cpu) {
+		TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+			   "scheduled already on %d.\n", cpu, where);
+		return 0;
+	} else
+		return tsk_pfair(t)->present && get_rt_flags(t) == RT_F_RUNNING;
+}
+
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+	struct pfair_state* state = &__get_cpu_var(pfair_state);
+	int blocks;
+	struct task_struct* next = NULL;
+
+	spin_lock(&pfair_lock);
+
+	blocks  = is_realtime(prev) && !is_running(prev);
+
+	if (blocks)
+		tsk_pfair(prev)->present = 0;
+
+	if (state->local && safe_to_schedule(state->local, state->cpu))
+		next = state->local;
+
+	if (prev != next) {
+		tsk_rt(prev)->scheduled_on = NO_CPU;
+		if (next)
+			tsk_rt(next)->scheduled_on = state->cpu;
+	}
+
+	spin_unlock(&pfair_lock);
+
+	if (next)
+		TRACE_TASK(next, "scheduled rel=%lu at %lu\n",
+			   tsk_pfair(next)->release, pfair_time);
+	else if (is_realtime(prev))
+		TRACE("Becomes idle at %lu\n", pfair_time);
+
+	return next;
+}
+
+static void pfair_task_new(struct task_struct * t, int on_rq, int running)
+{
+	unsigned long 		flags;
+
+	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+
+	spin_lock_irqsave(&pfair_lock, flags);
+	if (running)
+		t->rt_param.scheduled_on = task_cpu(t);
+	else
+		t->rt_param.scheduled_on = NO_CPU;
+
+	prepare_release(t, pfair_time + 1);
+	tsk_pfair(t)->present = running;
+	pfair_add_release(t);
+	check_preempt(t);
+
+	spin_unlock_irqrestore(&pfair_lock, flags);
+}
+
+static void pfair_task_wake_up(struct task_struct *t)
+{
+	unsigned long flags;
+
+	TRACE_TASK(t, "wakes at %lld, release=%lu, pfair_time:%lu\n",
+		   cur_release(t), pfair_time);
+
+	spin_lock_irqsave(&pfair_lock, flags);
+
+	tsk_pfair(t)->present = 1;
+
+	/* It is a little unclear how to deal with Pfair
+	 * tasks that block for a while and then wake.
+	 * For now, we assume that such suspensions are included
+	 * in the stated execution time of the task, and thus
+	 * count as execution time for our purposes. Thus, if the
+	 * task is currently linked somewhere, it may resume, otherwise
+	 * it has to wait for its next quantum allocation.
+	 */
+
+	check_preempt(t);
+
+	spin_unlock_irqrestore(&pfair_lock, flags);
+}
+
+static void pfair_task_block(struct task_struct *t)
+{
+	BUG_ON(!is_realtime(t));
+	TRACE_TASK(t, "blocks at %lld, state:%d\n",
+		   (lt_t) jiffies, t->state);
+}
+
+/* caller must hold pfair_lock */
+static void drop_all_references(struct task_struct *t)
+{
+	int cpu;
+	struct pfair_state* s;
+	struct heap* q;
+	if (heap_node_in_heap(tsk_rt(t)->heap_node)) {
+		/* figure out what queue the node is in */
+		if (time_before_eq(cur_release(t), merge_time))
+			q = &pfair.ready_queue;
+		else
+			q = relq(cur_release(t));
+		heap_delete(pfair_ready_order, q,
+			    tsk_rt(t)->heap_node);
+	}
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		s = &per_cpu(pfair_state, cpu);
+		if (s->linked == t)
+			s->linked = NULL;
+		if (s->local  == t)
+			s->local  = NULL;
+		if (s->scheduled  == t)
+			s->scheduled = NULL;
+	}
+}
+
+static void pfair_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+
+	BUG_ON(!is_realtime(t));
+
+	/* Remote task from release or ready queue, and ensure
+	 * that it is not the scheduled task for ANY CPU. We
+	 * do this blanket check because occassionally when
+	 * tasks exit while blocked, the task_cpu of the task
+	 * might not be the same as the CPU that the PFAIR scheduler
+	 * has chosen for it.
+	 */
+	spin_lock_irqsave(&pfair_lock, flags);
+
+	TRACE_TASK(t, "RIP, state:%d\n", t->state);
+	drop_all_references(t);
+
+	spin_unlock_irqrestore(&pfair_lock, flags);
+
+	kfree(t->rt_param.pfair);
+	t->rt_param.pfair = NULL;
+}
+
+
+static void pfair_release_at(struct task_struct* task, lt_t start)
+{
+	unsigned long flags;
+	lt_t now = litmus_clock();
+	quanta_t release, delta;
+
+	BUG_ON(!is_realtime(task));
+
+	spin_lock_irqsave(&pfair_lock, flags);
+	if (lt_before(now, start)) {
+		delta = time2quanta((long long) start - (long long) now, CEIL);
+		if (delta >= PFAIR_MAX_PERIOD)
+			delta = PFAIR_MAX_PERIOD - 1;
+	} else
+		delta = 10;  /* release in 10 ticks */
+
+	release = pfair_time + delta;
+
+	drop_all_references(task);
+	prepare_release(task, release);
+	pfair_add_release(task);
+	spin_unlock_irqrestore(&pfair_lock, flags);
+}
+
+static void init_subtask(struct subtask* sub, unsigned long i,
+			 lt_t quanta, lt_t period)
+{
+	/* since i is zero-based, the formulas are shifted by one */
+	lt_t tmp;
+
+	/* release */
+	tmp = period * i;
+	do_div(tmp, quanta); /* floor */
+	sub->release = (quanta_t) tmp;
+
+	/* deadline */
+	tmp = period * (i + 1);
+	if (do_div(tmp, quanta)) /* ceil */
+		tmp++;
+	sub->deadline = (quanta_t) tmp;
+
+	/* next release */
+	tmp = period * (i + 1);
+	do_div(tmp, quanta); /* floor */
+	sub->overlap =  sub->deadline - (quanta_t) tmp;
+
+	/* Group deadline.
+	 * Based on the formula given in Uma's thesis.
+	 */
+	if (2 * quanta >= period) {
+		/* heavy */
+		tmp = (sub->deadline - (i + 1)) * period;
+		if (do_div(tmp, (period - quanta))) /* ceil */
+			tmp++;
+		sub->group_deadline = (quanta_t) tmp;
+	} else
+		sub->group_deadline = 0;
+}
+
+static void dump_subtasks(struct task_struct* t)
+{
+	unsigned long i;
+	for (i = 0; i < t->rt_param.pfair->quanta; i++)
+		TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+			   i + 1,
+			   t->rt_param.pfair->subtasks[i].release,
+			   t->rt_param.pfair->subtasks[i].deadline,
+			   t->rt_param.pfair->subtasks[i].overlap,
+			   t->rt_param.pfair->subtasks[i].group_deadline);
+}
+
+static long pfair_admit_task(struct task_struct* t)
+{
+	lt_t quanta;
+	lt_t period;
+	s64  quantum_length = ktime_to_ns(tick_period);
+	struct pfair_param* param;
+	unsigned long i;
+
+	/* Pfair is a tick-based method, so the time
+	 * of interest is jiffies. Calculate tick-based
+	 * times for everything.
+	 * (Ceiling of exec cost, floor of period.)
+	 */
+
+	quanta = get_exec_cost(t);
+	period = get_rt_period(t);
+
+	quanta = time2quanta(get_exec_cost(t), CEIL);
+
+	if (do_div(period, quantum_length))
+		printk(KERN_WARNING
+		       "The period of %s/%d is not a multiple of %llu.\n",
+		       t->comm, t->pid, (unsigned long long) quantum_length);
+
+	if (period >= PFAIR_MAX_PERIOD) {
+		printk(KERN_WARNING
+		       "PFAIR: Rejecting task %s/%d; its period is too long.\n",
+		       t->comm, t->pid);
+		return -EINVAL;
+	}
+
+	param = kmalloc(sizeof(struct pfair_param) +
+			quanta * sizeof(struct subtask), GFP_ATOMIC);
+
+	if (!param)
+		return -ENOMEM;
+
+	param->quanta  = quanta;
+	param->cur     = 0;
+	param->release = 0;
+	param->period  = period;
+
+	for (i = 0; i < quanta; i++)
+		init_subtask(param->subtasks + i, i, quanta, period);
+
+	if (t->rt_param.pfair)
+		/* get rid of stale allocation */
+		kfree(t->rt_param.pfair);
+
+	t->rt_param.pfair = param;
+
+	/* spew out some debug info */
+	dump_subtasks(t);
+
+	return 0;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "PFAIR",
+	.tick			= pfair_tick,
+	.task_new		= pfair_task_new,
+	.task_exit		= pfair_task_exit,
+	.schedule		= pfair_schedule,
+	.task_wake_up		= pfair_task_wake_up,
+	.task_block		= pfair_task_block,
+	.admit_task		= pfair_admit_task,
+	.release_at		= pfair_release_at,
+	.complete_job		= complete_job
+};
+
+static int __init init_pfair(void)
+{
+	int cpu, i;
+	struct pfair_state *state;
+
+	/* initialize release queue */
+	for (i = 0; i < PFAIR_MAX_PERIOD; i++)
+		heap_init(&release_queue[i]);
+
+	/* initialize CPU state */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		state = &per_cpu(pfair_state, cpu);
+		state->cpu 	  = cpu;
+		state->cur_tick   = 0;
+		state->local_tick = 0;
+		state->linked     = NULL;
+		state->local      = NULL;
+		state->scheduled  = NULL;
+		state->missed_quanta = 0;
+		pstate[cpu] = state;
+	}
+
+	rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
+	return register_sched_plugin(&pfair_plugin);
+}
+
+module_init(init_pfair);
+
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 0000000..497d703
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,185 @@
+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
+ *
+ * This file includes the initialization of the plugin system, the no-op Linux
+ * scheduler plugin and some dummy functions.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
+#include <litmus/jobs.h>
+
+/*************************************************************
+ *                   Dummy plugin functions                  *
+ *************************************************************/
+
+static void litmus_dummy_finish_switch(struct task_struct * prev)
+{
+}
+
+static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
+{
+	return NULL;
+}
+
+static void litmus_dummy_tick(struct task_struct* tsk)
+{
+}
+
+static long litmus_dummy_admit_task(struct task_struct* tsk)
+{
+	printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
+		tsk->comm, tsk->pid);
+	return -EINVAL;
+}
+
+static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
+{
+}
+
+static void litmus_dummy_task_wake_up(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_block(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_exit(struct task_struct *task)
+{
+}
+
+static long litmus_dummy_complete_job(void)
+{
+	return -ENOSYS;
+}
+
+#ifdef CONFIG_FMLP
+
+static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
+					  struct task_struct *new_owner)
+{
+	return -ENOSYS;
+}
+
+static long litmus_dummy_return_priority(struct pi_semaphore *sem)
+{
+	return -ENOSYS;
+}
+
+static long litmus_dummy_pi_block(struct pi_semaphore *sem,
+				  struct task_struct *new_waiter)
+{
+	return -ENOSYS;
+}
+
+#endif
+
+
+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
+ * job.
+ */
+struct sched_plugin linux_sched_plugin = {
+	.plugin_name = "Linux",
+	.tick = litmus_dummy_tick,
+	.task_new   = litmus_dummy_task_new,
+	.task_exit = litmus_dummy_task_exit,
+	.task_wake_up = litmus_dummy_task_wake_up,
+	.task_block = litmus_dummy_task_block,
+	.complete_job = litmus_dummy_complete_job,
+	.schedule = litmus_dummy_schedule,
+	.finish_switch = litmus_dummy_finish_switch,
+#ifdef CONFIG_FMLP
+	.inherit_priority = litmus_dummy_inherit_priority,
+	.return_priority = litmus_dummy_return_priority,
+	.pi_block = litmus_dummy_pi_block,
+#endif
+	.admit_task = litmus_dummy_admit_task
+};
+
+/*
+ *	The reference to current plugin that is used to schedule tasks within
+ *	the system. It stores references to actual function implementations
+ *	Should be initialized by calling "init_***_plugin()"
+ */
+struct sched_plugin *litmus = &linux_sched_plugin;
+
+/* the list of registered scheduling plugins */
+static LIST_HEAD(sched_plugins);
+static DEFINE_SPINLOCK(sched_plugins_lock);
+
+#define CHECK(func) {\
+	if (!plugin->func) \
+		plugin->func = litmus_dummy_ ## func;}
+
+/* FIXME: get reference to module  */
+int register_sched_plugin(struct sched_plugin* plugin)
+{
+	printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
+	       plugin->plugin_name);
+
+	/* make sure we don't trip over null pointers later */
+	CHECK(finish_switch);
+	CHECK(schedule);
+	CHECK(tick);
+	CHECK(task_wake_up);
+	CHECK(task_exit);
+	CHECK(task_block);
+	CHECK(task_new);
+	CHECK(complete_job);
+#ifdef CONFIG_FMLP
+	CHECK(inherit_priority);
+	CHECK(return_priority);
+	CHECK(pi_block);
+#endif
+	CHECK(admit_task);
+
+	if (!plugin->release_at)
+		plugin->release_at = release_at;
+
+	spin_lock(&sched_plugins_lock);
+	list_add(&plugin->list, &sched_plugins);
+	spin_unlock(&sched_plugins_lock);
+
+	return 0;
+}
+
+
+/* FIXME: reference counting, etc. */
+struct sched_plugin* find_sched_plugin(const char* name)
+{
+	struct list_head *pos;
+	struct sched_plugin *plugin;
+
+	spin_lock(&sched_plugins_lock);
+	list_for_each(pos, &sched_plugins) {
+		plugin = list_entry(pos, struct sched_plugin, list);
+		if (!strcmp(plugin->plugin_name, name))
+		    goto out_unlock;
+	}
+	plugin = NULL;
+
+out_unlock:
+	spin_unlock(&sched_plugins_lock);
+	return plugin;
+}
+
+int print_sched_plugins(char* buf, int max)
+{
+	int count = 0;
+	struct list_head *pos;
+	struct sched_plugin *plugin;
+
+	spin_lock(&sched_plugins_lock);
+	list_for_each(pos, &sched_plugins) {
+		plugin = list_entry(pos, struct sched_plugin, list);
+		count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
+		if (max - count <= 0)
+			break;
+	}
+	spin_unlock(&sched_plugins_lock);
+	return 	count;
+}
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 0000000..0e9c9dd
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,454 @@
+
+/*
+ * kernel/sched_psn_edf.c
+ *
+ * Implementation of the PSN-EDF scheduler plugin.
+ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
+ *
+ * Suspensions and non-preemptable sections are supported.
+ * Priority inheritance is not supported.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+
+
+typedef struct {
+	rt_domain_t 		domain;
+	int          		cpu;
+	struct task_struct* 	scheduled; /* only RT tasks */
+
+/* scheduling lock
+ */
+#define slock domain.ready_lock
+/* protects the domain and
+ * serializes scheduling decisions
+ */
+} psnedf_domain_t;
+
+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
+
+#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
+#define local_pedf		(&__get_cpu_var(psnedf_domains))
+#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
+#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
+#define task_edf(task)		remote_edf(get_partition(task))
+#define task_pedf(task)		remote_pedf(get_partition(task))
+
+
+static void psnedf_domain_init(psnedf_domain_t* pedf,
+			       check_resched_needed_t check,
+			       release_job_t release,
+			       int cpu)
+{
+	edf_domain_init(&pedf->domain, check, release);
+	pedf->cpu      		= cpu;
+	pedf->scheduled		= NULL;
+}
+
+static void requeue(struct task_struct* t, rt_domain_t *edf)
+{
+	if (t->state != TASK_RUNNING)
+		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
+
+	set_rt_flags(t, RT_F_RUNNING);
+	if (is_released(t, litmus_clock()))
+		__add_ready(edf, t);
+	else
+		add_release(edf, t); /* it has got to wait */
+}
+
+/* we assume the lock is being held */
+static void preempt(psnedf_domain_t *pedf)
+{
+	if (smp_processor_id() == pedf->cpu) {
+		if (pedf->scheduled && is_np(pedf->scheduled))
+			request_exit_np(pedf->scheduled);
+		else
+			set_tsk_need_resched(current);
+	} else
+		/* in case that it is a remote CPU we have to defer the
+		 * the decision to the remote CPU
+		 */
+		smp_send_reschedule(pedf->cpu);
+}
+
+/* This check is trivial in partioned systems as we only have to consider
+ * the CPU of the partition.
+ */
+static int psnedf_check_resched(rt_domain_t *edf)
+{
+	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
+	int ret = 0;
+
+	/* because this is a callback from rt_domain_t we already hold
+	 * the necessary lock for the ready queue
+	 */
+	if (edf_preemption_needed(edf, pedf->scheduled)) {
+		preempt(pedf);
+		ret = 1;
+	}
+	return ret;
+}
+
+static void psnedf_tick(struct task_struct *t)
+{
+	psnedf_domain_t    *pedf         = local_pedf;
+
+	/* Check for inconsistency. We don't need the lock for this since
+	 * ->scheduled is only changed in schedule, which obviously is not
+	 *  executing in parallel on this CPU
+	 */
+	BUG_ON(is_realtime(t) && t != pedf->scheduled);
+
+	if (is_realtime(t) && budget_exhausted(t)) {
+		if (!is_np(t))
+			set_tsk_need_resched(t);
+		else {
+			TRACE("psnedf_scheduler_tick: "
+			      "%d is non-preemptable, "
+			      "preemption delayed.\n", t->pid);
+			request_exit_np(t);
+		}
+	}
+}
+
+static void job_completion(struct task_struct* t)
+{
+	TRACE_TASK(t, "job_completion().\n");
+	set_rt_flags(t, RT_F_SLEEP);
+	prepare_for_next_period(t);
+}
+
+static struct task_struct* psnedf_schedule(struct task_struct * prev)
+{
+	psnedf_domain_t* 	pedf = local_pedf;
+	rt_domain_t*		edf  = &pedf->domain;
+	struct task_struct*	next;
+
+	int 			out_of_time, sleep, preempt,
+				np, exists, blocks, resched;
+
+	spin_lock(&pedf->slock);
+
+	/* sanity checking */
+	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
+	BUG_ON(pedf->scheduled && !is_realtime(prev));
+
+	/* (0) Determine state */
+	exists      = pedf->scheduled != NULL;
+	blocks      = exists && !is_running(pedf->scheduled);
+	out_of_time = exists && budget_exhausted(pedf->scheduled);
+	np 	    = exists && is_np(pedf->scheduled);
+	sleep	    = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
+	preempt     = edf_preemption_needed(edf, prev);
+
+	/* If we need to preempt do so.
+	 * The following checks set resched to 1 in case of special
+	 * circumstances.
+	 */
+	resched = preempt;
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		resched = 1;
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * Multiple calls to request_exit_np() don't hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep))
+		request_exit_np(pedf->scheduled);
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this.
+	 */
+	if (!np && (out_of_time || sleep) && !blocks) {
+		job_completion(pedf->scheduled);
+		resched = 1;
+	}
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * Switch if we are in RT mode and have no task or if we need to
+	 * resched.
+	 */
+	next = NULL;
+	if ((!np || blocks) && (resched || !exists)) {
+		/* Take care of a previously scheduled
+		 * job by taking it out of the Linux runqueue.
+		 */
+		if (pedf->scheduled && !blocks)
+			requeue(pedf->scheduled, edf);
+		next = __take_ready(edf);
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	if (next) {
+		TRACE_TASK(next, " == next\n");
+		set_rt_flags(next, RT_F_RUNNING);
+	} else {
+		TRACE("becoming idle.\n");
+	}
+
+	pedf->scheduled = next;
+	spin_unlock(&pedf->slock);
+
+	return next;
+}
+
+
+/*	Prepare a task for running in RT mode
+ */
+static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+	rt_domain_t* 		edf  = task_edf(t);
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	unsigned long		flags;
+
+	TRACE_TASK(t, "new\n");
+
+	/* setup job parameters */
+	release_at(t, litmus_clock());
+
+	/* The task should be running in the queue, otherwise signal
+	 * code will try to wake it up with fatal consequences.
+	 */
+	spin_lock_irqsave(&pedf->slock, flags);
+	if (running) {
+		/* there shouldn't be anything else running at the time */
+		BUG_ON(pedf->scheduled);
+		pedf->scheduled = t;
+	} else {
+		requeue(t, edf);
+		/* maybe we have to reschedule */
+		preempt(pedf);
+	}
+	spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+static void psnedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long		flags;
+	psnedf_domain_t* 	pedf = task_pedf(task);
+	rt_domain_t* 		edf  = task_edf(task);
+	lt_t			now;
+
+	TRACE_TASK(task, "wake up\n");
+	spin_lock_irqsave(&pedf->slock, flags);
+	BUG_ON(is_queued(task));
+	/* We need to take suspensions because of semaphores into
+	 * account! If a job resumes after being suspended due to acquiring
+	 * a semaphore, it should never be treated as a new job release.
+	 *
+	 * FIXME: This should be done in some more predictable and userspace-controlled way.
+	 */
+	now = litmus_clock();
+	if (is_tardy(task, now) &&
+	    get_rt_flags(task) != RT_F_EXIT_SEM) {
+		/* new sporadic release */
+		release_at(task, now);
+		sched_trace_job_release(task);
+	}
+	requeue(task, edf);
+	spin_unlock_irqrestore(&pedf->slock, flags);
+	TRACE_TASK(task, "wake up done\n");
+}
+
+static void psnedf_task_block(struct task_struct *t)
+{
+	/* only running tasks can block, thus t is in no queue */
+	TRACE_TASK(t, "block, state=%d\n", t->state);
+	BUG_ON(!is_realtime(t));
+	BUG_ON(is_queued(t));
+}
+
+static void psnedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	rt_domain_t*		edf;
+
+	spin_lock_irqsave(&pedf->slock, flags);
+	if (is_queued(t)) {
+		/* dequeue */
+		edf  = task_edf(t);
+		remove(edf, t);
+	}
+	if (pedf->scheduled == t)
+		pedf->scheduled = NULL;
+	preempt(pedf);
+	spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+#ifdef CONFIG_FMLP
+static long psnedf_pi_block(struct pi_semaphore *sem,
+			    struct task_struct *new_waiter)
+{
+	psnedf_domain_t* 	pedf;
+	rt_domain_t*		edf;
+	struct task_struct*	t;
+	int cpu  = get_partition(new_waiter);
+
+	BUG_ON(!new_waiter);
+
+	if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
+		TRACE_TASK(new_waiter, " boosts priority\n");
+		pedf = task_pedf(new_waiter);
+		edf  = task_edf(new_waiter);
+
+		/* interrupts already disabled */
+		spin_lock(&pedf->slock);
+
+		/* store new highest-priority task */
+		sem->hp.cpu_task[cpu] = new_waiter;
+		if (sem->holder &&
+		    get_partition(sem->holder) == get_partition(new_waiter)) {
+			/* let holder inherit */
+			sem->holder->rt_param.inh_task = new_waiter;
+			t = sem->holder;
+			if (is_queued(t)) {
+				/* queued in domain*/
+				remove(edf, t);
+				/* readd to make priority change take place */
+				/* FIXME: this looks outdated */
+				if (is_released(t, litmus_clock()))
+					__add_ready(edf, t);
+				else
+					add_release(edf, t);
+			}
+		}
+
+		/* check if we need to reschedule */
+		if (edf_preemption_needed(edf, current))
+			preempt(pedf);
+
+		spin_unlock(&pedf->slock);
+	}
+
+	return 0;
+}
+
+static long psnedf_inherit_priority(struct pi_semaphore *sem,
+				    struct task_struct *new_owner)
+{
+	int cpu  = get_partition(new_owner);
+
+	new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
+	if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
+		TRACE_TASK(new_owner,
+			   "inherited priority from %s/%d\n",
+			   sem->hp.cpu_task[cpu]->comm,
+			   sem->hp.cpu_task[cpu]->pid);
+	} else
+		TRACE_TASK(new_owner,
+			   "cannot inherit priority: "
+			   "no higher priority job waits on this CPU!\n");
+	/* make new owner non-preemptable as required by FMLP under
+	 * PSN-EDF.
+	 */
+	make_np(new_owner);
+	return 0;
+}
+
+
+/* This function is called on a semaphore release, and assumes that
+ * the current task is also the semaphore holder.
+ */
+static long psnedf_return_priority(struct pi_semaphore *sem)
+{
+	struct task_struct* 	t    = current;
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	rt_domain_t*		edf  = task_edf(t);
+	int 			ret  = 0;
+	int			cpu  = get_partition(current);
+
+
+        /* Find new highest-priority semaphore task
+	 * if holder task is the current hp.cpu_task[cpu].
+	 *
+	 * Calling function holds sem->wait.lock.
+	 */
+	if (t == sem->hp.cpu_task[cpu])
+		edf_set_hp_cpu_task(sem, cpu);
+
+	take_np(t);
+	if (current->rt_param.inh_task) {
+		TRACE_CUR("return priority of %s/%d\n",
+			  current->rt_param.inh_task->comm,
+			  current->rt_param.inh_task->pid);
+		spin_lock(&pedf->slock);
+
+		/* Reset inh_task to NULL. */
+		current->rt_param.inh_task = NULL;
+
+		/* check if we need to reschedule */
+		if (edf_preemption_needed(edf, current))
+			preempt(pedf);
+
+		spin_unlock(&pedf->slock);
+	} else
+		TRACE_CUR(" no priority to return %p\n", sem);
+
+	return ret;
+}
+
+#endif
+
+static long psnedf_admit_task(struct task_struct* tsk)
+{
+	return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "PSN-EDF",
+#ifdef CONFIG_SRP
+	.srp_active		= 1,
+#endif
+	.tick			= psnedf_tick,
+	.task_new		= psnedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= psnedf_task_exit,
+	.schedule		= psnedf_schedule,
+	.task_wake_up		= psnedf_task_wake_up,
+	.task_block		= psnedf_task_block,
+#ifdef CONFIG_FMLP
+	.fmlp_active		= 1,
+	.pi_block		= psnedf_pi_block,
+	.inherit_priority	= psnedf_inherit_priority,
+	.return_priority	= psnedf_return_priority,
+#endif
+	.admit_task		= psnedf_admit_task
+};
+
+
+static int __init init_psn_edf(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+	{
+		psnedf_domain_init(remote_pedf(i),
+				   psnedf_check_resched,
+				   NULL, i);
+	}
+	return register_sched_plugin(&psn_edf_plugin);
+}
+
+
+
+module_init(init_psn_edf);
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 0000000..4344785
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,569 @@
+/* sched_trace.c -- record scheduling events to a byte stream.
+ *
+ * TODO: Move ring buffer to a lockfree implementation.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+
+#include <litmus/sched_trace.h>
+#include <litmus/litmus.h>
+
+
+typedef struct {
+        /*	guard read and write pointers			*/
+	spinlock_t 	lock;
+	/*	guard against concurrent freeing of buffer 	*/
+	rwlock_t	del_lock;
+
+	/*	memory allocated for ring buffer		*/
+	unsigned long	order;
+	char*  		buf;
+	char*		end;
+
+	/*	Read/write pointer. May not cross.
+	 *	They point to the position of next write and
+	 *	last read.
+	 */
+	char* 		writep;
+	char*		readp;
+
+} ring_buffer_t;
+
+#define EMPTY_RING_BUFFER {	\
+	.lock     = SPIN_LOCK_UNLOCKED,		\
+	.del_lock = RW_LOCK_UNLOCKED,  		\
+	.buf      = NULL,      			\
+	.end      = NULL,			\
+	.writep   = NULL,			\
+	.readp    = NULL			\
+}
+
+void rb_init(ring_buffer_t* buf)
+{
+	*buf = (ring_buffer_t) EMPTY_RING_BUFFER;
+}
+
+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
+{
+	unsigned long flags;
+	int error = 0;
+	char *mem;
+
+	/* do memory allocation while not atomic */
+	mem = (char *) __get_free_pages(GFP_KERNEL, order);
+	if (!mem)
+		return -ENOMEM;
+	write_lock_irqsave(&buf->del_lock, flags);
+	BUG_ON(buf->buf);
+	buf->buf = mem;
+	buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
+	memset(buf->buf, 0xff, buf->end - buf->buf);
+	buf->order = order;
+	buf->writep = buf->buf + 1;
+	buf->readp  = buf->buf;
+	write_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+int rb_free_buf(ring_buffer_t* buf)
+{
+	unsigned long flags;
+	int error = 0;
+	write_lock_irqsave(&buf->del_lock, flags);
+	BUG_ON(!buf->buf);
+	free_pages((unsigned long) buf->buf, buf->order);
+	buf->buf    = NULL;
+	buf->end    = NULL;
+	buf->writep = NULL;
+	buf->readp  = NULL;
+	write_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+/* Assumption: concurrent writes are serialized externally
+ *
+ * Will only succeed if there is enough space for all len bytes.
+ */
+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
+{
+	unsigned long flags;
+	char* r , *w;
+	int error = 0;
+	read_lock_irqsave(&buf->del_lock, flags);
+	if (!buf->buf) {
+		error = -ENODEV;
+		goto out;
+	}
+	spin_lock(&buf->lock);
+	r = buf->readp;
+	w = buf->writep;
+	spin_unlock(&buf->lock);
+	if (r < w && buf->end - w >= len - 1) {
+		/* easy case: there is enough space in the buffer
+		 * to write it in one continous chunk*/
+		memcpy(w, mem, len);
+		w += len;
+		if (w > buf->end)
+			/* special case: fit exactly into buffer
+			 * w is now buf->end + 1
+			 */
+			w = buf->buf;
+	} else if (w < r && r - w >= len) { /* >= len because  may not cross */
+		/* we are constrained by the read pointer but we there
+		 * is enough space
+		 */
+		memcpy(w, mem, len);
+		w += len;
+	} else if (r <= w && buf->end - w < len - 1) {
+		/* the wrap around case: there may or may not be space */
+		if ((buf->end - w) + (r - buf->buf) >= len - 1) {
+			/* copy chunk that fits at the end */
+			memcpy(w, mem, buf->end - w + 1);
+			mem += buf->end - w + 1;
+			len -= (buf->end - w + 1);
+			w = buf->buf;
+			/* copy the rest */
+			memcpy(w, mem, len);
+			w += len;
+		}
+		else
+			error = -ENOMEM;
+	} else {
+		error = -ENOMEM;
+	}
+	if (!error) {
+		spin_lock(&buf->lock);
+		buf->writep = w;
+		spin_unlock(&buf->lock);
+	}
+ out:
+	read_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+/* Assumption: concurrent reads are serialized externally */
+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
+{
+	unsigned long flags;
+	char* r , *w;
+	int error = 0;
+	read_lock_irqsave(&buf->del_lock, flags);
+	if (!buf->buf) {
+		error = -ENODEV;
+		goto out;
+	}
+	spin_lock(&buf->lock);
+	r = buf->readp;
+	w = buf->writep;
+	spin_unlock(&buf->lock);
+
+	if (w <= r && buf->end - r >= len) {
+		/* easy case: there is enough data in the buffer
+		 * to get it in one  chunk*/
+		memcpy(mem, r + 1, len);
+		r += len;
+		error = len;
+
+	} else if (r + 1 < w && w - r - 1 >= len) {
+		/* we are constrained by the write pointer but
+		 * there is enough data
+		 */
+		memcpy(mem, r + 1, len);
+		r += len;
+		error = len;
+
+	} else if (r + 1 < w && w - r - 1 < len) {
+		/* we are constrained by the write pointer and there
+		 * there is not enough data
+		 */
+		memcpy(mem, r + 1, w - r - 1);
+		error = w - r - 1;
+		r    += w - r - 1;
+
+	} else if (w <= r && buf->end - r < len) {
+		/* the wrap around case: there may or may not be enough data
+		 * first let's get what is available
+		 */
+		memcpy(mem, r + 1, buf->end - r);
+		error += (buf->end - r);
+		mem   += (buf->end - r);
+		len   -= (buf->end - r);
+		r     += (buf->end - r);
+
+		if (w > buf->buf) {
+			/* there is more to get */
+			r = buf->buf - 1;
+			if (w - r >= len) {
+				/* plenty */
+				memcpy(mem, r + 1, len);
+				error += len;
+				r     += len;
+			} else {
+				memcpy(mem, r + 1, w - r - 1);
+				error += w - r - 1;
+				r     += w - r - 1;
+			}
+		}
+	} /* nothing available */
+
+	if (error > 0) {
+		spin_lock(&buf->lock);
+		buf->readp = r;
+		spin_unlock(&buf->lock);
+	}
+ out:
+	read_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+
+
+/******************************************************************************/
+/*                        DEVICE FILE DRIVER                                  */
+/******************************************************************************/
+
+
+
+/* Allocate a buffer of about 1 MB per CPU.
+ *
+ */
+#define BUFFER_ORDER 8
+
+typedef struct {
+	ring_buffer_t 		buf;
+	atomic_t		reader_cnt;
+	struct semaphore	reader_mutex;
+} trace_buffer_t;
+
+
+/* This does not initialize the semaphore!! */
+
+#define EMPTY_TRACE_BUFFER \
+	{ .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
+
+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+static spinlock_t		log_buffer_lock = SPIN_LOCK_UNLOCKED;
+#endif
+static trace_buffer_t 		log_buffer = EMPTY_TRACE_BUFFER;
+
+static void init_buffers(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		rb_init(&per_cpu(trace_buffer, i).buf);
+		init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
+		atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
+	}
+	/* only initialize the mutex, the rest was initialized as part
+	 * of the static initialization macro
+	 */
+	init_MUTEX(&log_buffer.reader_mutex);
+}
+
+static int trace_release(struct inode *in, struct file *filp)
+{
+	int error 		= -EINVAL;
+	trace_buffer_t* buf 	= filp->private_data;
+
+	BUG_ON(!filp->private_data);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/*	last release must deallocate buffers 	*/
+	if (atomic_dec_return(&buf->reader_cnt) == 0) {
+		error = rb_free_buf(&buf->buf);
+	}
+
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
+		      loff_t *f_pos)
+{
+	/* 	we ignore f_pos, this is strictly sequential */
+
+	ssize_t error = -EINVAL;
+	char*   mem;
+	trace_buffer_t *buf = filp->private_data;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	if (len > 64 * 1024)
+		len = 64 * 1024;
+	mem = kmalloc(len, GFP_KERNEL);
+	if (!mem) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	error = rb_get(&buf->buf, mem, len);
+	while (!error) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(110);
+		if (signal_pending(current))
+			error = -ERESTARTSYS;
+		else
+			error = rb_get(&buf->buf, mem, len);
+	}
+
+	if (error > 0 && copy_to_user(to, mem, error))
+		error = -EFAULT;
+
+	kfree(mem);
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+
+/* trace_open - Open one of the per-CPU sched_trace buffers.
+ */
+static int trace_open(struct inode *in, struct file *filp)
+{
+	int error 		= -EINVAL;
+	int cpu   		= MINOR(in->i_rdev);
+	trace_buffer_t* buf;
+
+	if (!cpu_online(cpu)) {
+		printk(KERN_WARNING "sched trace: "
+			"CPU #%d is not online. (open failed)\n", cpu);
+		error = -ENODEV;
+		goto out;
+	}
+
+	buf = &per_cpu(trace_buffer, cpu);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/*	first open must allocate buffers 	*/
+	if (atomic_inc_return(&buf->reader_cnt) == 1) {
+		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+		{
+			atomic_dec(&buf->reader_cnt);
+			goto out_unlock;
+		}
+	}
+
+	error = 0;
+	filp->private_data = buf;
+
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+
+extern int trace_override;
+
+/* log_open - open the global log message ring buffer.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+	int error 		= -EINVAL;
+	trace_buffer_t* buf;
+
+	buf = &log_buffer;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/*	first open must allocate buffers 	*/
+	if (atomic_inc_return(&buf->reader_cnt) == 1) {
+		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+		{
+			atomic_dec(&buf->reader_cnt);
+			goto out_unlock;
+		}
+	}
+
+	error = 0;
+	filp->private_data = buf;
+	printk(KERN_DEBUG "sched_trace buf: from 0x%p to 0x%p  length: %lx\n",
+	       buf->buf.buf, buf->buf.end, buf->buf.end - buf->buf.buf);
+	trace_override++;
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+static int log_release(struct inode *in, struct file *filp)
+{
+	int error 		= -EINVAL;
+	trace_buffer_t* buf 	= filp->private_data;
+
+	BUG_ON(!filp->private_data);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/*	last release must deallocate buffers 	*/
+	if (atomic_dec_return(&buf->reader_cnt) == 0) {
+		error = rb_free_buf(&buf->buf);
+	}
+
+	trace_override--;
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+/******************************************************************************/
+/*                          Device Registration                               */
+/******************************************************************************/
+
+/* the major numbes are from the unassigned/local use block
+ *
+ * This should be converted to dynamic allocation at some point...
+ */
+#define TRACE_MAJOR	250
+#define LOG_MAJOR	251
+
+/* trace_fops - The file operations for accessing the per-CPU scheduling event
+ *              trace buffers.
+ */
+struct file_operations trace_fops = {
+	.owner   = THIS_MODULE,
+	.open    = trace_open,
+	.release = trace_release,
+	.read    = trace_read,
+};
+
+/* log_fops  - The file operations for accessing the global LITMUS log message
+ *             buffer.
+ *
+ * Except for opening the device file it uses the same operations as trace_fops.
+ */
+struct file_operations log_fops = {
+	.owner   = THIS_MODULE,
+	.open    = log_open,
+	.release = log_release,
+	.read    = trace_read,
+};
+
+static int __init register_buffer_dev(const char* name,
+				      struct file_operations* fops,
+				      int major, int count)
+{
+	dev_t  trace_dev;
+	struct cdev *cdev;
+	int error = 0;
+
+	trace_dev = MKDEV(major, 0);
+	error     = register_chrdev_region(trace_dev, count, name);
+	if (error)
+	{
+		printk(KERN_WARNING "sched trace: "
+		       "Could not register major/minor number %d\n", major);
+		return error;
+	}
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_WARNING "sched trace: "
+			"Could not get a cdev for %s.\n", name);
+		return -ENOMEM;
+	}
+	cdev->owner = THIS_MODULE;
+	cdev->ops   = fops;
+	error = cdev_add(cdev, trace_dev, count);
+	if (error) {
+		printk(KERN_WARNING "sched trace: "
+			"add_cdev failed for %s.\n", name);
+		return -ENOMEM;
+	}
+	return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+	int error1 = 0, error2 = 0;
+
+	printk("Initializing scheduler trace device\n");
+	init_buffers();
+
+	error1 = register_buffer_dev("schedtrace", &trace_fops,
+				    TRACE_MAJOR, NR_CPUS);
+
+	error2 = register_buffer_dev("litmus_log", &log_fops,
+				     LOG_MAJOR, 1);
+	if (error1 || error2)
+		return min(error1, error2);
+	else
+		return 0;
+}
+
+module_init(init_sched_trace);
+
+/******************************************************************************/
+/*                                KERNEL API                                  */
+/******************************************************************************/
+
+/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for
+ * that and the kernel gets very picky with nested interrupts and small stacks.
+ */
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+
+#define MSG_SIZE 255
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/* sched_trace_log_message - This is the only function that accesses the the
+ *                           log buffer inside the kernel for writing.
+ *                           Concurrent access to it is serialized via the
+ *                           log_buffer_lock.
+ *
+ *                           The maximum length of a formatted message is 255.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+	unsigned long 	flags;
+	va_list 	args;
+	size_t		len;
+	char*		buf;
+
+	va_start(args, fmt);
+	local_irq_save(flags);
+
+	/* format message */
+	buf = __get_cpu_var(fmt_buffer);
+	len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+	spin_lock(&log_buffer_lock);
+	/* Don't copy the trailing null byte, we don't want null bytes
+	 * in a text file.
+	 */
+	rb_put(&log_buffer.buf, buf, len);
+	spin_unlock(&log_buffer_lock);
+
+	local_irq_restore(flags);
+	va_end(args);
+}
+
+#endif
+
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 0000000..6e670f9
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,318 @@
+/* ************************************************************************** */
+/*                          STACK RESOURCE POLICY                             */
+/* ************************************************************************** */
+
+#include <asm/atomic.h>
+#include <linux/wait.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
+#include <litmus/fdso.h>
+
+#include <litmus/trace.h>
+
+
+#ifdef CONFIG_SRP
+
+struct srp_priority {
+	struct list_head	list;
+        unsigned int 		period;
+	pid_t			pid;
+};
+
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+
+/* SRP task priority comparison function. Smaller periods have highest
+ * priority, tie-break is PID. Special case: period == 0 <=> no priority
+ */
+static int srp_higher_prio(struct srp_priority* first,
+			   struct srp_priority* second)
+{
+	if (!first->period)
+		return 0;
+	else
+		return  !second->period ||
+			first->period < second->period || (
+			first->period == second->period &&
+			first->pid < second->pid);
+}
+
+struct srp {
+	struct list_head	ceiling;
+	wait_queue_head_t	ceiling_blocked;
+};
+
+
+atomic_t srp_objects_in_use = ATOMIC_INIT(0);
+
+DEFINE_PER_CPU(struct srp, srp);
+
+
+/* Initialize SRP semaphores at boot time. */
+static int __init srp_init(void)
+{
+	int i;
+
+	printk("Initializing SRP per-CPU ceilings...");
+	for (i = 0; i < NR_CPUS; i++) {
+		init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
+		INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
+	}
+	printk(" done!\n");
+
+	return 0;
+}
+module_init(srp_init);
+
+
+#define system_ceiling(srp) list2prio(srp->ceiling.next)
+
+
+#define UNDEF_SEM -2
+
+
+/* struct for uniprocessor SRP "semaphore" */
+struct srp_semaphore {
+	struct srp_priority ceiling;
+	struct task_struct* owner;
+	int cpu; /* cpu associated with this "semaphore" and resource */
+};
+
+#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) 
+
+static int srp_exceeds_ceiling(struct task_struct* first,
+			       struct srp* srp)
+{
+	return list_empty(&srp->ceiling) ||
+	       get_rt_period(first) < system_ceiling(srp)->period ||
+	       (get_rt_period(first) == system_ceiling(srp)->period &&
+		first->pid < system_ceiling(srp)->pid) || 
+		ceiling2sem(system_ceiling(srp))->owner == first;
+}
+
+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
+{
+	struct list_head *pos;
+	if (in_list(&prio->list)) {
+		printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
+		       "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
+		return;
+	}
+	list_for_each(pos, &srp->ceiling)
+		if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
+			__list_add(&prio->list, pos->prev, pos);
+			return;
+		}
+
+	list_add_tail(&prio->list, &srp->ceiling);
+}
+
+
+static void* create_srp_semaphore(void)
+{
+	struct srp_semaphore* sem;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	INIT_LIST_HEAD(&sem->ceiling.list);
+	sem->ceiling.period = 0;
+	sem->cpu     = UNDEF_SEM;
+	sem->owner   = NULL;
+	atomic_inc(&srp_objects_in_use);
+	return sem;
+}
+
+static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg)
+{
+	struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj;
+	int ret = 0;
+	struct task_struct* t = current;
+	struct srp_priority t_prio;
+
+	TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
+	if (!srp_active())
+		return -EBUSY;
+
+	if (sem->cpu == UNDEF_SEM)
+		sem->cpu = get_partition(t);
+	else if (sem->cpu != get_partition(t))
+		ret = -EPERM;
+
+	if (ret == 0) {
+		t_prio.period = get_rt_period(t);
+		t_prio.pid    = t->pid;
+		if (srp_higher_prio(&t_prio, &sem->ceiling)) {
+			sem->ceiling.period = t_prio.period;
+			sem->ceiling.pid    = t_prio.pid;
+		}
+	}
+
+	return ret;
+}
+
+static void destroy_srp_semaphore(void* sem)
+{
+	/* XXX invariants */
+	atomic_dec(&srp_objects_in_use);
+	kfree(sem);
+}
+
+struct fdso_ops srp_sem_ops = {
+	.create  = create_srp_semaphore,
+	.open    = open_srp_semaphore,
+	.destroy = destroy_srp_semaphore
+};
+
+
+static void do_srp_down(struct srp_semaphore* sem)
+{
+	/* Update ceiling. */
+	srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
+	WARN_ON(sem->owner != NULL);
+	sem->owner = current;
+	TRACE_CUR("acquired srp 0x%p\n", sem);
+}
+
+static void do_srp_up(struct srp_semaphore* sem)
+{	
+	/* Determine new system priority ceiling for this CPU. */
+	WARN_ON(!in_list(&sem->ceiling.list));
+	if (in_list(&sem->ceiling.list))
+		list_del(&sem->ceiling.list);
+
+	sem->owner = NULL;
+
+	/* Wake tasks on this CPU, if they exceed current ceiling. */
+	TRACE_CUR("released srp 0x%p\n", sem);
+	wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
+}
+
+/* Adjust the system-wide priority ceiling if resource is claimed. */
+asmlinkage long sys_srp_down(int sem_od)
+{
+	int cpu;
+	int ret = -EINVAL;
+	struct srp_semaphore* sem;
+
+	/* disabling preemptions is sufficient protection since
+	 * SRP is strictly per CPU and we don't interfere with any
+	 * interrupt handlers
+	 */
+	preempt_disable();
+	TS_SRP_DOWN_START;
+
+	cpu = smp_processor_id();
+	sem = lookup_srp_sem(sem_od);
+	if (sem && sem->cpu == cpu) {
+		do_srp_down(sem);
+		ret = 0;
+	}
+
+	TS_SRP_DOWN_END;
+	preempt_enable();
+	return ret;
+}
+
+/* Adjust the system-wide priority ceiling if resource is freed. */
+asmlinkage long sys_srp_up(int sem_od)
+{
+	int cpu;
+	int ret = -EINVAL;
+	struct srp_semaphore* sem;
+
+	preempt_disable();
+	TS_SRP_UP_START;
+
+	cpu = smp_processor_id();
+	sem = lookup_srp_sem(sem_od);
+
+	if (sem && sem->cpu == cpu) {
+		do_srp_up(sem);
+		ret = 0;
+	}
+
+	TS_SRP_UP_END;
+	preempt_enable();
+	return ret;
+}
+
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+		       void *key)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *tsk = wait->private;
+	if (cpu != get_partition(tsk))
+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b",
+			   get_partition(tsk));
+	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+		return default_wake_function(wait, mode, sync, key);
+	return 0;
+}
+
+
+
+static void do_ceiling_block(struct task_struct *tsk)
+{
+	wait_queue_t wait = {
+		.private   = tsk,
+		.func      = srp_wake_up,
+		.task_list = {NULL, NULL}
+	};
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+	tsk->rt_param.srp_non_recurse = 1;
+	preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+	tsk->rt_param.srp_non_recurse = 0;
+	remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+}
+
+/* Wait for current task priority to exceed system-wide priority ceiling.
+ */
+void srp_ceiling_block(void)
+{
+	struct task_struct *tsk = current;
+
+	/* Only applies to real-time tasks, but optimize for RT tasks. */
+	if (unlikely(!is_realtime(tsk)))
+		return;
+
+	/* Avoid recursive ceiling blocking. */
+	if (unlikely(tsk->rt_param.srp_non_recurse))
+		return;
+
+	/* Bail out early if there aren't any SRP resources around. */
+	if (likely(!atomic_read(&srp_objects_in_use)))
+		return;
+
+	preempt_disable();
+	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+		TRACE_CUR("is priority ceiling blocked.\n");
+		while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+			do_ceiling_block(tsk);
+		TRACE_CUR("finally exceeds system ceiling.\n");
+	} else
+		TRACE_CUR("is not priority ceiling blocked\n");	
+	preempt_enable();
+}
+
+
+#else
+
+asmlinkage long sys_srp_down(int sem_od)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_srp_up(int sem_od)
+{
+	return -ENOSYS;
+}
+
+struct fdso_ops srp_sem_ops = {};
+
+#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 0000000..c16f1dd
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,86 @@
+/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
+ *
+ *
+ */
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/jobs.h>
+
+static DECLARE_COMPLETION(ts_release);
+
+static long do_wait_for_ts_release(void)
+{
+	long ret = 0;
+
+	/* If the interruption races with a release, the completion object
+	 * may have a non-zero counter. To avoid this problem, this should
+	 * be replaced by wait_for_completion().
+	 *
+	 * For debugging purposes, this is interruptible for now.
+	 */
+	ret = wait_for_completion_interruptible(&ts_release);
+
+	return ret;
+}
+
+
+static long do_release_ts(lt_t start)
+{
+	int  task_count = 0;
+	long flags;
+	struct list_head	*pos;
+	struct task_struct 	*t;
+
+
+	spin_lock_irqsave(&ts_release.wait.lock, flags);
+	TRACE("<<<<<< synchronous task system release >>>>>>\n");
+	
+	list_for_each(pos, &ts_release.wait.task_list) {
+		t = (struct task_struct*) list_entry(pos,
+						     struct __wait_queue,
+						     task_list)->private;
+		task_count++;
+		litmus->release_at(t, start + t->rt_param.task_params.phase);
+	}
+
+	spin_unlock_irqrestore(&ts_release.wait.lock, flags);
+
+	complete_n(&ts_release, task_count);
+
+	return task_count;
+}
+
+
+asmlinkage long sys_wait_for_ts_release(void)
+{
+	long ret = -EPERM;
+	struct task_struct *t = current;
+
+	if (is_realtime(t))
+		ret = do_wait_for_ts_release();
+
+	return ret;
+}
+
+
+asmlinkage long sys_release_ts(lt_t __user *__delay)
+{
+	long ret;
+	lt_t delay;
+
+	/* FIXME: check capabilities... */
+
+	ret = copy_from_user(&delay, __delay, sizeof(lt_t));
+	if (ret == 0)
+		ret = do_release_ts(litmus_clock() + delay);
+
+	return ret;
+}
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 0000000..dadb09d
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,335 @@
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/trace.h>
+
+/******************************************************************************/
+/*                          Allocation                                        */
+/******************************************************************************/
+
+struct ft_buffer* trace_ts_buf = NULL;
+
+static unsigned int ts_seq_no = 0;
+
+static inline void __save_timestamp(unsigned long event, uint8_t type)
+{
+	unsigned int seq_no;
+	struct timestamp *ts;
+	seq_no = fetch_and_inc((int *) &ts_seq_no);
+	if (ft_buffer_start_write(trace_ts_buf, (void**)  &ts)) {
+		ts->event     = event;
+		ts->timestamp = ft_timestamp();
+		ts->seq_no    = seq_no;
+		ts->cpu       = raw_smp_processor_id();
+		ts->task_type = type;
+		ft_buffer_finish_write(trace_ts_buf, ts);
+	}
+}
+
+feather_callback void save_timestamp(unsigned long event)
+{
+	__save_timestamp(event, TSK_UNKNOWN);
+}
+
+feather_callback void save_timestamp_def(unsigned long event, unsigned long type)
+{
+	__save_timestamp(event, (uint8_t) type);
+}
+
+feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr)
+{
+	int rt = is_realtime((struct task_struct *) t_ptr);
+	__save_timestamp(event, rt ? TSK_RT : TSK_BE);
+}
+
+static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
+{
+	struct ft_buffer* buf;
+	size_t total = (size + 1) * count;
+	char* mem;
+	int order = 0, pages = 1;
+
+	buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+	while (pages < total) {
+		order++;
+		pages *= 2;
+	}
+
+	mem = (char*) __get_free_pages(GFP_KERNEL, order);
+	if (!mem) {
+		kfree(buf);
+		return NULL;
+	}
+
+	if (!init_ft_buffer(buf, count, size,
+			    mem + (count * size),  /* markers at the end */
+			    mem)) {                /* buffer objects     */
+		free_pages((unsigned long) mem, order);
+		kfree(buf);
+		return NULL;
+	}
+	return buf;
+}
+
+static void free_ft_buffer(struct ft_buffer* buf)
+{
+	int order = 0, pages = 1;
+	size_t total;
+
+	if (buf) {
+		total = (buf->slot_size + 1) * buf->slot_count;
+		total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+		while (pages < total) {
+			order++;
+			pages *= 2;
+		}
+		free_pages((unsigned long) buf->buffer_mem, order);
+		kfree(buf);
+	}
+}
+
+
+/******************************************************************************/
+/*                        DEVICE FILE DRIVER                                  */
+/******************************************************************************/
+
+#define NO_TIMESTAMPS (2 << 19) /* that should be 8 megs of ram, we may not get
+				 * as much */
+
+static DECLARE_MUTEX(feather_lock);
+static int use_count = 0;
+
+/* used for draining the FT buffers */
+static int enabled_events = 0;
+
+static int trace_release(struct inode *in, struct file *filp)
+{
+	int err 		= -EINVAL;
+
+	if (down_interruptible(&feather_lock)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
+	       "use_count=%d\n",
+	       current->comm, current->pid, use_count);
+
+	if (use_count == 1) {
+		/* disable events */
+		ft_disable_all_events();
+		enabled_events = 0;
+
+		/* wait for any pending events to complete */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ);
+
+		printk(KERN_ALERT "Failed trace writes: %u\n",
+		       trace_ts_buf->failed_writes);
+
+		free_ft_buffer(trace_ts_buf);
+		trace_ts_buf = NULL;
+	}
+
+	/* dummy entry to make linker happy */
+	ft_event0(666, save_timestamp);
+
+	use_count--;
+	up(&feather_lock);
+out:
+	return err;
+}
+
+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
+		      loff_t *f_pos)
+{
+	/* 	we ignore f_pos, this is strictly sequential */
+	ssize_t error = 0;
+	struct timestamp ts;
+
+	if (down_interruptible(&feather_lock)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+
+	while (len >= sizeof(struct timestamp)) {
+		if (ft_buffer_read(trace_ts_buf, &ts)) {
+			/* FIXME: avoid double copy */
+			if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
+				error = -EFAULT;
+				break;
+			} else {
+				len    -= sizeof(struct timestamp);
+				to     += sizeof(struct timestamp);
+				error  += sizeof(struct timestamp);
+			}
+	        } else if (enabled_events) {
+			/* only wait if there are any events enabled */
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(50);
+			if (signal_pending(current)) {
+				error = -ERESTARTSYS;
+				break;
+			}
+		} else
+			/* nothing left to get, return to user space */
+			break;
+	}
+	up(&feather_lock);
+out:
+	return error;
+}
+
+#define ENABLE_CMD 	0
+#define DISABLE_CMD 	1
+
+typedef uint32_t cmd_t;
+
+static ssize_t trace_write(struct file *filp, const char __user *from,
+			   size_t len, loff_t *f_pos)
+{
+	ssize_t error = -EINVAL;
+	cmd_t cmd;
+	cmd_t id;
+
+	if (len % sizeof(cmd_t) || len < 2 * sizeof(cmd_t))
+		goto out;
+
+	if (copy_from_user(&cmd, from, sizeof(cmd_t))) {
+		error = -EFAULT;
+	        goto out;
+	}
+	len  -= sizeof(cmd_t);
+	from += sizeof(cmd_t);
+
+	if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
+		goto out;
+
+	if (down_interruptible(&feather_lock)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	error = sizeof(cmd_t);
+	while (len) {
+		if (copy_from_user(&id, from, sizeof(cmd_t))) {
+			error = -EFAULT;
+			goto out;
+		}
+		len  -= sizeof(cmd_t);
+		from += sizeof(cmd_t);
+		if (cmd) {
+			printk(KERN_INFO
+			       "Disabling feather-trace event %d.\n", (int) id);
+			ft_disable_event(id);
+			enabled_events--;
+		} else {
+			printk(KERN_INFO
+			       "Enabling feather-trace event %d.\n", (int) id);
+			ft_enable_event(id);
+			enabled_events++;
+		}
+		error += sizeof(cmd_t);
+	}
+
+	up(&feather_lock);
+ out:
+	return error;
+}
+
+static int trace_open(struct inode *in, struct file *filp)
+{
+	int err = 0;
+        unsigned int count = NO_TIMESTAMPS;
+
+	if (down_interruptible(&feather_lock)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	while (count && !trace_ts_buf) {
+		printk("trace: trying to allocate %u time stamps.\n", count);
+		trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
+		count /= 2;
+	}
+	if (!trace_ts_buf)
+		err = -ENOMEM;
+	else
+		use_count++;
+
+	up(&feather_lock);
+out:
+	return err;
+}
+
+/******************************************************************************/
+/*                          Device Registration                               */
+/******************************************************************************/
+
+#define FT_TRACE_MAJOR	252
+
+struct file_operations ft_trace_fops = {
+	.owner   = THIS_MODULE,
+	.open    = trace_open,
+	.release = trace_release,
+	.write   = trace_write,
+	.read    = trace_read,
+};
+
+
+static int __init register_buffer_dev(const char* name,
+				      struct file_operations* fops,
+				      int major, int count)
+{
+	dev_t   trace_dev;
+	struct cdev *cdev;
+	int error = 0;
+
+	trace_dev = MKDEV(major, 0);
+	error     = register_chrdev_region(trace_dev, count, name);
+	if (error)
+	{
+		printk(KERN_WARNING "trace: "
+		       "Could not register major/minor number %d\n", major);
+		return error;
+	}
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_WARNING "trace: "
+			"Could not get a cdev for %s.\n", name);
+		return -ENOMEM;
+	}
+	cdev->owner = THIS_MODULE;
+	cdev->ops   = fops;
+	error = cdev_add(cdev, trace_dev, count);
+	if (error) {
+		printk(KERN_WARNING "trace: "
+			"add_cdev failed for %s.\n", name);
+		return -ENOMEM;
+	}
+	return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+	int error = 0;
+
+	printk("Initializing Feather-Trace device\n");
+
+	error = register_buffer_dev("ft_trace", &ft_trace_fops,
+				    FT_TRACE_MAJOR, 1);
+	return error;
+}
+
+module_init(init_sched_trace);