libgo patch committed: Move sched variable from C to Go

Ian Lance Taylor <iant@golang.org>
Fri Nov 18 17:48:00 GMT 2016


This patch moves the schedt type (Sched in the C code) and the sched
variable from C to Go.  This is a step toward further changes,
ultimately porting the rest of proc.go to Go.  Bootstrapped and ran the
Go testsuite on x86_64-pc-linux-gnu.  Committed to mainline.
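
For readers following along, the mechanism is the usual one for handing
a Go-owned variable to the gccgo C runtime: the Go side defines the
variable and exports an accessor under a pinned symbol name, and the C
side declares that symbol with an __asm__ alias and caches the returned
pointer once, in runtime_schedinit.  Below is a minimal sketch of the
Go half.  The names are taken from the patch, but the schedt type is
abbreviated and the file is an illustration of the pattern, not a
drop-in replacement for the real runtime2.go/stubs.go changes:

    package runtime

    import _ "unsafe" // blank import enables //go:linkname

    // Abbreviated: the full schedt definition is in the runtime2.go
    // hunk below.
    type schedt struct {
            goidgen uint64 // accessed atomically; kept at top for
                           // 64-bit alignment on 32-bit systems
            // ... remaining scheduler fields elided ...
    }

    var sched schedt

    // getsched lets the C runtime obtain a pointer to the Go-owned
    // sched variable.  The directive pins the symbol name to
    // runtime.getsched, which is what proc.c's __asm__ declaration
    // (via GOSYM_PREFIX) resolves at startup.
    //go:linkname getsched runtime.getsched
    func getsched() *schedt {
            return &sched
    }

Two knock-on changes in proc.c are worth noting.  First, the old C
Sched struct began with an anonymous Lock, so the struct pointer itself
was passed to runtime_lock(); the Go schedt has a named lock field,
hence every runtime_lock(&runtime_sched) becomes
runtime_lock(&runtime_sched->lock).  Second, runtime_proc_scan no
longer enqueues runtime_sched for the garbage collector, since the
variable now lives in Go and is scanned like any other Go global.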

Ian
Index: gcc/go/gofrontend/MERGE
===================================================================
--- gcc/go/gofrontend/MERGE	(revision 242594)
+++ gcc/go/gofrontend/MERGE	(working copy)
@@ -1,4 +1,4 @@
-fc4ca600b2fc6de81fd3c4014542d6a50593db1a
+bf4762823c4543229867436399be3ae30b4d13bb
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: libgo/go/runtime/runtime2.go
===================================================================
--- libgo/go/runtime/runtime2.go	(revision 242581)
+++ libgo/go/runtime/runtime2.go	(working copy)
@@ -550,9 +550,6 @@ const (
 	_MaxGomaxprocs = 1 << 8
 )
 
-/*
-Commented out for gccgo for now.
-
 type schedt struct {
 	// accessed atomically. keep at top to ensure alignment on 32-bit systems.
 	goidgen  uint64
@@ -578,18 +575,17 @@ type schedt struct {
 	runqsize int32
 
 	// Global cache of dead G's.
-	gflock       mutex
-	gfreeStack   *g
-	gfreeNoStack *g
-	ngfree       int32
+	gflock mutex
+	gfree  *g
+	ngfree int32
 
 	// Central cache of sudog structs.
 	sudoglock  mutex
 	sudogcache *sudog
 
-	// Central pool of available defer structs of different sizes.
+	// Central pool of available defer structs.
 	deferlock mutex
-	deferpool [5]*_defer
+	deferpool *_defer
 
 	gcwaiting  uint32 // gc is waiting to run
 	stopwait   int32
@@ -608,7 +604,6 @@ type schedt struct {
 	procresizetime int64 // nanotime() of last change to gomaxprocs
 	totaltime      int64 // ∫gomaxprocs dt up to procresizetime
 }
-*/
 
 // The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
 // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
@@ -772,8 +767,10 @@ var (
 
 	ncpu int32
 
-//	forcegc     forcegcstate
-//	sched       schedt
+	//	forcegc     forcegcstate
+
+	sched schedt
+
 //	newprocs    int32
 
 // Information about what cpu features are available.
Index: libgo/go/runtime/stubs.go
===================================================================
--- libgo/go/runtime/stubs.go	(revision 242581)
+++ libgo/go/runtime/stubs.go	(working copy)
@@ -520,3 +520,9 @@ func dumpregs(*_siginfo_t, unsafe.Pointe
 
 // Temporary for gccgo until we port panic.go.
 func startpanic()
+
+// Temporary for gccgo until we port proc.go.
+//go:linkname getsched runtime.getsched
+func getsched() *schedt {
+	return &sched
+}
Index: libgo/runtime/proc.c
===================================================================
--- libgo/runtime/proc.c	(revision 242581)
+++ libgo/runtime/proc.c	(working copy)
@@ -351,48 +351,18 @@ runtime_mcall(void (*pfn)(G*))
 //
 // Design doc at http://golang.org/s/go11sched.
 
-typedef struct Sched Sched;
-struct Sched {
-	Lock;
-
-	uint64	goidgen;
-	M*	midle;	 // idle m's waiting for work
-	int32	nmidle;	 // number of idle m's waiting for work
-	int32	nmidlelocked; // number of locked m's waiting for work
-	int32	mcount;	 // number of m's that have been created
-	int32	maxmcount;	// maximum number of m's allowed (or die)
-
-	P*	pidle;  // idle P's
-	uint32	npidle;
-	uint32	nmspinning;
-
-	// Global runnable queue.
-	G*	runqhead;
-	G*	runqtail;
-	int32	runqsize;
-
-	// Global cache of dead G's.
-	Lock	gflock;
-	G*	gfree;
-
-	uint32	gcwaiting;	// gc is waiting to run
-	int32	stopwait;
-	Note	stopnote;
-	uint32	sysmonwait;
-	Note	sysmonnote;
-	uint64	lastpoll;
-
-	int32	profilehz;	// cpu profiling rate
-};
+typedef struct schedt Sched;
 
 enum
 {
-	// Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
+	// Number of goroutine ids to grab from runtime_sched->goidgen to local per-P cache at once.
 	// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
 	GoidCacheBatch = 16,
 };
 
-Sched	runtime_sched;
+extern Sched* runtime_getsched() __asm__ (GOSYM_PREFIX "runtime.getsched");
+
+static Sched*	runtime_sched;
 int32	runtime_gomaxprocs;
 uint32	runtime_needextram = 1;
 M	runtime_m0;
@@ -471,6 +441,8 @@ runtime_schedinit(void)
 	const byte *p;
 	Eface i;
 
+	runtime_sched = runtime_getsched();
+
 	m = &runtime_m0;
 	g = &runtime_g0;
 	m->g0 = g;
@@ -479,7 +451,7 @@ runtime_schedinit(void)
 
 	initcontext();
 
-	runtime_sched.maxmcount = 10000;
+	runtime_sched->maxmcount = 10000;
 	runtime_precisestack = 0;
 
 	// runtime_symtabinit();
@@ -500,7 +472,7 @@ runtime_schedinit(void)
 	runtime_goenvs();
 	runtime_parsedebugvars();
 
-	runtime_sched.lastpoll = runtime_nanotime();
+	runtime_sched->lastpoll = runtime_nanotime();
 	procs = 1;
 	s = runtime_getenv("GOMAXPROCS");
 	p = s.str;
@@ -747,8 +719,8 @@ static void
 checkmcount(void)
 {
 	// sched lock is held
-	if(runtime_sched.mcount > runtime_sched.maxmcount) {
-		runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
+	if(runtime_sched->mcount > runtime_sched->maxmcount) {
+		runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched->maxmcount);
 		runtime_throw("thread exhaustion");
 	}
 }
@@ -782,8 +754,8 @@ mcommoninit(M *mp)
 
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 
-	runtime_lock(&runtime_sched);
-	mp->id = runtime_sched.mcount++;
+	runtime_lock(&runtime_sched->lock);
+	mp->id = runtime_sched->mcount++;
 	checkmcount();
 	runtime_mpreinit(mp);
 
@@ -793,7 +765,7 @@ mcommoninit(M *mp)
 	// runtime_NumCgoCall() iterates over allm w/o schedlock,
 	// so we need to publish it safely.
 	runtime_atomicstorep(&runtime_allm, mp);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 }
 
 // Mark gp ready to run.
@@ -808,7 +780,7 @@ runtime_ready(G *gp)
 	}
 	gp->atomicstatus = _Grunnable;
 	runqput((P*)g->m->p, gp);
-	if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
+	if(runtime_atomicload(&runtime_sched->npidle) != 0 && runtime_atomicload(&runtime_sched->nmspinning) == 0)  // TODO: fast atomic
 		wakep();
 	g->m->locks--;
 }
@@ -828,15 +800,15 @@ runtime_gcprocs(void)
 
 	// Figure out how many CPUs to use during GC.
 	// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	n = runtime_gomaxprocs;
 	if(n > runtime_ncpu)
 		n = runtime_ncpu > 0 ? runtime_ncpu : 1;
 	if(n > MaxGcproc)
 		n = MaxGcproc;
-	if(n > runtime_sched.nmidle+1) // one M is currently running
-		n = runtime_sched.nmidle+1;
-	runtime_unlock(&runtime_sched);
+	if(n > runtime_sched->nmidle+1) // one M is currently running
+		n = runtime_sched->nmidle+1;
+	runtime_unlock(&runtime_sched->lock);
 	return n;
 }
 
@@ -845,14 +817,14 @@ needaddgcproc(void)
 {
 	int32 n;
 
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	n = runtime_gomaxprocs;
 	if(n > runtime_ncpu)
 		n = runtime_ncpu;
 	if(n > MaxGcproc)
 		n = MaxGcproc;
-	n -= runtime_sched.nmidle+1; // one M is currently running
-	runtime_unlock(&runtime_sched);
+	n -= runtime_sched->nmidle+1; // one M is currently running
+	runtime_unlock(&runtime_sched->lock);
 	return n > 0;
 }
 
@@ -862,7 +834,7 @@ runtime_helpgc(int32 nproc)
 	M *mp;
 	int32 n, pos;
 
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	pos = 0;
 	for(n = 1; n < nproc; n++) {  // one M is currently running
 		if(runtime_allp[pos]->mcache == g->m->mcache)
@@ -875,7 +847,7 @@ runtime_helpgc(int32 nproc)
 		pos++;
 		runtime_notewakeup(&mp->park);
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 }
 
 // Similar to stoptheworld but best-effort and can be called several times.
@@ -893,8 +865,8 @@ runtime_freezetheworld(void)
 	// so try several times
 	for(i = 0; i < 5; i++) {
 		// this should tell the scheduler to not start any new goroutines
-		runtime_sched.stopwait = 0x7fffffff;
-		runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
+		runtime_sched->stopwait = 0x7fffffff;
+		runtime_atomicstore((uint32*)&runtime_sched->gcwaiting, 1);
 		// this should stop running goroutines
 		if(!preemptall())
 			break;  // no running goroutines
@@ -914,34 +886,34 @@ runtime_stopTheWorldWithSema(void)
 	P *p;
 	bool wait;
 
-	runtime_lock(&runtime_sched);
-	runtime_sched.stopwait = runtime_gomaxprocs;
-	runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
+	runtime_lock(&runtime_sched->lock);
+	runtime_sched->stopwait = runtime_gomaxprocs;
+	runtime_atomicstore((uint32*)&runtime_sched->gcwaiting, 1);
 	preemptall();
 	// stop current P
 	((P*)g->m->p)->status = _Pgcstop;
-	runtime_sched.stopwait--;
+	runtime_sched->stopwait--;
 	// try to retake all P's in _Psyscall status
 	for(i = 0; i < runtime_gomaxprocs; i++) {
 		p = runtime_allp[i];
 		s = p->status;
 		if(s == _Psyscall && runtime_cas(&p->status, s, _Pgcstop))
-			runtime_sched.stopwait--;
+			runtime_sched->stopwait--;
 	}
 	// stop idle P's
 	while((p = pidleget()) != nil) {
 		p->status = _Pgcstop;
-		runtime_sched.stopwait--;
+		runtime_sched->stopwait--;
 	}
-	wait = runtime_sched.stopwait > 0;
-	runtime_unlock(&runtime_sched);
+	wait = runtime_sched->stopwait > 0;
+	runtime_unlock(&runtime_sched->lock);
 
 	// wait for remaining P's to stop voluntarily
 	if(wait) {
-		runtime_notesleep(&runtime_sched.stopnote);
-		runtime_noteclear(&runtime_sched.stopnote);
+		runtime_notesleep(&runtime_sched->stopnote);
+		runtime_noteclear(&runtime_sched->stopnote);
 	}
-	if(runtime_sched.stopwait)
+	if(runtime_sched->stopwait)
 		runtime_throw("stoptheworld: not stopped");
 	for(i = 0; i < runtime_gomaxprocs; i++) {
 		p = runtime_allp[i];
@@ -968,13 +940,13 @@ runtime_startTheWorldWithSema(void)
 	gp = runtime_netpoll(false);  // non-blocking
 	injectglist(gp);
 	add = needaddgcproc();
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	if(newprocs) {
 		procresize(newprocs);
 		newprocs = 0;
 	} else
 		procresize(runtime_gomaxprocs);
-	runtime_sched.gcwaiting = 0;
+	runtime_sched->gcwaiting = 0;
 
 	p1 = nil;
 	while((p = pidleget()) != nil) {
@@ -988,11 +960,11 @@ runtime_startTheWorldWithSema(void)
 		p->link = (uintptr)p1;
 		p1 = p;
 	}
-	if(runtime_sched.sysmonwait) {
-		runtime_sched.sysmonwait = false;
-		runtime_notewakeup(&runtime_sched.sysmonnote);
+	if(runtime_sched->sysmonwait) {
+		runtime_sched->sysmonwait = false;
+		runtime_notewakeup(&runtime_sched->sysmonnote);
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 
 	while(p1) {
 		p = p1;
@@ -1285,7 +1257,7 @@ runtime_newextram(void)
 	mp->locked = _LockInternal;
 	mp->lockedg = gp;
 	gp->lockedm = mp;
-	gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
+	gp->goid = runtime_xadd64(&runtime_sched->goidgen, 1);
 	// put on allg for garbage collector
 	allgadd(gp);
 
@@ -1439,13 +1411,13 @@ stopm(void)
 		runtime_throw("stopm holding p");
 	if(m->spinning) {
 		m->spinning = false;
-		runtime_xadd(&runtime_sched.nmspinning, -1);
+		runtime_xadd(&runtime_sched->nmspinning, -1);
 	}
 
 retry:
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	mput(m);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	runtime_notesleep(&m->park);
 	m = g->m;
 	runtime_noteclear(&m->park);
@@ -1473,18 +1445,18 @@ startm(P *p, bool spinning)
 	M *mp;
 	void (*fn)(void);
 
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	if(p == nil) {
 		p = pidleget();
 		if(p == nil) {
-			runtime_unlock(&runtime_sched);
+			runtime_unlock(&runtime_sched->lock);
 			if(spinning)
-				runtime_xadd(&runtime_sched.nmspinning, -1);
+				runtime_xadd(&runtime_sched->nmspinning, -1);
 			return;
 		}
 	}
 	mp = mget();
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	if(mp == nil) {
 		fn = nil;
 		if(spinning)
@@ -1506,39 +1478,39 @@ static void
 handoffp(P *p)
 {
 	// if it has local work, start it straight away
-	if(p->runqhead != p->runqtail || runtime_sched.runqsize) {
+	if(p->runqhead != p->runqtail || runtime_sched->runqsize) {
 		startm(p, false);
 		return;
 	}
 	// no local work, check that there are no spinning/idle M's,
 	// otherwise our help is not required
-	if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 &&  // TODO: fast atomic
-		runtime_cas(&runtime_sched.nmspinning, 0, 1)) {
+	if(runtime_atomicload(&runtime_sched->nmspinning) + runtime_atomicload(&runtime_sched->npidle) == 0 &&  // TODO: fast atomic
+		runtime_cas(&runtime_sched->nmspinning, 0, 1)) {
 		startm(p, true);
 		return;
 	}
-	runtime_lock(&runtime_sched);
-	if(runtime_sched.gcwaiting) {
+	runtime_lock(&runtime_sched->lock);
+	if(runtime_sched->gcwaiting) {
 		p->status = _Pgcstop;
-		if(--runtime_sched.stopwait == 0)
-			runtime_notewakeup(&runtime_sched.stopnote);
-		runtime_unlock(&runtime_sched);
+		if(--runtime_sched->stopwait == 0)
+			runtime_notewakeup(&runtime_sched->stopnote);
+		runtime_unlock(&runtime_sched->lock);
 		return;
 	}
-	if(runtime_sched.runqsize) {
-		runtime_unlock(&runtime_sched);
+	if(runtime_sched->runqsize) {
+		runtime_unlock(&runtime_sched->lock);
 		startm(p, false);
 		return;
 	}
 	// If this is the last running P and nobody is polling network,
 	// need to wakeup another M to poll network.
-	if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
-		runtime_unlock(&runtime_sched);
+	if(runtime_sched->npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched->lastpoll) != 0) {
+		runtime_unlock(&runtime_sched->lock);
 		startm(p, false);
 		return;
 	}
 	pidleput(p);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 }
 
 // Tries to add one more P to execute G's.
@@ -1547,7 +1519,7 @@ static void
 wakep(void)
 {
 	// be conservative about spinning threads
-	if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
+	if(!runtime_cas(&runtime_sched->nmspinning, 0, 1))
 		return;
 	startm(nil, true);
 }
@@ -1606,18 +1578,18 @@ gcstopm(void)
 {
 	P *p;
 
-	if(!runtime_sched.gcwaiting)
+	if(!runtime_sched->gcwaiting)
 		runtime_throw("gcstopm: not waiting for gc");
 	if(g->m->spinning) {
 		g->m->spinning = false;
-		runtime_xadd(&runtime_sched.nmspinning, -1);
+		runtime_xadd(&runtime_sched->nmspinning, -1);
 	}
 	p = releasep();
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	p->status = _Pgcstop;
-	if(--runtime_sched.stopwait == 0)
-		runtime_notewakeup(&runtime_sched.stopnote);
-	runtime_unlock(&runtime_sched);
+	if(--runtime_sched->stopwait == 0)
+		runtime_notewakeup(&runtime_sched->stopnote);
+	runtime_unlock(&runtime_sched->lock);
 	stopm();
 }
 
@@ -1639,7 +1611,7 @@ execute(G *gp)
 	gp->m = g->m;
 
 	// Check whether the profiler needs to be turned on or off.
-	hz = runtime_sched.profilehz;
+	hz = runtime_sched->profilehz;
 	if(g->m->profilehz != hz)
 		runtime_resetcpuprofiler(hz);
 
@@ -1656,7 +1628,7 @@ findrunnable(void)
 	int32 i;
 
 top:
-	if(runtime_sched.gcwaiting) {
+	if(runtime_sched->gcwaiting) {
 		gcstopm();
 		goto top;
 	}
@@ -1667,10 +1639,10 @@ top:
 	if(gp)
 		return gp;
 	// global runq
-	if(runtime_sched.runqsize) {
-		runtime_lock(&runtime_sched);
+	if(runtime_sched->runqsize) {
+		runtime_lock(&runtime_sched->lock);
 		gp = globrunqget((P*)g->m->p, 0);
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		if(gp)
 			return gp;
 	}
@@ -1684,15 +1656,15 @@ top:
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if(!g->m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
+	if(!g->m->spinning && 2 * runtime_atomicload(&runtime_sched->nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched->npidle))  // TODO: fast atomic
 		goto stop;
 	if(!g->m->spinning) {
 		g->m->spinning = true;
-		runtime_xadd(&runtime_sched.nmspinning, 1);
+		runtime_xadd(&runtime_sched->nmspinning, 1);
 	}
 	// random steal from other P's
 	for(i = 0; i < 2*runtime_gomaxprocs; i++) {
-		if(runtime_sched.gcwaiting)
+		if(runtime_sched->gcwaiting)
 			goto top;
 		p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
 		if(p == (P*)g->m->p)
@@ -1704,30 +1676,30 @@ top:
 	}
 stop:
 	// return P and block
-	runtime_lock(&runtime_sched);
-	if(runtime_sched.gcwaiting) {
-		runtime_unlock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
+	if(runtime_sched->gcwaiting) {
+		runtime_unlock(&runtime_sched->lock);
 		goto top;
 	}
-	if(runtime_sched.runqsize) {
+	if(runtime_sched->runqsize) {
 		gp = globrunqget((P*)g->m->p, 0);
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		return gp;
 	}
 	p = releasep();
 	pidleput(p);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	if(g->m->spinning) {
 		g->m->spinning = false;
-		runtime_xadd(&runtime_sched.nmspinning, -1);
+		runtime_xadd(&runtime_sched->nmspinning, -1);
 	}
 	// check all runqueues once again
 	for(i = 0; i < runtime_gomaxprocs; i++) {
 		p = runtime_allp[i];
 		if(p && p->runqhead != p->runqtail) {
-			runtime_lock(&runtime_sched);
+			runtime_lock(&runtime_sched->lock);
 			p = pidleget();
-			runtime_unlock(&runtime_sched);
+			runtime_unlock(&runtime_sched->lock);
 			if(p) {
 				acquirep(p);
 				goto top;
@@ -1736,17 +1708,17 @@ stop:
 		}
 	}
 	// poll network
-	if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
+	if(runtime_xchg64(&runtime_sched->lastpoll, 0) != 0) {
 		if(g->m->p)
 			runtime_throw("findrunnable: netpoll with p");
 		if(g->m->spinning)
 			runtime_throw("findrunnable: netpoll with spinning");
 		gp = runtime_netpoll(true);  // block until new work is available
-		runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
+		runtime_atomicstore64(&runtime_sched->lastpoll, runtime_nanotime());
 		if(gp) {
-			runtime_lock(&runtime_sched);
+			runtime_lock(&runtime_sched->lock);
 			p = pidleget();
-			runtime_unlock(&runtime_sched);
+			runtime_unlock(&runtime_sched->lock);
 			if(p) {
 				acquirep(p);
 				injectglist((G*)gp->schedlink);
@@ -1767,15 +1739,15 @@ resetspinning(void)
 
 	if(g->m->spinning) {
 		g->m->spinning = false;
-		nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
+		nmspinning = runtime_xadd(&runtime_sched->nmspinning, -1);
 		if(nmspinning < 0)
 			runtime_throw("findrunnable: negative nmspinning");
 	} else
-		nmspinning = runtime_atomicload(&runtime_sched.nmspinning);
+		nmspinning = runtime_atomicload(&runtime_sched->nmspinning);
 
 	// M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
 	// so see if we need to wakeup another P here.
-	if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
+	if (nmspinning == 0 && runtime_atomicload(&runtime_sched->npidle) > 0)
 		wakep();
 }
 
@@ -1789,16 +1761,16 @@ injectglist(G *glist)
 
 	if(glist == nil)
 		return;
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	for(n = 0; glist; n++) {
 		gp = glist;
 		glist = (G*)gp->schedlink;
 		gp->atomicstatus = _Grunnable;
 		globrunqput(gp);
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 
-	for(; n && runtime_sched.npidle; n--)
+	for(; n && runtime_sched->npidle; n--)
 		startm(nil, false);
 }
 
@@ -1814,7 +1786,7 @@ schedule(void)
 		runtime_throw("schedule: holding locks");
 
 top:
-	if(runtime_sched.gcwaiting) {
+	if(runtime_sched->gcwaiting) {
 		gcstopm();
 		goto top;
 	}
@@ -1826,10 +1798,10 @@ top:
 	tick = ((P*)g->m->p)->schedtick;
 	// This is a fancy way to say tick%61==0,
 	// it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
-	if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
-		runtime_lock(&runtime_sched);
+	if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched->runqsize > 0) {
+		runtime_lock(&runtime_sched->lock);
 		gp = globrunqget((P*)g->m->p, 1);
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		if(gp)
 			resetspinning();
 	}
@@ -1959,9 +1931,9 @@ runtime_gosched0(G *gp)
 	gp->atomicstatus = _Grunnable;
 	gp->m = nil;
 	m->curg = nil;
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	globrunqput(gp);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	if(m->lockedg) {
 		stoplockedm();
 		execute(gp);  // Never returns.
@@ -2074,25 +2046,25 @@ doentersyscall(uintptr pc, uintptr sp)
 
 	g->atomicstatus = _Gsyscall;
 
-	if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
-		runtime_lock(&runtime_sched);
-		if(runtime_atomicload(&runtime_sched.sysmonwait)) {
-			runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-			runtime_notewakeup(&runtime_sched.sysmonnote);
+	if(runtime_atomicload(&runtime_sched->sysmonwait)) {  // TODO: fast atomic
+		runtime_lock(&runtime_sched->lock);
+		if(runtime_atomicload(&runtime_sched->sysmonwait)) {
+			runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+			runtime_notewakeup(&runtime_sched->sysmonnote);
 		}
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 	}
 
 	g->m->mcache = nil;
 	((P*)(g->m->p))->m = 0;
 	runtime_atomicstore(&((P*)g->m->p)->status, _Psyscall);
-	if(runtime_atomicload(&runtime_sched.gcwaiting)) {
-		runtime_lock(&runtime_sched);
-		if (runtime_sched.stopwait > 0 && runtime_cas(&((P*)g->m->p)->status, _Psyscall, _Pgcstop)) {
-			if(--runtime_sched.stopwait == 0)
-				runtime_notewakeup(&runtime_sched.stopnote);
+	if(runtime_atomicload(&runtime_sched->gcwaiting)) {
+		runtime_lock(&runtime_sched->lock);
+		if (runtime_sched->stopwait > 0 && runtime_cas(&((P*)g->m->p)->status, _Psyscall, _Pgcstop)) {
+			if(--runtime_sched->stopwait == 0)
+				runtime_notewakeup(&runtime_sched->stopnote);
 		}
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 	}
 
 	g->m->locks--;
@@ -2201,7 +2173,7 @@ exitsyscallfast(void)
 	gp = g;
 
 	// Freezetheworld sets stopwait but does not retake P's.
-	if(runtime_sched.stopwait) {
+	if(runtime_sched->stopwait) {
 		gp->m->p = 0;
 		return false;
 	}
@@ -2215,14 +2187,14 @@ exitsyscallfast(void)
 	}
 	// Try to get any other idle P.
 	gp->m->p = 0;
-	if(runtime_sched.pidle) {
-		runtime_lock(&runtime_sched);
+	if(runtime_sched->pidle) {
+		runtime_lock(&runtime_sched->lock);
 		p = pidleget();
-		if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
-			runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-			runtime_notewakeup(&runtime_sched.sysmonnote);
+		if(p && runtime_atomicload(&runtime_sched->sysmonwait)) {
+			runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+			runtime_notewakeup(&runtime_sched->sysmonnote);
 		}
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		if(p) {
 			acquirep(p);
 			return true;
@@ -2243,15 +2215,15 @@ exitsyscall0(G *gp)
 	gp->atomicstatus = _Grunnable;
 	gp->m = nil;
 	m->curg = nil;
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	p = pidleget();
 	if(p == nil)
 		globrunqput(gp);
-	else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
-		runtime_atomicstore(&runtime_sched.sysmonwait, 0);
-		runtime_notewakeup(&runtime_sched.sysmonnote);
+	else if(runtime_atomicload(&runtime_sched->sysmonwait)) {
+		runtime_atomicstore(&runtime_sched->sysmonwait, 0);
+		runtime_notewakeup(&runtime_sched->sysmonnote);
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	if(p) {
 		acquirep(p);
 		execute(gp);  // Never returns.
@@ -2308,7 +2280,7 @@ syscall_runtime_AfterFork(void)
 {
 	int32 hz;
 
-	hz = runtime_sched.profilehz;
+	hz = runtime_sched->profilehz;
 	if(hz != 0)
 		runtime_resetcpuprofiler(hz);
 	runtime_m()->locks--;
@@ -2400,7 +2372,7 @@ __go_go(void (*fn)(void*), void* arg)
 	newg->gopc = (uintptr)__builtin_return_address(0);
 	newg->atomicstatus = _Grunnable;
 	if(p->goidcache == p->goidcacheend) {
-		p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
+		p->goidcache = runtime_xadd64(&runtime_sched->goidgen, GoidCacheBatch);
 		p->goidcacheend = p->goidcache + GoidCacheBatch;
 	}
 	newg->goid = p->goidcache++;
@@ -2421,7 +2393,7 @@ __go_go(void (*fn)(void*), void* arg)
 
 		runqput(p, vnewg);
 
-		if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
+		if(runtime_atomicload(&runtime_sched->npidle) != 0 && runtime_atomicload(&runtime_sched->nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
 			wakep();
 		g->m->locks--;
 		return vnewg;
@@ -2462,15 +2434,15 @@ gfput(P *p, G *gp)
 	p->gfree = gp;
 	p->gfreecnt++;
 	if(p->gfreecnt >= 64) {
-		runtime_lock(&runtime_sched.gflock);
+		runtime_lock(&runtime_sched->gflock);
 		while(p->gfreecnt >= 32) {
 			p->gfreecnt--;
 			gp = p->gfree;
 			p->gfree = (G*)gp->schedlink;
-			gp->schedlink = (uintptr)runtime_sched.gfree;
-			runtime_sched.gfree = gp;
+			gp->schedlink = (uintptr)runtime_sched->gfree;
+			runtime_sched->gfree = gp;
 		}
-		runtime_unlock(&runtime_sched.gflock);
+		runtime_unlock(&runtime_sched->gflock);
 	}
 }
 
@@ -2483,16 +2455,16 @@ gfget(P *p)
 
 retry:
 	gp = p->gfree;
-	if(gp == nil && runtime_sched.gfree) {
-		runtime_lock(&runtime_sched.gflock);
-		while(p->gfreecnt < 32 && runtime_sched.gfree) {
+	if(gp == nil && runtime_sched->gfree) {
+		runtime_lock(&runtime_sched->gflock);
+		while(p->gfreecnt < 32 && runtime_sched->gfree) {
 			p->gfreecnt++;
-			gp = runtime_sched.gfree;
-			runtime_sched.gfree = (G*)gp->schedlink;
+			gp = runtime_sched->gfree;
+			runtime_sched->gfree = (G*)gp->schedlink;
 			gp->schedlink = (uintptr)p->gfree;
 			p->gfree = gp;
 		}
-		runtime_unlock(&runtime_sched.gflock);
+		runtime_unlock(&runtime_sched->gflock);
 		goto retry;
 	}
 	if(gp) {
@@ -2508,15 +2480,15 @@ gfpurge(P *p)
 {
 	G *gp;
 
-	runtime_lock(&runtime_sched.gflock);
+	runtime_lock(&runtime_sched->gflock);
 	while(p->gfreecnt) {
 		p->gfreecnt--;
 		gp = p->gfree;
 		p->gfree = (G*)gp->schedlink;
-		gp->schedlink = (uintptr)runtime_sched.gfree;
-		runtime_sched.gfree = gp;
+		gp->schedlink = (uintptr)runtime_sched->gfree;
+		runtime_sched->gfree = gp;
 	}
-	runtime_unlock(&runtime_sched.gflock);
+	runtime_unlock(&runtime_sched->gflock);
 }
 
 void
@@ -2546,13 +2518,13 @@ runtime_GOMAXPROCS(intgo n)
 
 	if(n > _MaxGomaxprocs)
 		n = _MaxGomaxprocs;
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	ret = (intgo)runtime_gomaxprocs;
 	if(n <= 0 || n == ret) {
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		return ret;
 	}
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 
 	runtime_acquireWorldsema();
 	g->m->gcing = 1;
@@ -2653,7 +2625,7 @@ runtime_gcount(void)
 int32
 runtime_mcount(void)
 {
-	return runtime_sched.mcount;
+	return runtime_sched->mcount;
 }
 
 static struct {
@@ -2754,9 +2726,9 @@ runtime_setcpuprofilerate_m(int32 hz)
 	prof.hz = hz;
 	runtime_atomicstore(&prof.lock, 0);
 
-	runtime_lock(&runtime_sched);
-	runtime_sched.profilehz = hz;
-	runtime_unlock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
+	runtime_sched->profilehz = hz;
+	runtime_unlock(&runtime_sched->lock);
 
 	if(hz != 0)
 		runtime_resetcpuprofiler(hz);
@@ -2809,22 +2781,22 @@ procresize(int32 new)
 			p->runqtail--;
 			gp = (G*)p->runq[p->runqtail%nelem(p->runq)];
 			// push onto head of global queue
-			gp->schedlink = (uintptr)runtime_sched.runqhead;
-			runtime_sched.runqhead = gp;
-			if(runtime_sched.runqtail == nil)
-				runtime_sched.runqtail = gp;
-			runtime_sched.runqsize++;
+			gp->schedlink = runtime_sched->runqhead;
+			runtime_sched->runqhead = (uintptr)gp;
+			if(runtime_sched->runqtail == 0)
+				runtime_sched->runqtail = (uintptr)gp;
+			runtime_sched->runqsize++;
 		}
 	}
 	// fill local queues with at most nelem(p->runq)/2 goroutines
 	// start at 1 because current M already executes some G and will acquire allp[0] below,
 	// so if we have a spare G we want to put it into allp[1].
-	for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
-		gp = runtime_sched.runqhead;
-		runtime_sched.runqhead = (G*)gp->schedlink;
-		if(runtime_sched.runqhead == nil)
-			runtime_sched.runqtail = nil;
-		runtime_sched.runqsize--;
+	for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched->runqsize > 0; i++) {
+		gp = (G*)runtime_sched->runqhead;
+		runtime_sched->runqhead = gp->schedlink;
+		if(runtime_sched->runqhead == 0)
+			runtime_sched->runqtail = 0;
+		runtime_sched->runqsize--;
 		runqput(runtime_allp[i%new], gp);
 	}
 
@@ -2899,11 +2871,11 @@ releasep(void)
 static void
 incidlelocked(int32 v)
 {
-	runtime_lock(&runtime_sched);
-	runtime_sched.nmidlelocked += v;
+	runtime_lock(&runtime_sched->lock);
+	runtime_sched->nmidlelocked += v;
 	if(v > 0)
 		checkdead();
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 }
 
 // Check for deadlock situation.
@@ -2923,7 +2895,7 @@ checkdead(void)
 	}
 
 	// -1 for sysmon
-	run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
+	run = runtime_sched->mcount - runtime_sched->nmidle - runtime_sched->nmidlelocked - 1 - countextra();
 	if(run > 0)
 		return;
 	// If we are dying because of a signal caught on an already idle thread,
@@ -2934,7 +2906,7 @@ checkdead(void)
 		return;
 	if(run < 0) {
 		runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
-			runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
+			runtime_sched->nmidle, runtime_sched->nmidlelocked, runtime_sched->mcount);
 		runtime_throw("checkdead: inconsistent counts");
 	}
 	grunning = 0;
@@ -2978,23 +2950,23 @@ sysmon(void)
 			delay = 10*1000;
 		runtime_usleep(delay);
 		if(runtime_debug.schedtrace <= 0 &&
-			(runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
-			runtime_lock(&runtime_sched);
-			if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
-				runtime_atomicstore(&runtime_sched.sysmonwait, 1);
-				runtime_unlock(&runtime_sched);
-				runtime_notesleep(&runtime_sched.sysmonnote);
-				runtime_noteclear(&runtime_sched.sysmonnote);
+			(runtime_sched->gcwaiting || runtime_atomicload(&runtime_sched->npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
+			runtime_lock(&runtime_sched->lock);
+			if(runtime_atomicload(&runtime_sched->gcwaiting) || runtime_atomicload(&runtime_sched->npidle) == (uint32)runtime_gomaxprocs) {
+				runtime_atomicstore(&runtime_sched->sysmonwait, 1);
+				runtime_unlock(&runtime_sched->lock);
+				runtime_notesleep(&runtime_sched->sysmonnote);
+				runtime_noteclear(&runtime_sched->sysmonnote);
 				idle = 0;
 				delay = 20;
 			} else
-				runtime_unlock(&runtime_sched);
+				runtime_unlock(&runtime_sched->lock);
 		}
 		// poll network if not polled for more than 10ms
-		lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
+		lastpoll = runtime_atomicload64(&runtime_sched->lastpoll);
 		now = runtime_nanotime();
 		if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
-			runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
+			runtime_cas64(&runtime_sched->lastpoll, lastpoll, now);
 			gp = runtime_netpoll(false);  // non-blocking
 			if(gp) {
 				// Need to decrement number of idle locked M's
@@ -3060,7 +3032,7 @@ retake(int64 now)
 			// but on the other hand we want to retake them eventually
 			// because they can prevent the sysmon thread from deep sleep.
 			if(p->runqhead == p->runqtail &&
-				runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
+				runtime_atomicload(&runtime_sched->nmspinning) + runtime_atomicload(&runtime_sched->npidle) > 0 &&
 				pd->syscallwhen + 10*1000*1000 > now)
 				continue;
 			// Need to decrement number of idle locked M's
@@ -3117,14 +3089,14 @@ runtime_schedtrace(bool detailed)
 	if(starttime == 0)
 		starttime = now;
 
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
-		(now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
-		runtime_sched.nmidle, runtime_sched.runqsize);
+		(now-starttime)/1000000, runtime_gomaxprocs, runtime_sched->npidle, runtime_sched->mcount,
+		runtime_sched->nmidle, runtime_sched->runqsize);
 	if(detailed) {
 		runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
-			runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
-			runtime_sched.stopwait, runtime_sched.sysmonwait);
+			runtime_sched->gcwaiting, runtime_sched->nmidlelocked, runtime_sched->nmspinning,
+			runtime_sched->stopwait, runtime_sched->sysmonwait);
 	}
 	// We must be careful while reading data from P's, M's and G's.
 	// Even if we hold schedlock, most data can be changed concurrently.
@@ -3153,7 +3125,7 @@ runtime_schedtrace(bool detailed)
 		}
 	}
 	if(!detailed) {
-		runtime_unlock(&runtime_sched);
+		runtime_unlock(&runtime_sched->lock);
 		return;
 	}
 	for(mp = runtime_allm; mp; mp = mp->alllink) {
@@ -3185,7 +3157,7 @@ runtime_schedtrace(bool detailed)
 			lockedm ? lockedm->id : -1);
 	}
 	runtime_unlock(&allglock);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 }
 
 // Put mp on midle list.
@@ -3193,9 +3165,9 @@ runtime_schedtrace(bool detailed)
 static void
 mput(M *mp)
 {
-	mp->schedlink = (uintptr)runtime_sched.midle;
-	runtime_sched.midle = mp;
-	runtime_sched.nmidle++;
+	mp->schedlink = runtime_sched->midle;
+	runtime_sched->midle = (uintptr)mp;
+	runtime_sched->nmidle++;
 	checkdead();
 }
 
@@ -3206,9 +3178,9 @@ mget(void)
 {
 	M *mp;
 
-	if((mp = runtime_sched.midle) != nil){
-		runtime_sched.midle = (M*)mp->schedlink;
-		runtime_sched.nmidle--;
+	if((mp = (M*)runtime_sched->midle) != nil){
+		runtime_sched->midle = mp->schedlink;
+		runtime_sched->nmidle--;
 	}
 	return mp;
 }
@@ -3219,12 +3191,12 @@ static void
 globrunqput(G *gp)
 {
 	gp->schedlink = 0;
-	if(runtime_sched.runqtail)
-		runtime_sched.runqtail->schedlink = (uintptr)gp;
+	if(runtime_sched->runqtail)
+		((G*)runtime_sched->runqtail)->schedlink = (uintptr)gp;
 	else
-		runtime_sched.runqhead = gp;
-	runtime_sched.runqtail = gp;
-	runtime_sched.runqsize++;
+		runtime_sched->runqhead = (uintptr)gp;
+	runtime_sched->runqtail = (uintptr)gp;
+	runtime_sched->runqsize++;
 }
 
 // Put a batch of runnable goroutines on the global runnable queue.
@@ -3233,12 +3205,12 @@ static void
 globrunqputbatch(G *ghead, G *gtail, int32 n)
 {
 	gtail->schedlink = 0;
-	if(runtime_sched.runqtail)
-		runtime_sched.runqtail->schedlink = (uintptr)ghead;
+	if(runtime_sched->runqtail)
+		((G*)runtime_sched->runqtail)->schedlink = (uintptr)ghead;
 	else
-		runtime_sched.runqhead = ghead;
-	runtime_sched.runqtail = gtail;
-	runtime_sched.runqsize += n;
+		runtime_sched->runqhead = (uintptr)ghead;
+	runtime_sched->runqtail = (uintptr)gtail;
+	runtime_sched->runqsize += n;
 }
 
 // Try get a batch of G's from the global runnable queue.
@@ -3249,24 +3221,24 @@ globrunqget(P *p, int32 max)
 	G *gp, *gp1;
 	int32 n;
 
-	if(runtime_sched.runqsize == 0)
+	if(runtime_sched->runqsize == 0)
 		return nil;
-	n = runtime_sched.runqsize/runtime_gomaxprocs+1;
-	if(n > runtime_sched.runqsize)
-		n = runtime_sched.runqsize;
+	n = runtime_sched->runqsize/runtime_gomaxprocs+1;
+	if(n > runtime_sched->runqsize)
+		n = runtime_sched->runqsize;
 	if(max > 0 && n > max)
 		n = max;
 	if((uint32)n > nelem(p->runq)/2)
 		n = nelem(p->runq)/2;
-	runtime_sched.runqsize -= n;
-	if(runtime_sched.runqsize == 0)
-		runtime_sched.runqtail = nil;
-	gp = runtime_sched.runqhead;
-	runtime_sched.runqhead = (G*)gp->schedlink;
+	runtime_sched->runqsize -= n;
+	if(runtime_sched->runqsize == 0)
+		runtime_sched->runqtail = 0;
+	gp = (G*)runtime_sched->runqhead;
+	runtime_sched->runqhead = gp->schedlink;
 	n--;
 	while(n--) {
-		gp1 = runtime_sched.runqhead;
-		runtime_sched.runqhead = (G*)gp1->schedlink;
+		gp1 = (G*)runtime_sched->runqhead;
+		runtime_sched->runqhead = gp1->schedlink;
 		runqput(p, gp1);
 	}
 	return gp;
@@ -3277,9 +3249,9 @@ globrunqget(P *p, int32 max)
 static void
 pidleput(P *p)
 {
-	p->link = (uintptr)runtime_sched.pidle;
-	runtime_sched.pidle = p;
-	runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
+	p->link = runtime_sched->pidle;
+	runtime_sched->pidle = (uintptr)p;
+	runtime_xadd(&runtime_sched->npidle, 1);  // TODO: fast atomic
 }
 
 // Try get a p from pidle list.
@@ -3289,10 +3261,10 @@ pidleget(void)
 {
 	P *p;
 
-	p = runtime_sched.pidle;
+	p = (P*)runtime_sched->pidle;
 	if(p) {
-		runtime_sched.pidle = (P*)p->link;
-		runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
+		runtime_sched->pidle = p->link;
+		runtime_xadd(&runtime_sched->npidle, -1);  // TODO: fast atomic
 	}
 	return p;
 }
@@ -3341,9 +3313,9 @@ runqputslow(P *p, G *gp, uint32 h, uint3
 	for(i=0; i<n; i++)
 		batch[i]->schedlink = (uintptr)batch[i+1];
 	// Now put the batch on global queue.
-	runtime_lock(&runtime_sched);
+	runtime_lock(&runtime_sched->lock);
 	globrunqputbatch(batch[0], batch[n], n+1);
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	return true;
 }
 
@@ -3495,11 +3467,11 @@ runtime_setmaxthreads(intgo in)
 {
 	intgo out;
 
-	runtime_lock(&runtime_sched);
-	out = (intgo)runtime_sched.maxmcount;
-	runtime_sched.maxmcount = (int32)in;
+	runtime_lock(&runtime_sched->lock);
+	out = (intgo)runtime_sched->maxmcount;
+	runtime_sched->maxmcount = (int32)in;
 	checkmcount();
-	runtime_unlock(&runtime_sched);
+	runtime_unlock(&runtime_sched->lock);
 	return out;
 }
 
@@ -3558,7 +3530,6 @@ sync_atomic_runtime_procUnpin()
 void
 runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
 {
-	enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
 	enqueue1(wbufp, (Obj){(byte*)&runtime_main_init_done, sizeof runtime_main_init_done, 0});
 }
 
@@ -3567,7 +3538,7 @@ runtime_proc_scan(struct Workbuf** wbufp
 bool
 runtime_gcwaiting(void)
 {
-	return runtime_sched.gcwaiting;
+	return runtime_sched->gcwaiting;
 }
 
 // os_beforeExit is called from os.Exit(0).
@@ -3602,7 +3573,7 @@ sync_runtime_canSpin(intgo i)
 	// GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
 	// As opposed to runtime mutex we don't do passive spinning here,
 	// because there can be work on global runq on on other Ps.
-	if (i >= ACTIVE_SPIN || runtime_ncpu <= 1 || runtime_gomaxprocs <= (int32)(runtime_sched.npidle+runtime_sched.nmspinning)+1) {
+	if (i >= ACTIVE_SPIN || runtime_ncpu <= 1 || runtime_gomaxprocs <= (int32)(runtime_sched->npidle+runtime_sched->nmspinning)+1) {
 		return false;
 	}
 	p = (P*)g->m->p;

