// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.exectracer2

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A precise nanosecond-precision timestamp and a stack
// trace are captured for most events.
//
// Tracer invariants (to keep the synchronization making sense):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If not, buffers
	// are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: The ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
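	//
	// A minimal sketch of that encoding, assuming a hypothetical hdr [3]uint64
	// prepared by the CPU profiler's sampling hook (traceCPUSample holds the
	// real logic):
	//
	//	if pp != nil {
	//		hdr[0] = uint64(pp.id)<<1 | 0b1 // real P: ID in bits 1+, bit 0 set
	//	} else {
	//		hdr[0] = 0b10 // no P was running
	//	}
	//	hdr[1] = uint64(gp.goid)   // may be 0 when sampling g0
	//	hdr[2] = uint64(mp.procid) // ID of the M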
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in syscall exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exceptions to this rule are goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscall-exiting slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//	// <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// After this executes, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *after* traceAcquire (which checks gen), we can be certain that once exitingSyscall is zero,
	// any goroutine that goes to exit a syscall from then on *must* observe the new gen.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g. on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status-traced
		// bookkeeping might end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, there's nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaiting(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is when we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. If the world stops again, that's fine;
	// we're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
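	//
	// The loop below threads every candidate M onto a single intrusive list
	// via mp.trace.link: Ms on allm are chained following their alllink order,
	// and Ms on sched.freem are then pushed onto the front of that list.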
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock is needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				bufp := &mp.trace.buf[gen%2]
				if *bufp != nil {
					traceBufFlush(*bufp, gen)
					*bufp = nil
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, so we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	systemstack(func() {
		// Flush CPU samples, stacks, and strings for the last generation. This is safe,
		// because we're now certain no M is writing to the last generation.
		//
		// Ordering is important here. traceCPUFlush may generate new stacks and dumping
		// stacks may generate new strings.
		traceCPUFlush(gen)
		trace.stackTab[gen%2].dump(gen)
		trace.stringTab[gen%2].reset(gen)

		// That's it. This generation is done producing buffers.
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	if stopTrace {
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		// Perform status reset on dead Ps because they just appear as idle.
		//
		// Holding worldsema prevents allp from changing.
		//
		// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
		// events to indicate whether a P exists, rather than just making its
		// existence implicit.
		for _, pp := range allp[len(allp):cap(allp)] {
			pp.trace.readyNextGen(traceNextGen(gen))
		}
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both %2 and %3 and we want both patterns to continue when we loop around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}
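
// A typical consumer, such as the goroutine started by the runtime/trace
// package, drains ReadTrace in a loop along these lines (a minimal sketch
// assuming a hypothetical io.Writer w, not the actual implementation):
//
//	for {
//		data := ReadTrace()
//		if data == nil {
//			break // tracing stopped and all buffered data has been returned
//		}
//		w.Write(data) // copy the chunk out before calling ReadTrace again
//	}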

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine reads trace. This is bad.
		// But we would rather not crash the program because of tracing,
		// since tracing can be enabled at runtime on production servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write the trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.22 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer is flushed
		// into readerGen, flushedGen will rise to meet it. At this point, the tracer
		// is waiting on the reader to finish flushing the last generation so that it
		// can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind).
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	//
	// We also want to be careful *not* to schedule the reader if there's
	// no reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the default approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must obtain one via newWakeableSleep and call
// close to free up resources. Once close is called, newWakeableSleep
// must be called again before another use.
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}
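
// As a usage sketch (mirroring the trace advancer above; running, period, and
// doPeriodicWork are hypothetical):
//
//	s := newWakeableSleep()
//	go func() {
//		for running() {
//			s.sleep(int64(period)) // returns early if wake is called
//			doPeriodicWork()
//		}
//	}()
//	...
//	s.wake()  // cut any in-progress sleep short
//	s.close() // only once nothing is sleeping and nothing else will call wake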

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	s.timer.arg = s
	s.timer.f = func(s any, _ uintptr) {
		s.(*wakeableSleep).wake()
	}
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	resetTimer(s.timer, nanotime()+ns)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	stopTimer(s.timer)
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}
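
// For reference, traceAdvancerState.stop above already tears down its
// wakeableSleep in the order these preconditions require (a recap of existing
// behavior, not a prescription):
//
//	s.timer.wake()  // cut the advancer's current sleep short
//	<-s.done        // wait for the advancer goroutine to exit; nothing sleeps or wakes after this
//	s.timer.close() // safe: no sleeper and no concurrent waker remain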