%% Flowchart of the warm-up and cooldown phases of a pipeline schedule
%% (both subgraphs are labelled "(ori)").
%%   * Warmup   : loop over k < num_warmup_microbatches, running forward steps.
%%   * Cooldown : loop over the remaining backward microbatches.
%% Start, End, S0 and S7 are not declared in this chunk; Mermaid creates
%% implicit nodes for undeclared ids.
%% NOTE(review): S0/S7 presumably belong to a steady-state subgraph defined
%% elsewhere - confirm.
graph TB

    %% ---------- Warm-up forward passes ----------
    subgraph Warmup["Warm‑up Phase (ori)"]
        direction TB
        W0["recv_forward() -> input_tensors[0]"]
        W1{{"k < num_warmup_microbatches"}}
        W2["wait recv_prev_wait_handle"]
        W3["forward_step_helper(k)"]
        W4["send_forward_recv_forward() -> fwd_recv_buffer"]
        W5["append input_tensor to next chunk"]
        W6["send_backward_recv_backward()"]

        W0 --> W1
        %% Loop body; the last edge returns to the loop condition W1.
        %% NOTE(review): the chain visits W6 before W5, unlike the declaration
        %% order above - confirm this ordering is intended.
        W1 --> W2 --> W3 --> W4 --> W6 --> W5 --> W1
    end

    %% ---------- Cool-down backward flush ----------
    subgraph Cooldown["Cooldown Flush (ori)"]
        direction TB
        C0["recv_backward() -> output_tensor_grads"]
        C1{{"remaining backward microbatches"}}
        C2["wait recv_next_wait_handle"]
        C3["backward_step_helper(k)"]
        C4["send_backward_recv_backward()"]

        C0 --> C1
        %% Loop body; the last edge returns to the loop condition C1.
        C1 --> C2 --> C3 --> C4 --> C1
    end

    %% ---------- Phase transitions ----------
    %% Dotted edges connect the phases through S0 / S7.
    Start --> W0
    W5 -.-> S0
    S7 -.-> C0
    C4 --> End