当 config 中的 pp 大于 1 时，会打开 interleaving。

graph TB
    %% --- Warm‑up Phase ---
    subgraph Warmup["Warm‑up Phase (sm_free_p2p)"]
      direction TB
      W0["recv_forwardA() -> input_tensors[0]"]
      W1{{"k < num_warmup_microbatches"}}
      W2["wait recv_prev_finished == 0"]
      W3["forward_step_helper(k)"]
      W4["send_forward_recv_forwardA() -> send_next_finished"]
      W5["append input_tensor to next chunk"]
      W6["send_backward_recv_backwardA() -> send_prev_finished / recv_next_finished"]

      W0 --> W1
      W1 --> W2 --> W3 --> W4 --> W6 --> W5 --> W1
    end

    %% --- Cool‑down Flush ---
    subgraph Cooldown["Cooldown Flush (sm_free_p2p)"]
      direction TB
      C0["recv_backwardA() -> output_tensor_grads"]
      C1{{"remaining backward microbatches"}}
      C2["wait recv_next_finished == 0"]
      C3["backward_step_helper(k)"]
      C4["send_backward_recv_backwardA()"]

      C0 --> C1
      C1 --> C2 --> C3 --> C4 --> C1
    end

    %% --- Stage links ---
    Start --> W0
    W5 -.-> S0
    S7 -.-> C0
    C4 --> End
时间         PP0 (rank0-3)           ┆          PP1 (rank4-7)
--------------------------------------------------------------------
t=0          FWD MB0  ──────────►    ┆         (等待)
t=1          FWD MB1  ──────────►    ┆  FWD MB0
t=2          (等梯度)   ◄── BWD MB0   ┆  FWD MB1
t=3          BWD MB0                 ┆  BWD MB1
t=4          BWD MB1  (完成)        ┆  (完成)