Image: Three-stage large language model training workflow

Description: A diagram showing a basic three-stage workflow for training LLMs.
% Compile with: pdflatex or lualatex
\documentclass[tikz,border=6pt]{standalone}
\usepackage{tikz}
\usetikzlibrary{arrows.meta,positioning,fit,shapes.geometric,shapes.symbols,calc,backgrounds}
\begin{document}
% Picture-wide defaults plus the named styles used by every node and arrow.
% captionTextStyle and legendSwatchStyle refer to \captionTextWidth and
% \legendSwatchWidth; those macros must be defined before the styles are
% applied to a node.
\begin{tikzpicture}[
  font=\sffamily,
  >=Latex,
  % --- Flow-chart nodes ---
  % Data sources: cylinder shape with its border rotated by 90 degrees.
  dataNodeStyle/.style={cylinder, draw, thick, shape border rotate=90, aspect=0.25,
    minimum height=1.8cm, minimum width=3.6cm, fill=blue!5, align=center},
  % Shared geometry of the rounded boxes; the variants below differ only in fill.
  flowBoxStyle/.style={rounded corners, draw, thick, align=center, minimum height=1.2cm},
  processNodeStyle/.style={flowBoxStyle, fill=green!5},
  rewardNodeStyle/.style={flowBoxStyle, fill=orange!15},
  humanNodeStyle/.style={flowBoxStyle, fill=purple!12},
  % --- Stage cards ---
  stageBoxStyle/.style={draw, rounded corners, very thick, fill=white},
  % Header bar: zero inner sep so its size comes from the fit and minimum height.
  stageHeaderStyle/.style={draw, very thick, fill=gray!15, align=center,
    minimum height=1.2cm, inner sep=0pt, font=\large\bfseries},
  % Borderless, left-aligned explanatory text placed beside each node.
  captionTextStyle/.style={draw=none, fill=none, align=left,
    text width=\captionTextWidth, inner xsep=0pt, inner ysep=0pt},
  % --- Legend ---
  legendSwatchStyle/.style={rounded corners, draw, very thick, align=center,
    minimum height=0.95cm, text width=\legendSwatchWidth, font=\bfseries\small},
  legendBoxStyle/.style={stageBoxStyle},
  % --- Connectors ---
  arrowStyle/.style={-Latex, very thick}
]
% Reference point: stage edges and the legend are positioned relative to it.
\coordinate (origin) at (0,0);
% ---------- Unified padding ----------
\newcommand*{\cardInnerSep}{18pt}      % inner sep of the fitted card boxes
\newcommand*{\cardContentPadV}{5mm}    % vertical gap between card content and card edge
\newcommand*{\headerHalfHeight}{6mm}   % half of the 1.2cm header minimum height
% ---------- Layout (tight) ----------
\newcommand*{\captionXShift}{9.5cm}    % requested
\newcommand*{\captionTextWidth}{8cm}
\newcommand*{\stageLeft}{-5.0cm}       % x of the card's left fit point, relative to origin
\newcommand*{\stageRight}{14.0cm}      % x of the card's right fit point, relative to origin
\newcommand*{\nodeWidth}{7.2cm}
\newcommand*{\nodeWidthMedium}{8.6cm}
\newcommand*{\nodeWidthLarge}{8.6cm}
% ---------- Arrow geometry ----------
% How far each cross-stage arrow runs left of its source node's west anchor
% before turning vertically.
\newcommand*{\leftBaseToSFT}{3mm}
\newcommand*{\leftInstructionToRL}{10mm}
\newcommand*{\leftInstructionToSample}{3mm}
\newcommand*{\leftPromptToRL}{32mm}
% Reusable vertical offsets.
\newcommand*{\offSmall}{+2mm}
\newcommand*{\offSmallNeg}{-2mm}
\newcommand*{\offTinyNeg}{-1mm}
% Vertical offset applied at the source node's west anchor.
\newcommand*{\origOffsetBaseToSFT}{0mm}
\newcommand*{\origOffsetInstructionToRL}{+2mm}
\newcommand*{\origOffsetInstructionToSample}{-2mm}
\newcommand*{\origOffsetPromptToRL}{0mm}
% Vertical offset applied at the destination node's west anchor.
\newcommand*{\destOffsetBaseToSFT}{0mm}
\newcommand*{\destOffsetInstructionToRL}{\offSmallNeg}
\newcommand*{\destOffsetInstructionToSample}{\offTinyNeg}
\newcommand*{\destOffsetPromptToRL}{\offSmall}
% ---------- Legend spacing ----------
\newcommand*{\legendSwatchWidth}{3.6cm}
\newcommand*{\legendColumnGap}{2.5cm}  % a little tighter to fit 3 columns cleanly
\newcommand*{\legendRowSep}{16mm}
% ---------- Helpers ----------
% \aligned{<node>}{<dx>}
%   Expands to a calc coordinate: the point (<node>) shifted horizontally by <dx>.
%   NOTE(review): the name would collide with amsmath's aligned environment if
%   amsmath were ever loaded; it is not loaded in this file.
\newcommand{\aligned}[2]{($(#1)+(#2,0)$)}
% \placeCaption{<node>}{<text>}
%   Places <text> in a captionTextStyle node named <node>_caption, positioned
%   \captionXShift to the right of (<node>). The macro already terminates the
%   \node path with its own semicolon.
\newcommand{\placeCaption}[2]{%
  \node[captionTextStyle] (#1_caption) at \aligned{#1}{\captionXShift} {#2};%
}
% \squareArrowWestSimple{<from>}{<to>}{<left run>}{<start dy>}{<end dy>}
%   Draws a right-angled arrow that leaves <from> at its west anchor (shifted
%   vertically by <start dy>), runs <left run> to the left, travels vertically,
%   and enters <to> at its west anchor (shifted vertically by <end dy>).
%   The coordinates startPt, leftPt and destPt are overwritten on every call.
\newcommand{\squareArrowWestSimple}[5]{%
  \coordinate (startPt) at ($(#1.west)+(0,#4)$);%
  \coordinate (leftPt)  at ($(startPt)+(-#3,0)$);%
  \coordinate (destPt)  at ($(#2.west)+(0,#5)$);%
  % "|-" goes vertically from leftPt to destPt's height, then horizontally.
  \draw[arrowStyle] (startPt) -- (leftPt) |- (destPt);%
}
% \boxedCardWithHeader{<name>}{<nodes>}{<top node>}{<bottom node>}{<title>}
%   Draws a card named <name> on the background layer around <nodes>, then a
%   header bar named <name>Header across the card's top edge.
%   <nodes>        space-separated list of parenthesised node names to enclose
%   <top node>     node whose north anchor sets the top of the content area
%   <bottom node>  node whose south anchor sets the bottom of the content area
%   <title>        header text, typeset as a label centred on the header bar
\newcommand{\boxedCardWithHeader}[5]{%
  % Spacer points reserve room for half the header plus padding above the
  % content, and padding below it.
  \coordinate (#1TopSpacer)    at ($ (#3.north) + (0,\headerHalfHeight+\cardContentPadV) $);%
  \coordinate (#1BottomSpacer) at ($ (#4.south) + (0,-\cardContentPadV) $);%
  % Fixed left/right fit points so every card spans the same horizontal range.
  \coordinate (#1LeftEdge)     at ($ (#3.north -| origin) + (\stageLeft,0) $);%
  \coordinate (#1RightEdge)    at ($ (#3.north -| origin) + (\stageRight,0) $);%
  % The card is drawn on the background layer so its white fill sits behind
  % the nodes it encloses.
  \begin{pgfonlayer}{background}%
    \node[stageBoxStyle,
          fit={#2 (#1TopSpacer) (#1BottomSpacer) (#1LeftEdge) (#1RightEdge)},
          inner sep=\cardInnerSep] (#1) {};%
  \end{pgfonlayer}%
  % The header node itself is empty; the title is attached as a centred label.
  \node[stageHeaderStyle,
        fit={(#1.north west) (#1.north east)},
        label=center:{#5}] (#1Header) {};%
}
% ===================== Stage 1 =====================
% Vertical chain: corpus -> pretraining -> base model.
\node[dataNodeStyle] (webdata) {Web,\\ Books, \\ Code, \\ Docs};
\node[processNodeStyle, minimum width=\nodeWidth] (pretrain) [below=12mm of webdata.south] {Pretrain LM\\(next-token prediction)};
% Model nodes reuse the process style with a white fill.
\node[processNodeStyle, fill=white, minimum width=\nodeWidthMedium] (base) [below=12mm of pretrain.south] {\bfseries Pretrained Base Model};
\draw[arrowStyle] (webdata) -- (pretrain);
\draw[arrowStyle] (pretrain) -- (base);
% Captions: \placeCaption ends its own \node path, so no trailing semicolon here.
% Non-breaking hyphens are written as \mbox{-} to keep the source ASCII-only.
\placeCaption{webdata}{Large public corpora of language and code: Internet text in multiple languages, digitized books and papers, and open\mbox{-}source code repositories.}
\placeCaption{pretrain}{Train a model that, given a sequence of input text, predicts a probability distribution of the next token. The model learns grammar, facts and coding from raw text.}
\placeCaption{base}{Strong text and code completion; not yet instruction\mbox{-}following or preference\mbox{-}aligned.}
% Card around the three nodes and their captions; the first and last captions
% set the top and bottom of the content area.
\boxedCardWithHeader{stageOneBox}
  {(webdata) (pretrain) (base) (webdata_caption) (pretrain_caption) (base_caption)}
  {webdata_caption}{base_caption}
  {\bfseries Stage 1: Pretraining}
% ===================== Stage 2 =====================
% The first node sits 40mm below the bottom edge of the Stage 1 card, on the
% vertical line through origin.
\coordinate (stageTwoTop) at ($(stageOneBox.south -| origin)+(0,-40mm)$);
\node[dataNodeStyle, minimum width=3.8cm] (sftdata) at (stageTwoTop) {Prompts \\ + ideal answers};
\node[processNodeStyle, minimum width=\nodeWidth] (sft) [below=16mm of sftdata.south] {Supervised fine-tune\\on demonstrations};
\node[processNodeStyle, fill=white, minimum width=\nodeWidthMedium] (sftmodel) [below=12mm of sft.south] {\bfseries Instruction-tuned Model};
\draw[arrowStyle] (sftdata.south) -- (sft.north);
\draw[arrowStyle] (sft) -- (sftmodel);
% Captions: \placeCaption ends its own \node path, so no trailing semicolon here.
% Quotes, dashes and non-breaking hyphens use ASCII TeX input (``...'', --, \mbox{-}).
\placeCaption{sftdata}{Prompt--response demonstrations that specify tone, format, and constraints.\\[2ex] Example: ``Explain binary search to a 10\mbox{-}year\mbox{-}old in five sentences.''}
\placeCaption{sft}{Train the model to imitate the reference answers so that it follows instructions.}
\placeCaption{sftmodel}{Follows instructions reliably; matches requested tone, format, and length.}
\boxedCardWithHeader{stageTwoBox}
  {(sftdata) (sft) (sftmodel) (sftdata_caption) (sft_caption) (sftmodel_caption)}
  {sftdata_caption}{sftmodel_caption}
  {\bfseries Stage 2: Supervised Fine-Tuning (SFT)}
% ===================== Stage 3 =====================
% The first node sits 40mm below the bottom edge of the Stage 2 card, on the
% vertical line through origin.
\coordinate (stageThreeTop) at ($(stageTwoBox.south -| origin)+(0,-40mm)$);
\node[dataNodeStyle, minimum width=3.8cm] (promptset) at (stageThreeTop) {Prompts};
\node[processNodeStyle, minimum width=\nodeWidth] (samplecand) [below=16mm of promptset.south] {Sample $k$ outputs\\from policy for each prompt};
\node[humanNodeStyle, minimum width=\nodeWidth] (rank) [below=11mm of samplecand.south] {Humans rank outputs\\(pairwise / listwise)};
\node[dataNodeStyle, minimum width=\nodeWidth] (prefdata) [below=11mm of rank.south] {Preference data\\(rankings / pairs)};
\node[rewardNodeStyle, minimum width=\nodeWidthMedium] (rm) [below=11mm of prefdata.south] {Train Reward Model\\(predict preference / score)};
\node[processNodeStyle, minimum width=\nodeWidthLarge] (ppo) [below=12mm of rm.south] {Reinforcement Learning (e.g., Proximal Policy Optimization)};
\node[processNodeStyle, fill=white, minimum width=\nodeWidthMedium] (final) [below=12mm of ppo.south] {\bfseries Aligned Assistant Model};
% Straight top-to-bottom connectors along the main column.
\draw[arrowStyle] (promptset.south) -- (samplecand.north);
\draw[arrowStyle] (samplecand) -- (rank);
\draw[arrowStyle] (rank) -- (prefdata);
\draw[arrowStyle] (prefdata) -- (rm);
\draw[arrowStyle] (rm) -- (ppo);
\draw[arrowStyle] (ppo) -- (final);
% Captions: \placeCaption ends its own \node path, so no trailing semicolon here.
% Dashes are written as ASCII TeX input (--).
\placeCaption{promptset}{Broad prompts for writing, QA, reasoning, coding, dialogue, and tool use.}
\placeCaption{samplecand}{Generate several diverse candidates per prompt.}
\placeCaption{rank}{Human labelers rank candidates.\\[2ex]For example, D $>$ C $>$ A $>$ B}
\placeCaption{prefdata}{Convert rankings to preference pairs or partial orders for training.}
\placeCaption{rm}{Reward model predicts a scalar preference score for each prompt--output pair. Once we have this model we no longer need human graders in the loop.\\[2ex]Note: This model can be reused and can originate from a completely different base model.}
\placeCaption{ppo}{Optimize the policy for higher reward while limiting Kullback--Leibler (KL) divergence from the SFT reference.}
\placeCaption{final}{A model that has been trained to satisfy the reward model. It is helpful, concise, and safe; follows instructions and meets constraints.}
\boxedCardWithHeader{stageThreeBox}
  {(promptset) (samplecand) (rank) (prefdata) (rm) (ppo) (final)
   (promptset_caption) (samplecand_caption) (rank_caption) (prefdata_caption)
   (rm_caption) (ppo_caption) (final_caption)}
  {promptset_caption}{final_caption}
  {\bfseries Stage 3: Reinforcement Learning from Human Feedback (RLHF)}
% ---------- Cross-stage arrows ----------
% Each call routes an arrow out of the source's west side and into the
% destination's west side; the three length arguments are the left run, the
% vertical offset at the source, and the vertical offset at the destination.
% Stage 1 base model -> Stage 2 supervised fine-tuning.
\squareArrowWestSimple{base}{sft}
  {\leftBaseToSFT}{\origOffsetBaseToSFT}{\destOffsetBaseToSFT}
% Stage 2 instruction-tuned model -> Stage 3 reinforcement learning.
\squareArrowWestSimple{sftmodel}{ppo}
  {\leftInstructionToRL}{\origOffsetInstructionToRL}{\destOffsetInstructionToRL}
% Stage 2 instruction-tuned model -> Stage 3 candidate sampling.
\squareArrowWestSimple{sftmodel}{samplecand}
  {\leftInstructionToSample}{\origOffsetInstructionToSample}{\destOffsetInstructionToSample}
% Stage 3 prompt set -> Stage 3 reinforcement learning.
\squareArrowWestSimple{promptset}{ppo}
  {\leftPromptToRL}{\origOffsetPromptToRL}{\destOffsetPromptToRL}
% ===================== Legend (3 columns x 2 rows, swatches only) =====================
% The first swatch row sits 40mm below the bottom edge of the Stage 3 card; the
% left/right fit points reuse the stage card extents.
\coordinate (legendTopAnchor) at ($(stageThreeBox.south -| origin)+(0,-40mm)$);
\coordinate (legendLeftEdge)  at ($ (legendTopAnchor -| origin) + (\stageLeft,0) $);
\coordinate (legendRightEdge) at ($ (legendTopAnchor -| origin) + (\stageRight,0) $);
% Column anchors: west edge of each swatch column.
\coordinate (legendCol1) at ($(legendLeftEdge)+(12mm,0)$);
\coordinate (legendCol2) at ($(legendCol1)+(\legendSwatchWidth+\legendColumnGap,0)$);
\coordinate (legendCol3) at ($(legendCol2)+(\legendSwatchWidth+\legendColumnGap,0)$);
% Row 1: Data | Process | Reward
\node[legendSwatchStyle, fill=blue!5,    anchor=west] (legendData)    at (legendCol1) {Data};
\node[legendSwatchStyle, fill=green!5,   anchor=west] (legendProcess) at (legendCol2) {Process};
\node[legendSwatchStyle, fill=orange!15, anchor=west] (legendReward)  at (legendCol3) {Reward};
% Row 2: Human | Model | Stage (one \legendRowSep below row 1)
\node[legendSwatchStyle, fill=purple!12, anchor=west] (legendHuman) at ($(legendCol1)+(0,-\legendRowSep)$) {Human};
\node[legendSwatchStyle, fill=white,     anchor=west] (legendModel) at ($(legendCol2)+(0,-\legendRowSep)$) {Model};
\node[legendSwatchStyle, fill=gray!15,   anchor=west] (legendStage) at ($(legendCol3)+(0,-\legendRowSep)$) {Stage};
% Symmetric padding spacers: room for half the header plus padding above the
% first row, and padding below the second row.
\coordinate (legendTopSpacer)    at ($ (legendData.north) + (0,\headerHalfHeight+\cardContentPadV) $);
\coordinate (legendBottomSpacer) at ($ (legendHuman.south) + (0,-\cardContentPadV) $);
% Box on the background layer so its white fill sits behind the swatches.
\begin{pgfonlayer}{background}
  \node[legendBoxStyle,
        fit={(legendData) (legendProcess) (legendReward)
             (legendHuman) (legendModel) (legendStage)
             (legendTopSpacer) (legendBottomSpacer)
             (legendLeftEdge) (legendRightEdge)},
        inner sep=\cardInnerSep] (legendBox) {};
\end{pgfonlayer}
% Header: the fitted node stays empty and the title is attached as a centred
% label, the same construction \boxedCardWithHeader uses for the stage headers.
\node[stageHeaderStyle,
      fit={(legendBox.north west) (legendBox.north east)},
      label=center:{\bfseries Legend}] (legendHeader) {};
\end{tikzpicture}
\end{document}
Usage Terms: Public domain
Image usage
The following page links to this image: