Spaces:
Running
Running
update
Browse files- assets/data/benchmarks/memusage_activations.html +2 -0
- assets/images/activation_recomputation.js +149 -0
- assets/images/activation_recomputation.svg +768 -0
- assets/images/first_steps_memory_profile.js +2 -2
- assets/images/first_steps_simple_training.js +2 -2
- dist/assets/.DS_Store +0 -0
- dist/assets/data/benchmarks/memusage_activations.html +2 -0
- dist/assets/images/activation_recomputation.js +1 -0
- dist/assets/images/activation_recomputation.svg +768 -0
- dist/assets/images/first_steps_memory_profile.js +1 -1
- dist/assets/images/first_steps_simple_training.js +1 -1
- dist/index.html +20 -4
- src/index.html +22 -3
assets/data/benchmarks/memusage_activations.html
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
<div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
|
2 |
+
<script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
|
assets/images/activation_recomputation.js
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
// Function to enhance the SVG content by adding styles and data attributes
|
3 |
+
function enhanceSVGContent(originalContent) {
|
4 |
+
const parser = new DOMParser();
|
5 |
+
const doc = parser.parseFromString(originalContent, 'image/svg+xml');
|
6 |
+
|
7 |
+
// Create a style element with hover effects and insert it as the first child of the SVG
|
8 |
+
const styleElement = doc.createElementNS('http://www.w3.org/2000/svg', 'style');
|
9 |
+
styleElement.textContent = `
|
10 |
+
path[data-element-type="layer"] {
|
11 |
+
transition: all 0.3s;
|
12 |
+
cursor: pointer;
|
13 |
+
}
|
14 |
+
path[data-element-type="layer"]:hover {
|
15 |
+
fill: #b197fc !important;
|
16 |
+
transform: translate(0, -2px);
|
17 |
+
}
|
18 |
+
|
19 |
+
path[data-element-type="layer-updated"] {
|
20 |
+
transition: all 0.3s;
|
21 |
+
cursor: pointer;
|
22 |
+
}
|
23 |
+
|
24 |
+
path[data-element-type="layer-updated"]:hover {
|
25 |
+
fill:rgb(103, 56, 244) !important;
|
26 |
+
transform: scale(1.02);
|
27 |
+
transform: translate(0, -2px);
|
28 |
+
}
|
29 |
+
|
30 |
+
path[data-element-type="gradient"] {
|
31 |
+
transition: all 0.3s;
|
32 |
+
cursor: pointer;
|
33 |
+
}
|
34 |
+
path[data-element-type="gradient"]:hover {
|
35 |
+
fill: #f06595 !important;
|
36 |
+
transform: translate(0, -2px);
|
37 |
+
}
|
38 |
+
|
39 |
+
path[data-element-type="forward"] {
|
40 |
+
transition: all 0.3s;
|
41 |
+
cursor: pointer;
|
42 |
+
}
|
43 |
+
path[data-element-type="forward"]:hover {
|
44 |
+
stroke: #0c8599 !important;
|
45 |
+
stroke-width: 4 !important;
|
46 |
+
}
|
47 |
+
|
48 |
+
path[data-element-type="backward"] {
|
49 |
+
transition: all 0.3s;
|
50 |
+
cursor: pointer;
|
51 |
+
}
|
52 |
+
path[data-element-type="backward"]:hover {
|
53 |
+
stroke: #e8590c !important;
|
54 |
+
stroke-width: 4 !important;
|
55 |
+
}
|
56 |
+
|
57 |
+
path[data-element-type="optimization"] {
|
58 |
+
transition: all 0.3s;
|
59 |
+
cursor: pointer;
|
60 |
+
}
|
61 |
+
path[data-element-type="optimization"]:hover {
|
62 |
+
stroke: #087f5b !important;
|
63 |
+
stroke-width: 4 !important;
|
64 |
+
}
|
65 |
+
`;
|
66 |
+
doc.documentElement.insertBefore(styleElement, doc.documentElement.firstChild);
|
67 |
+
|
68 |
+
// Process neural network layers (purple nodes)
|
69 |
+
doc.querySelectorAll('path[fill="#d0bfff"]').forEach((node, index) => {
|
70 |
+
node.setAttribute('data-element-id', `layer-${index}`);
|
71 |
+
node.setAttribute('data-element-type', 'layer');
|
72 |
+
});
|
73 |
+
|
74 |
+
doc.querySelectorAll('path[fill="#9775fa"]').forEach((node, index) => {
|
75 |
+
node.setAttribute('data-element-id', `layer-updated-${index}`);
|
76 |
+
node.setAttribute('data-element-type', 'layer-updated');
|
77 |
+
});
|
78 |
+
|
79 |
+
// Process gradient nodes (pink nodes)
|
80 |
+
doc.querySelectorAll('path[fill="#f783ac"]').forEach((node, index) => {
|
81 |
+
node.setAttribute('data-element-id', `gradient-${index}`);
|
82 |
+
node.setAttribute('data-element-type', 'gradient');
|
83 |
+
});
|
84 |
+
|
85 |
+
// Process arrows by matching stroke colors
|
86 |
+
const arrowTypes = {
|
87 |
+
'#15aabf': 'forward',
|
88 |
+
'#fd7e14': 'backward',
|
89 |
+
'#099268': 'optimization'
|
90 |
+
};
|
91 |
+
|
92 |
+
Object.entries(arrowTypes).forEach(([color, type]) => {
|
93 |
+
doc.querySelectorAll(`path[stroke="${color}"]`).forEach((arrow, index) => {
|
94 |
+
arrow.setAttribute('data-element-id', `${type}-${index}`);
|
95 |
+
arrow.setAttribute('data-element-type', type);
|
96 |
+
});
|
97 |
+
});
|
98 |
+
|
99 |
+
// Make the SVG responsive
|
100 |
+
doc.documentElement.setAttribute('width', '100%');
|
101 |
+
doc.documentElement.setAttribute('height', '100%');
|
102 |
+
doc.documentElement.setAttribute('preserveAspectRatio', 'xMidYMid meet');
|
103 |
+
|
104 |
+
return new XMLSerializer().serializeToString(doc);
|
105 |
+
}
|
106 |
+
|
107 |
+
// Function to load an SVG file via fetch
|
108 |
+
async function loadSVG(url, containerId) {
|
109 |
+
try {
|
110 |
+
const response = await fetch(url);
|
111 |
+
if (!response.ok) {
|
112 |
+
throw new Error(`HTTP error! Status: ${response.status}`);
|
113 |
+
}
|
114 |
+
const svgText = await response.text();
|
115 |
+
const enhancedSVG = enhanceSVGContent(svgText);
|
116 |
+
document.getElementById(containerId).innerHTML = enhancedSVG;
|
117 |
+
} catch (error) {
|
118 |
+
console.error('Error loading SVG:', error);
|
119 |
+
document.getElementById(containerId).innerHTML = '<p>Error loading SVG.</p>';
|
120 |
+
}
|
121 |
+
}
|
122 |
+
|
123 |
+
// Load the SVG file (adjust the path if needed)
|
124 |
+
loadSVG('../assets/images/activation_recomputation.svg', 'svg-activation_recomputation');
|
125 |
+
|
126 |
+
// Set up event listeners to display a description of the hovered element
|
127 |
+
const svgContainer3 = document.getElementById('svg-activation_recomputation');
|
128 |
+
|
129 |
+
svgContainer3.addEventListener('mouseover', function (event) {
|
130 |
+
const target = event.target;
|
131 |
+
if (target.tagName.toLowerCase() === 'path' && target.hasAttribute('data-element-id')) {
|
132 |
+
const elementId = target.getAttribute('data-element-id');
|
133 |
+
const elementType = target.getAttribute('data-element-type');
|
134 |
+
const descriptions = {
|
135 |
+
layer: 'Neural Network Layer',
|
136 |
+
'layer-updated': 'Neural Network Layer (updated)',
|
137 |
+
gradient: 'Gradient Update Layer',
|
138 |
+
forward: 'Forward Pass Connection',
|
139 |
+
backward: 'Backward Pass Connection',
|
140 |
+
optimization: 'Optimization Step'
|
141 |
+
};
|
142 |
+
const description = descriptions[elementType] || elementType;
|
143 |
+
document.getElementById('svg-activation_recomputation-info').textContent = `Hovering over: ${description} (${elementId})`;
|
144 |
+
}
|
145 |
+
});
|
146 |
+
|
147 |
+
svgContainer3.addEventListener('mouseout', function () {
|
148 |
+
document.getElementById('svg-activation_recomputation-info').textContent = 'Hover over the network elements to see their details';
|
149 |
+
});
|
assets/images/activation_recomputation.svg
ADDED
|
assets/images/first_steps_memory_profile.js
CHANGED
@@ -151,10 +151,10 @@ svgContainer2.addEventListener('mouseover', function(event) {
|
|
151 |
'cache': 'Cache Operation'
|
152 |
};
|
153 |
const description = descriptions[elementType] || elementType;
|
154 |
-
document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
|
155 |
}
|
156 |
});
|
157 |
|
158 |
svgContainer2.addEventListener('mouseout', function() {
|
159 |
-
document.getElementById('info').textContent = 'Hover over the
|
160 |
});
|
|
|
151 |
'cache': 'Cache Operation'
|
152 |
};
|
153 |
const description = descriptions[elementType] || elementType;
|
154 |
+
document.getElementById('svg-first_steps_memory_profile-info').textContent = `Hovering over: ${description} (${elementId})`;
|
155 |
}
|
156 |
});
|
157 |
|
158 |
svgContainer2.addEventListener('mouseout', function() {
|
159 |
+
document.getElementById('svg-first_steps_memory_profile-info').textContent = 'Hover over the elements to see their details';
|
160 |
});
|
assets/images/first_steps_simple_training.js
CHANGED
@@ -140,10 +140,10 @@ svgContainer.addEventListener('mouseover', function (event) {
|
|
140 |
optimization: 'Optimization Step'
|
141 |
};
|
142 |
const description = descriptions[elementType] || elementType;
|
143 |
-
document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
|
144 |
}
|
145 |
});
|
146 |
|
147 |
svgContainer.addEventListener('mouseout', function () {
|
148 |
-
document.getElementById('info').textContent = 'Hover over the network elements to see their details';
|
149 |
});
|
|
|
140 |
optimization: 'Optimization Step'
|
141 |
};
|
142 |
const description = descriptions[elementType] || elementType;
|
143 |
+
document.getElementById('svg-first_steps_simple_training-info').textContent = `Hovering over: ${description} (${elementId})`;
|
144 |
}
|
145 |
});
|
146 |
|
147 |
svgContainer.addEventListener('mouseout', function () {
|
148 |
+
document.getElementById('svg-first_steps_simple_training-info').textContent = 'Hover over the network elements to see their details';
|
149 |
});
|
dist/assets/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
dist/assets/data/benchmarks/memusage_activations.html
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
<div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
|
2 |
+
<script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], {"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
|
dist/assets/images/activation_recomputation.js
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/activation_recomputation.svg","svg-activation_recomputation");const svgContainer3=document.getElementById("svg-activation_recomputation");svgContainer3.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("svg-activation_recomputation-info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer3.addEventListener("mouseout",(function(){document.getElementById("svg-activation_recomputation-info").textContent="Hover over the network elements to see their details"}));
|
dist/assets/images/activation_recomputation.svg
ADDED
|
dist/assets/images/first_steps_memory_profile.js
CHANGED
@@ -1 +1 @@
|
|
1 |
-
function enhanceSVGContent2(e){const t=(new DOMParser).parseFromString(e,"image/svg+xml"),n=t.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n /* Memory Block (free memory) */\n path[data-element-type="memory-block"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block"]:hover {\n fill: #a5d6a7 !important; /* slightly darker than original */\n transform: translate(0, -2px);\n }\n\n /* Memory Block (updated) */\n path[data-element-type="memory-block-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block-updated"]:hover {\n fill: #81c784 !important;\n transform: scale(1.02) translate(0, -2px);\n }\n\n /* Stack Segment */\n path[data-element-type="stack"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="stack"]:hover {\n fill: #ffd54f !important;\n transform: translate(0, -2px);\n }\n\n /* Read Operation Arrow */\n path[data-element-type="read"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="read"]:hover {\n stroke: #1e88e5 !important;\n stroke-width: 4 !important;\n }\n\n /* Write Operation Arrow */\n path[data-element-type="write"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="write"]:hover {\n stroke: #d32f2f !important;\n stroke-width: 4 !important;\n }\n\n /* Cache Operation Arrow */\n path[data-element-type="cache"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="cache"]:hover {\n stroke: #fbc02d !important;\n stroke-width: 4 !important;\n }\n ',t.documentElement.insertBefore(n,t.documentElement.firstChild),t.querySelectorAll('path[fill="#c8e6c9"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-${t}`),e.setAttribute("data-element-type","memory-block")})),t.querySelectorAll('path[fill="#a5d6a7"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-updated-${t}`),e.setAttribute("data-element-type","memory-block-updated")})),t.querySelectorAll('path[fill="#ffe082"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`stack-${t}`),e.setAttribute("data-element-type","stack")})),Object.entries({"#42a5f5":"read","#ef5350":"write","#ffca28":"cache"}).forEach((([e,n])=>{t.querySelectorAll(`path[stroke="${e}"]`).forEach(((e,t)=>{e.setAttribute("data-element-id",`${n}-${t}`),e.setAttribute("data-element-type",n)}))})),t.documentElement.setAttribute("width","100%"),t.documentElement.setAttribute("height","100%"),t.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(t)}async function loadSVG(e,t){try{console.log("Loading SVG from:",e);const n=await fetch(e);if(!n.ok)throw new Error(`HTTP error! Status: ${n.status}`);const r=enhanceSVGContent2(await n.text());document.getElementById(t).innerHTML=r}catch(e){console.error("Error loading SVG:",e),document.getElementById(t).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_memory_profile.svg","svg-first_steps_memory_profile");const svgContainer2=document.getElementById("svg-first_steps_memory_profile");svgContainer2.addEventListener("mouseover",(function(e){const t=e.target;if("path"===t.tagName.toLowerCase()&&t.hasAttribute("data-element-id")){const e=t.getAttribute("data-element-id"),n=t.getAttribute("data-element-type"),r={"memory-block":"Memory Block","memory-block-updated":"Memory Block (updated)",stack:"Stack Segment",read:"Memory Read",write:"Memory Write",cache:"Cache Operation"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${r} (${e})`}})),svgContainer2.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the
|
|
|
1 |
+
function enhanceSVGContent2(e){const t=(new DOMParser).parseFromString(e,"image/svg+xml"),n=t.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n /* Memory Block (free memory) */\n path[data-element-type="memory-block"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block"]:hover {\n fill: #a5d6a7 !important; /* slightly darker than original */\n transform: translate(0, -2px);\n }\n\n /* Memory Block (updated) */\n path[data-element-type="memory-block-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block-updated"]:hover {\n fill: #81c784 !important;\n transform: scale(1.02) translate(0, -2px);\n }\n\n /* Stack Segment */\n path[data-element-type="stack"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="stack"]:hover {\n fill: #ffd54f !important;\n transform: translate(0, -2px);\n }\n\n /* Read Operation Arrow */\n path[data-element-type="read"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="read"]:hover {\n stroke: #1e88e5 !important;\n stroke-width: 4 !important;\n }\n\n /* Write Operation Arrow */\n path[data-element-type="write"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="write"]:hover {\n stroke: #d32f2f !important;\n stroke-width: 4 !important;\n }\n\n /* Cache Operation Arrow */\n path[data-element-type="cache"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="cache"]:hover {\n stroke: #fbc02d !important;\n stroke-width: 4 !important;\n }\n ',t.documentElement.insertBefore(n,t.documentElement.firstChild),t.querySelectorAll('path[fill="#c8e6c9"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-${t}`),e.setAttribute("data-element-type","memory-block")})),t.querySelectorAll('path[fill="#a5d6a7"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-updated-${t}`),e.setAttribute("data-element-type","memory-block-updated")})),t.querySelectorAll('path[fill="#ffe082"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`stack-${t}`),e.setAttribute("data-element-type","stack")})),Object.entries({"#42a5f5":"read","#ef5350":"write","#ffca28":"cache"}).forEach((([e,n])=>{t.querySelectorAll(`path[stroke="${e}"]`).forEach(((e,t)=>{e.setAttribute("data-element-id",`${n}-${t}`),e.setAttribute("data-element-type",n)}))})),t.documentElement.setAttribute("width","100%"),t.documentElement.setAttribute("height","100%"),t.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(t)}async function loadSVG(e,t){try{console.log("Loading SVG from:",e);const n=await fetch(e);if(!n.ok)throw new Error(`HTTP error! Status: ${n.status}`);const r=enhanceSVGContent2(await n.text());document.getElementById(t).innerHTML=r}catch(e){console.error("Error loading SVG:",e),document.getElementById(t).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_memory_profile.svg","svg-first_steps_memory_profile");const svgContainer2=document.getElementById("svg-first_steps_memory_profile");svgContainer2.addEventListener("mouseover",(function(e){const t=e.target;if("path"===t.tagName.toLowerCase()&&t.hasAttribute("data-element-id")){const e=t.getAttribute("data-element-id"),n=t.getAttribute("data-element-type"),r={"memory-block":"Memory Block","memory-block-updated":"Memory Block (updated)",stack:"Stack Segment",read:"Memory Read",write:"Memory Write",cache:"Cache Operation"}[n]||n;document.getElementById("svg-first_steps_memory_profile-info").textContent=`Hovering over: ${r} (${e})`}})),svgContainer2.addEventListener("mouseout",(function(){document.getElementById("svg-first_steps_memory_profile-info").textContent="Hover over the elements to see their details"}));
|
dist/assets/images/first_steps_simple_training.js
CHANGED
@@ -1 +1 @@
|
|
1 |
-
function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_simple_training.svg","svg-first_steps_simple_training");const svgContainer=document.getElementById("svg-first_steps_simple_training");svgContainer.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the network elements to see their details"}));
|
|
|
1 |
+
function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n }\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_simple_training.svg","svg-first_steps_simple_training");const svgContainer=document.getElementById("svg-first_steps_simple_training");svgContainer.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("svg-first_steps_simple_training-info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer.addEventListener("mouseout",(function(){document.getElementById("svg-first_steps_simple_training-info").textContent="Hover over the network elements to see their details"}));
|
dist/index.html
CHANGED
@@ -234,7 +234,7 @@
|
|
234 |
<p>It looks generally like this: </p>
|
235 |
|
236 |
<div class="svg-container" id="svg-first_steps_simple_training"> </div>
|
237 |
-
<div class="info" id="info">Hover over the network elements to see their details</div>
|
238 |
<script src="../assets/images/first_steps_simple_training.js"></script>
|
239 |
|
240 |
<p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
|
@@ -297,8 +297,14 @@
|
|
297 |
|
298 |
<p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
|
299 |
|
|
|
300 |
<!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
301 |
<script src="../assets/images/first_steps_memory_profile.js"></script>-->
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
<iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
304 |
|
@@ -353,7 +359,7 @@
|
|
353 |
<div class="note-box">
|
354 |
<p class="note-box-title">π Note</p>
|
355 |
<p class="note-box-content">
|
356 |
-
Some
|
357 |
|
358 |
</p>
|
359 |
</div>
|
@@ -415,7 +421,15 @@
|
|
415 |
|
416 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
417 |
|
418 |
-
<p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
|
420 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that weβll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
421 |
|
@@ -429,7 +443,9 @@
|
|
429 |
|
430 |
<p>The general idea behind <strong><em>activation recomputation</em></strong> β also called <em>gradient checkpointing</em> or <em>rematerialization</em> β is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
|
431 |
|
432 |
-
<
|
|
|
|
|
433 |
|
434 |
<p>There are several strategies to select key activations to store:</p>
|
435 |
|
|
|
234 |
<p>It looks generally like this: </p>
|
235 |
|
236 |
<div class="svg-container" id="svg-first_steps_simple_training"> </div>
|
237 |
+
<div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
|
238 |
<script src="../assets/images/first_steps_simple_training.js"></script>
|
239 |
|
240 |
<p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
|
|
|
297 |
|
298 |
<p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
|
299 |
|
300 |
+
<<<<<<< HEAD
|
301 |
<!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
302 |
<script src="../assets/images/first_steps_memory_profile.js"></script>-->
|
303 |
+
=======
|
304 |
+
<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
305 |
+
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
306 |
+
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
307 |
+
>>>>>>> a1429a9 (update)
|
308 |
|
309 |
<iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
310 |
|
|
|
359 |
<div class="note-box">
|
360 |
<p class="note-box-title">π Note</p>
|
361 |
<p class="note-box-content">
|
362 |
+
Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
|
363 |
|
364 |
</p>
|
365 |
</div>
|
|
|
421 |
|
422 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
423 |
|
424 |
+
<!-- <p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p> -->
|
425 |
+
<iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
426 |
+
<script>
|
427 |
+
window.addEventListener('load', function() {
|
428 |
+
const frame = document.getElementById('plotFrame2');
|
429 |
+
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
430 |
+
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
431 |
+
});
|
432 |
+
</script>
|
433 |
|
434 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that weβll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
435 |
|
|
|
443 |
|
444 |
<p>The general idea behind <strong><em>activation recomputation</em></strong> β also called <em>gradient checkpointing</em> or <em>rematerialization</em> β is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
|
445 |
|
446 |
+
<div class="svg-container" id="svg-activation_recomputation"> </div>
|
447 |
+
<div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
|
448 |
+
<script src="../assets/images/activation_recomputation.js"></script>
|
449 |
|
450 |
<p>There are several strategies to select key activations to store:</p>
|
451 |
|
src/index.html
CHANGED
@@ -234,7 +234,7 @@
|
|
234 |
<p>It looks generally like this: </p>
|
235 |
|
236 |
<div class="svg-container" id="svg-first_steps_simple_training"> </div>
|
237 |
-
<div class="info" id="info">Hover over the network elements to see their details</div>
|
238 |
<script src="../assets/images/first_steps_simple_training.js"></script>
|
239 |
|
240 |
<p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
|
@@ -297,8 +297,14 @@
|
|
297 |
|
298 |
<p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
|
299 |
|
|
|
300 |
<!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
301 |
<script src="../assets/images/first_steps_memory_profile.js"></script>-->
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
<iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
304 |
|
@@ -353,7 +359,7 @@
|
|
353 |
<div class="note-box">
|
354 |
<p class="note-box-title">π Note</p>
|
355 |
<p class="note-box-content">
|
356 |
-
Some
|
357 |
|
358 |
</p>
|
359 |
</div>
|
@@ -415,7 +421,14 @@
|
|
415 |
|
416 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
417 |
|
418 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
419 |
|
420 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that weβll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
421 |
|
@@ -429,7 +442,13 @@
|
|
429 |
|
430 |
<p>The general idea behind <strong><em>activation recomputation</em></strong> β also called <em>gradient checkpointing</em> or <em>rematerialization</em> β is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
|
431 |
|
|
|
432 |
<p><img alt="image.png" src="/assets/images/activation_recomputation.png" /></p>
|
|
|
|
|
|
|
|
|
|
|
433 |
|
434 |
<p>There are several strategies to select key activations to store:</p>
|
435 |
|
|
|
234 |
<p>It looks generally like this: </p>
|
235 |
|
236 |
<div class="svg-container" id="svg-first_steps_simple_training"> </div>
|
237 |
+
<div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
|
238 |
<script src="../assets/images/first_steps_simple_training.js"></script>
|
239 |
|
240 |
<p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
|
|
|
297 |
|
298 |
<p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
|
299 |
|
300 |
+
<<<<<<< HEAD
|
301 |
<!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
302 |
<script src="../assets/images/first_steps_memory_profile.js"></script>-->
|
303 |
+
=======
|
304 |
+
<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
|
305 |
+
<div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
|
306 |
+
<script src="../assets/images/first_steps_memory_profile.js"></script>
|
307 |
+
>>>>>>> a1429a9 (update)
|
308 |
|
309 |
<iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
|
310 |
|
|
|
359 |
<div class="note-box">
|
360 |
<p class="note-box-title">π Note</p>
|
361 |
<p class="note-box-content">
|
362 |
+
Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
|
363 |
|
364 |
</p>
|
365 |
</div>
|
|
|
421 |
|
422 |
<p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
|
423 |
|
424 |
+
<iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
|
425 |
+
<script>
|
426 |
+
window.addEventListener('load', function() {
|
427 |
+
const frame = document.getElementById('plotFrame2');
|
428 |
+
frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
|
429 |
+
frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
|
430 |
+
});
|
431 |
+
</script>
|
432 |
|
433 |
<p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that weβll discuss later) stays roughly independent of the sequence length and batch size.</p>
|
434 |
|
|
|
442 |
|
443 |
<p>The general idea behind <strong><em>activation recomputation</em></strong> β also called <em>gradient checkpointing</em> or <em>rematerialization</em> β is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade of memory for compute. It generally looks like this:</p>
|
444 |
|
445 |
+
<<<<<<< HEAD
|
446 |
<p><img alt="image.png" src="/assets/images/activation_recomputation.png" /></p>
|
447 |
+
=======
|
448 |
+
<div class="svg-container" id="svg-activation_recomputation"> </div>
|
449 |
+
<div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
|
450 |
+
<script src="../assets/images/activation_recomputation.js"></script>
|
451 |
+
>>>>>>> a1429a9 (update)
|
452 |
|
453 |
<p>There are several strategies to select key activations to store:</p>
|
454 |
|