thomwolf HF staff committed on
Commit
a8a77bd
·
1 Parent(s): fca5b64
assets/data/benchmarks/memusage_activations.html ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
2
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB 
memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
assets/images/activation_recomputation.js ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
// Function to enhance the SVG content by adding styles and data attributes
/**
 * Parses raw SVG markup, injects a <style> element with hover effects, tags
 * paths with `data-element-id` / `data-element-type` based on their fill or
 * stroke colour, makes the root element fill its container, and returns the
 * result serialised back to a string.
 *
 * @param {string} originalContent - SVG markup to enhance.
 * @returns {string} The enhanced SVG markup.
 * @throws {Error} If the markup is not well-formed XML.
 */
function enhanceSVGContent(originalContent) {
  const parser = new DOMParser();
  const doc = parser.parseFromString(originalContent, 'image/svg+xml');

  // DOMParser does not throw on malformed XML; it returns a document that
  // contains a <parsererror> element. Fail loudly instead of serialising
  // that error document as if it were the drawing.
  if (doc.querySelector('parsererror')) {
    throw new Error('Invalid SVG markup: XML parsing failed');
  }

  // Create a style element with hover effects and insert it as the first child of the SVG
  const styleElement = doc.createElementNS('http://www.w3.org/2000/svg', 'style');
  styleElement.textContent = `
    path[data-element-type="layer"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="layer"]:hover {
      fill: #b197fc !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="layer-updated"] {
      transition: all 0.3s;
      cursor: pointer;
    }

    path[data-element-type="layer-updated"]:hover {
      fill:rgb(103, 56, 244) !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="gradient"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="gradient"]:hover {
      fill: #f06595 !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="forward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="forward"]:hover {
      stroke: #0c8599 !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="backward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="backward"]:hover {
      stroke: #e8590c !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="optimization"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="optimization"]:hover {
      stroke: #087f5b !important;
      stroke-width: 4 !important;
    }
  `;
  // NOTE: the "layer-updated" hover rule used to declare `transform` twice
  // (scale, then translate). Only the last declaration ever applied, so the
  // dead `scale(1.02)` line was dropped; the rendered result is unchanged.
  doc.documentElement.insertBefore(styleElement, doc.documentElement.firstChild);

  // Tags every path matching `selector` with an id of the form `<type>-<index>`
  // and with the element type itself.
  const tagPaths = (selector, type) => {
    doc.querySelectorAll(selector).forEach((node, index) => {
      node.setAttribute('data-element-id', `${type}-${index}`);
      node.setAttribute('data-element-type', type);
    });
  };

  // Order matters: a path matched by more than one selector keeps the type
  // of the LAST matching entry, so fills are processed before strokes.
  const pathTypes = [
    ['path[fill="#d0bfff"]', 'layer'],            // neural network layers (purple nodes)
    ['path[fill="#9775fa"]', 'layer-updated'],
    ['path[fill="#f783ac"]', 'gradient'],         // gradient nodes (pink nodes)
    ['path[stroke="#15aabf"]', 'forward'],        // arrows, matched by stroke colour
    ['path[stroke="#fd7e14"]', 'backward'],
    ['path[stroke="#099268"]', 'optimization']
  ];
  pathTypes.forEach(([selector, type]) => tagPaths(selector, type));

  // Make the SVG responsive
  doc.documentElement.setAttribute('width', '100%');
  doc.documentElement.setAttribute('height', '100%');
  doc.documentElement.setAttribute('preserveAspectRatio', 'xMidYMid meet');

  return new XMLSerializer().serializeToString(doc);
}
106
+
107
// Function to load an SVG file via fetch
/**
 * Fetches an SVG file, passes its text through `enhanceSVGContent`, and
 * injects the result into the element with the given id. On any fetch or
 * processing failure the error is logged and a short error message is shown
 * in the container instead.
 *
 * @param {string} url - Location of the SVG file.
 * @param {string} containerId - id of the element that receives the SVG.
 * @returns {Promise<void>} Resolves once the container has been updated, or
 *   immediately if the container does not exist.
 */
async function loadSVG(url, containerId) {
  // Resolve the container once, up front. Previously it was looked up again
  // inside both `try` and `catch`, so a missing element raised a TypeError in
  // the error handler itself and escaped as an unhandled rejection, after a
  // fetch whose result could not be displayed anyway.
  const container = document.getElementById(containerId);
  if (!container) {
    console.error(`Error loading SVG: container "${containerId}" not found`);
    return;
  }

  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    const svgText = await response.text();
    container.innerHTML = enhanceSVGContent(svgText);
  } catch (error) {
    console.error('Error loading SVG:', error);
    container.innerHTML = '<p>Error loading SVG.</p>';
  }
}
122
+
123
// Load the SVG file (adjust the path if needed)
loadSVG('../assets/images/activation_recomputation.svg', 'svg-activation_recomputation');

// Set up event listeners to display a description of the hovered element
const svgContainer3 = document.getElementById('svg-activation_recomputation');

if (svgContainer3) {
  // Human-readable labels keyed by the `data-element-type` attribute values.
  // Built once here instead of on every mouseover event.
  const descriptions = {
    layer: 'Neural Network Layer',
    'layer-updated': 'Neural Network Layer (updated)',
    gradient: 'Gradient Update Layer',
    forward: 'Forward Pass Connection',
    backward: 'Backward Pass Connection',
    optimization: 'Optimization Step'
  };

  // Writes `text` into the info element. The element is looked up lazily on
  // each call because it may appear in the page after this script runs, and
  // a missing element is ignored rather than throwing on every hover.
  const showInfo = (text) => {
    const info = document.getElementById('svg-activation_recomputation-info');
    if (info) {
      info.textContent = text;
    }
  };

  // Listeners sit on the container (event delegation) because the SVG
  // markup is injected asynchronously by loadSVG.
  svgContainer3.addEventListener('mouseover', (event) => {
    const target = event.target;
    if (target.tagName.toLowerCase() === 'path' && target.hasAttribute('data-element-id')) {
      const elementId = target.getAttribute('data-element-id');
      const elementType = target.getAttribute('data-element-type');
      // Fall back to the raw type when there is no label for it.
      const description = descriptions[elementType] || elementType;
      showInfo(`Hovering over: ${description} (${elementId})`);
    }
  });

  svgContainer3.addEventListener('mouseout', () => {
    showInfo('Hover over the network elements to see their details');
  });
} else {
  // Without the container there is nothing to attach to. Report it instead
  // of failing with a TypeError on `null.addEventListener`.
  console.error('Element "svg-activation_recomputation" not found; hover details disabled');
}
assets/images/activation_recomputation.svg ADDED
assets/images/first_steps_memory_profile.js CHANGED
@@ -151,10 +151,10 @@ svgContainer2.addEventListener('mouseover', function(event) {
151
  'cache': 'Cache Operation'
152
  };
153
  const description = descriptions[elementType] || elementType;
154
- document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
155
  }
156
  });
157
 
158
  svgContainer2.addEventListener('mouseout', function() {
159
- document.getElementById('info').textContent = 'Hover over the network elements to see their details';
160
  });
 
151
  'cache': 'Cache Operation'
152
  };
153
  const description = descriptions[elementType] || elementType;
154
+ document.getElementById('svg-first_steps_memory_profile-info').textContent = `Hovering over: ${description} (${elementId})`;
155
  }
156
  });
157
 
158
  svgContainer2.addEventListener('mouseout', function() {
159
+ document.getElementById('svg-first_steps_memory_profile-info').textContent = 'Hover over the elements to see their details';
160
  });
assets/images/first_steps_simple_training.js CHANGED
@@ -140,10 +140,10 @@ svgContainer.addEventListener('mouseover', function (event) {
140
  optimization: 'Optimization Step'
141
  };
142
  const description = descriptions[elementType] || elementType;
143
- document.getElementById('info').textContent = `Hovering over: ${description} (${elementId})`;
144
  }
145
  });
146
 
147
  svgContainer.addEventListener('mouseout', function () {
148
- document.getElementById('info').textContent = 'Hover over the network elements to see their details';
149
  });
 
140
  optimization: 'Optimization Step'
141
  };
142
  const description = descriptions[elementType] || elementType;
143
+ document.getElementById('svg-first_steps_simple_training-info').textContent = `Hovering over: ${description} (${elementId})`;
144
  }
145
  });
146
 
147
  svgContainer.addEventListener('mouseout', function () {
148
+ document.getElementById('svg-first_steps_simple_training-info').textContent = 'Hover over the network elements to see their details';
149
  });
dist/assets/.DS_Store ADDED
Binary file (6.15 kB). View file
 
dist/assets/data/benchmarks/memusage_activations.html ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ <div> <script type="text/javascript">window.PlotlyConfig = {MathJaxConfig: 'local'};</script>
2
+ <script charset="utf-8" src="https://cdn.plot.ly/plotly-2.32.0.min.js"></script> <div id="a841d1b3-f0b4-43f7-90f9-bbb31dc90094" class="plotly-graph-div" style="height:400px; width:1200px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("a841d1b3-f0b4-43f7-90f9-bbb31dc90094")) { Plotly.newPlot( "a841d1b3-f0b4-43f7-90f9-bbb31dc90094", [{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125,26.213409423828125],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625,52.42681884765625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":true,"x":["1024","2048","4096","8192","16384"],"y":[9.25390625,28.5078125,97.015625,354.03125,1348.0625],"type":"bar","xaxis":"x","yaxis":"y"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625,244.44586181640625],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer 
states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125,488.8917236328125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[46.2578125,142.515625,485.03125,1770.0625,6740.125],"type":"bar","xaxis":"x2","yaxis":"y2"},{"legendgroup":"parameters","marker":{"color":"#4ea5b7"},"name":"parameters","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"gradients","marker":{"color":"#e889ab"},"name":"gradients","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625,1520.92822265625],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"optimizer states","marker":{"color":"#cec0fa"},"name":"optimizer states","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125,3041.8564453125],"type":"bar","xaxis":"x3","yaxis":"y3"},{"legendgroup":"activations","marker":{"color":"#e38a42"},"name":"activations","showlegend":false,"x":["1024","2048","4096","8192","16384"],"y":[145.703125,448.90625,1527.8125,5575.625,21231.25],"type":"bar","xaxis":"x3","yaxis":"y3"}], 
{"template":{"data":{"histogram2dcontour":[{"type":"histogram2dcontour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"choropleth":[{"type":"choropleth","colorbar":{"outlinewidth":0,"ticks":""}}],"histogram2d":[{"type":"histogram2d","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmap":[{"type":"heatmap","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"heatmapgl":[{"type":"heatmapgl","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"contourcarpet":[{"type":"contourcarpet","colorbar":{"outlinewidth":0,"ticks":""}}],"contour":[{"type":"contour","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"
],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"surface":[{"type":"surface","colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]}],"mesh3d":[{"type":"mesh3d","colorbar":{"outlinewidth":0,"ticks":""}}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"parcoords":[{"type":"parcoords","line":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolargl":[{"type":"scatterpolargl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"scattergeo":[{"type":"scattergeo","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterpolar":[{"type":"scatterpolar","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"scattergl":[{"type":"scattergl","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatter3d":[{"type":"scatter3d","line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattermapbox":[{"type":"scattermapbox","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scatterternary":[{"type":"scatterternary","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"scattercarpet":[{"type":"scattercarpet","marker":{"colorbar":{"outlinewidth":0,"ticks":""}}}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"white","linecolor":"white","minorgridcolor":"white","startlinec
olor":"#2a3f5f"},"type":"carpet"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}],"barpolar":[{"marker":{"line":{"color":"#E5ECF6","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"pie":[{"automargin":true,"type":"pie"}]},"layout":{"autotypenumbers":"strict","colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"hovermode":"closest","hoverlabel":{"align":"left"},"paper_bgcolor":"white","plot_bgcolor":"#E5ECF6","polar":{"bgcolor":"#E5ECF6","angularaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"radialaxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"ternary":{"bgcolor":"#E5ECF6","aaxis":{"gridcolor":"white","linecolor":"white","ticks":""},"baxis":{"gridcolor":"white","linecolor":"white","ticks":""},"caxis":{"gridcolor":"white","linecolor":"white","ticks":""}},"coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]]},"xaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":tr
ue,"zerolinewidth":2},"yaxis":{"gridcolor":"white","linecolor":"white","ticks":"","title":{"standoff":15},"zerolinecolor":"white","automargin":true,"zerolinewidth":2},"scene":{"xaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"yaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2},"zaxis":{"backgroundcolor":"#E5ECF6","gridcolor":"white","linecolor":"white","showbackground":true,"ticks":"","zerolinecolor":"white","gridwidth":2}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"geo":{"bgcolor":"white","landcolor":"#E5ECF6","subunitcolor":"white","showland":true,"showlakes":true,"lakecolor":"white"},"title":{"x":0.05},"mapbox":{"style":"light"}}},"xaxis":{"anchor":"y","domain":[0.0,0.2888888888888889],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis":{"anchor":"x","domain":[0.0,1.0],"title":{"text":"GB 
memory"},"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis2":{"anchor":"y2","domain":[0.35555555555555557,0.6444444444444445],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis2":{"anchor":"x2","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"xaxis3":{"anchor":"y3","domain":[0.7111111111111111,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"yaxis3":{"anchor":"x3","domain":[0.0,1.0],"showgrid":true,"gridwidth":1,"gridcolor":"LightGray"},"annotations":[{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-8B","x":0.14444444444444446,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-70B","x":0.5,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"},{"font":{"size":16},"showarrow":false,"text":"Meta-Llama-3.1-405B","x":0.8555555555555556,"xanchor":"center","xref":"paper","y":1.0,"yanchor":"bottom","yref":"paper"}],"barmode":"stack","width":1200,"height":400,"legend":{"title":{}}}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
dist/assets/images/activation_recomputation.js ADDED
@@ -0,0 +1 @@
 
 
1
/*
 * Hover interactivity for the "activation recomputation" figure.
 *
 * Fetches the SVG, tags its <path> elements by their literal fill / stroke
 * colours, injects hover styles, inlines the result into
 * #svg-activation_recomputation and reports the hovered element in
 * #svg-activation_recomputation-info.
 */

/**
 * Parse raw SVG markup, inject hover styles, tag paths for interaction and
 * return the re-serialized markup.
 *
 * Paths are classified purely by the exact value of their `fill` or `stroke`
 * attribute; a path matching both a fill and a stroke rule ends up with the
 * stroke classification because strokes are tagged last.
 *
 * @param {string} svgContent Raw SVG markup.
 * @returns {string} SVG markup with a leading <style> element,
 *   `data-element-id` / `data-element-type` attributes on matched paths, and
 *   a root element sized to 100% x 100%.
 */
function enhanceSVGContent(svgContent) {
  const svgDoc = new DOMParser().parseFromString(svgContent, "image/svg+xml");
  const root = svgDoc.documentElement;

  const styleElement = svgDoc.createElementNS("http://www.w3.org/2000/svg", "style");
  // The "layer-updated" hover rule previously declared `transform` twice, so
  // the scale() was overridden by the translate(); both are now combined in a
  // single declaration.
  styleElement.textContent = `
    path[data-element-type="layer"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="layer"]:hover {
      fill: #b197fc !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="layer-updated"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="layer-updated"]:hover {
      fill: rgb(103, 56, 244) !important;
      transform: scale(1.02) translate(0, -2px);
    }

    path[data-element-type="gradient"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="gradient"]:hover {
      fill: #f06595 !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="forward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="forward"]:hover {
      stroke: #0c8599 !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="backward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="backward"]:hover {
      stroke: #e8590c !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="optimization"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="optimization"]:hover {
      stroke: #087f5b !important;
      stroke-width: 4 !important;
    }
  `;
  root.insertBefore(styleElement, root.firstChild);

  // Assign "<type>-<index>" ids to every path matched by the selector.
  const tagPaths = (selector, type) => {
    svgDoc.querySelectorAll(selector).forEach((path, index) => {
      path.setAttribute("data-element-id", `${type}-${index}`);
      path.setAttribute("data-element-type", type);
    });
  };

  // Fills first, strokes second: this order decides which type wins when a
  // path matches more than one rule.
  tagPaths('path[fill="#d0bfff"]', "layer");
  tagPaths('path[fill="#9775fa"]', "layer-updated");
  tagPaths('path[fill="#f783ac"]', "gradient");

  const strokeTypes = {
    "#15aabf": "forward",
    "#fd7e14": "backward",
    "#099268": "optimization",
  };
  Object.entries(strokeTypes).forEach(([color, type]) => {
    tagPaths(`path[stroke="${color}"]`, type);
  });

  // Let the SVG scale to whatever box the host container provides.
  root.setAttribute("width", "100%");
  root.setAttribute("height", "100%");
  root.setAttribute("preserveAspectRatio", "xMidYMid meet");

  return new XMLSerializer().serializeToString(svgDoc);
}

/**
 * Fetch an SVG file, enhance it and inline it into the given container.
 *
 * Failures (network error, non-2xx status, missing container) are logged to
 * the console; when the container exists it shows a short error message
 * instead of the figure. The returned promise never rejects.
 *
 * @param {string} url URL of the SVG file.
 * @param {string} containerId id of the element that receives the markup.
 * @returns {Promise<void>}
 */
async function loadSVG(url, containerId) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    const markup = enhanceSVGContent(await response.text());

    const container = document.getElementById(containerId);
    if (!container) {
      throw new Error(`No element with id "${containerId}"`);
    }
    container.innerHTML = markup;
  } catch (error) {
    console.error("Error loading SVG:", error);
    // The container may be the very thing that is missing, so guard here too
    // rather than throwing from inside the error handler.
    const container = document.getElementById(containerId);
    if (container) {
      container.innerHTML = "<p>Error loading SVG.</p>";
    }
  }
}

loadSVG("../assets/images/activation_recomputation.svg", "svg-activation_recomputation");

const svgContainer3 = document.getElementById("svg-activation_recomputation");
// Only wire up hover reporting when the figure container is actually on the
// page; otherwise the script would abort with a TypeError.
if (svgContainer3) {
  const infoElementId = "svg-activation_recomputation-info";
  const typeLabels = {
    layer: "Neural Network Layer",
    "layer-updated": "Neural Network Layer (updated)",
    gradient: "Gradient Update Layer",
    forward: "Forward Pass Connection",
    backward: "Backward Pass Connection",
    optimization: "Optimization Step",
  };

  svgContainer3.addEventListener("mouseover", function (event) {
    const target = event.target;
    if (target.tagName.toLowerCase() !== "path" || !target.hasAttribute("data-element-id")) {
      return;
    }
    const info = document.getElementById(infoElementId);
    if (!info) {
      return;
    }
    const elementId = target.getAttribute("data-element-id");
    const elementType = target.getAttribute("data-element-type");
    // Fall back to the raw type when no human-readable label is defined.
    const label = typeLabels[elementType] || elementType;
    info.textContent = `Hovering over: ${label} (${elementId})`;
  });

  svgContainer3.addEventListener("mouseout", function () {
    const info = document.getElementById(infoElementId);
    if (info) {
      info.textContent = "Hover over the network elements to see their details";
    }
  });
}
dist/assets/images/activation_recomputation.svg ADDED
dist/assets/images/first_steps_memory_profile.js CHANGED
@@ -1 +1 @@
1
- function enhanceSVGContent2(e){const t=(new DOMParser).parseFromString(e,"image/svg+xml"),n=t.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n /* Memory Block (free memory) */\n path[data-element-type="memory-block"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block"]:hover {\n fill: #a5d6a7 !important; /* slightly darker than original */\n transform: translate(0, -2px);\n }\n\n /* Memory Block (updated) */\n path[data-element-type="memory-block-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="memory-block-updated"]:hover {\n fill: #81c784 !important;\n transform: scale(1.02) translate(0, -2px);\n }\n\n /* Stack Segment */\n path[data-element-type="stack"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="stack"]:hover {\n fill: #ffd54f !important;\n transform: translate(0, -2px);\n }\n\n /* Read Operation Arrow */\n path[data-element-type="read"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="read"]:hover {\n stroke: #1e88e5 !important;\n stroke-width: 4 !important;\n }\n\n /* Write Operation Arrow */\n path[data-element-type="write"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="write"]:hover {\n stroke: #d32f2f !important;\n stroke-width: 4 !important;\n }\n\n /* Cache Operation Arrow */\n path[data-element-type="cache"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="cache"]:hover {\n stroke: #fbc02d !important;\n stroke-width: 4 !important;\n }\n 
',t.documentElement.insertBefore(n,t.documentElement.firstChild),t.querySelectorAll('path[fill="#c8e6c9"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-${t}`),e.setAttribute("data-element-type","memory-block")})),t.querySelectorAll('path[fill="#a5d6a7"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`memory-block-updated-${t}`),e.setAttribute("data-element-type","memory-block-updated")})),t.querySelectorAll('path[fill="#ffe082"]').forEach(((e,t)=>{e.setAttribute("data-element-id",`stack-${t}`),e.setAttribute("data-element-type","stack")})),Object.entries({"#42a5f5":"read","#ef5350":"write","#ffca28":"cache"}).forEach((([e,n])=>{t.querySelectorAll(`path[stroke="${e}"]`).forEach(((e,t)=>{e.setAttribute("data-element-id",`${n}-${t}`),e.setAttribute("data-element-type",n)}))})),t.documentElement.setAttribute("width","100%"),t.documentElement.setAttribute("height","100%"),t.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(t)}async function loadSVG(e,t){try{console.log("Loading SVG from:",e);const n=await fetch(e);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const r=enhanceSVGContent2(await n.text());document.getElementById(t).innerHTML=r}catch(e){console.error("Error loading SVG:",e),document.getElementById(t).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_memory_profile.svg","svg-first_steps_memory_profile");const svgContainer2=document.getElementById("svg-first_steps_memory_profile");svgContainer2.addEventListener("mouseover",(function(e){const t=e.target;if("path"===t.tagName.toLowerCase()&&t.hasAttribute("data-element-id")){const e=t.getAttribute("data-element-id"),n=t.getAttribute("data-element-type"),r={"memory-block":"Memory Block","memory-block-updated":"Memory Block (updated)",stack:"Stack Segment",read:"Memory Read",write:"Memory Write",cache:"Cache Operation"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${r} (${e})`}})),svgContainer2.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the network elements to see their details"}));
 
1
/*
 * Hover interactivity for the "first steps memory profile" figure.
 *
 * Fetches the SVG, tags its <path> elements by their literal fill / stroke
 * colours, injects hover styles, inlines the result into
 * #svg-first_steps_memory_profile and reports the hovered element in
 * #svg-first_steps_memory_profile-info.
 */

/**
 * Parse raw SVG markup, inject hover styles, tag paths for interaction and
 * return the re-serialized markup.
 *
 * Paths are classified purely by the exact value of their `fill` or `stroke`
 * attribute; a path matching both a fill and a stroke rule ends up with the
 * stroke classification because strokes are tagged last.
 *
 * @param {string} svgContent Raw SVG markup.
 * @returns {string} SVG markup with a leading <style> element,
 *   `data-element-id` / `data-element-type` attributes on matched paths, and
 *   a root element sized to 100% x 100%.
 */
function enhanceSVGContent2(svgContent) {
  const svgDoc = new DOMParser().parseFromString(svgContent, "image/svg+xml");
  const root = svgDoc.documentElement;

  const styleElement = svgDoc.createElementNS("http://www.w3.org/2000/svg", "style");
  styleElement.textContent = `
    /* Memory Block (free memory) */
    path[data-element-type="memory-block"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="memory-block"]:hover {
      fill: #a5d6a7 !important; /* slightly darker than original */
      transform: translate(0, -2px);
    }

    /* Memory Block (updated) */
    path[data-element-type="memory-block-updated"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="memory-block-updated"]:hover {
      fill: #81c784 !important;
      transform: scale(1.02) translate(0, -2px);
    }

    /* Stack Segment */
    path[data-element-type="stack"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="stack"]:hover {
      fill: #ffd54f !important;
      transform: translate(0, -2px);
    }

    /* Read Operation Arrow */
    path[data-element-type="read"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="read"]:hover {
      stroke: #1e88e5 !important;
      stroke-width: 4 !important;
    }

    /* Write Operation Arrow */
    path[data-element-type="write"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="write"]:hover {
      stroke: #d32f2f !important;
      stroke-width: 4 !important;
    }

    /* Cache Operation Arrow */
    path[data-element-type="cache"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="cache"]:hover {
      stroke: #fbc02d !important;
      stroke-width: 4 !important;
    }
  `;
  root.insertBefore(styleElement, root.firstChild);

  // Assign "<type>-<index>" ids to every path matched by the selector.
  const tagPaths = (selector, type) => {
    svgDoc.querySelectorAll(selector).forEach((path, index) => {
      path.setAttribute("data-element-id", `${type}-${index}`);
      path.setAttribute("data-element-type", type);
    });
  };

  // Fills first, strokes second: this order decides which type wins when a
  // path matches more than one rule.
  tagPaths('path[fill="#c8e6c9"]', "memory-block");
  tagPaths('path[fill="#a5d6a7"]', "memory-block-updated");
  tagPaths('path[fill="#ffe082"]', "stack");

  const strokeTypes = {
    "#42a5f5": "read",
    "#ef5350": "write",
    "#ffca28": "cache",
  };
  Object.entries(strokeTypes).forEach(([color, type]) => {
    tagPaths(`path[stroke="${color}"]`, type);
  });

  // Let the SVG scale to whatever box the host container provides.
  root.setAttribute("width", "100%");
  root.setAttribute("height", "100%");
  root.setAttribute("preserveAspectRatio", "xMidYMid meet");

  return new XMLSerializer().serializeToString(svgDoc);
}

/**
 * Fetch an SVG file, enhance it and inline it into the given container.
 *
 * Failures (network error, non-2xx status, missing container) are logged to
 * the console; when the container exists it shows a short error message
 * instead of the figure. The returned promise never rejects.
 *
 * @param {string} url URL of the SVG file.
 * @param {string} containerId id of the element that receives the markup.
 * @returns {Promise<void>}
 */
async function loadSVG(url, containerId) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    const markup = enhanceSVGContent2(await response.text());

    const container = document.getElementById(containerId);
    if (!container) {
      throw new Error(`No element with id "${containerId}"`);
    }
    container.innerHTML = markup;
  } catch (error) {
    console.error("Error loading SVG:", error);
    // The container may be the very thing that is missing, so guard here too
    // rather than throwing from inside the error handler.
    const container = document.getElementById(containerId);
    if (container) {
      container.innerHTML = "<p>Error loading SVG.</p>";
    }
  }
}

loadSVG("../assets/images/first_steps_memory_profile.svg", "svg-first_steps_memory_profile");

const svgContainer2 = document.getElementById("svg-first_steps_memory_profile");
// Only wire up hover reporting when the figure container is actually on the
// page; otherwise the script would abort with a TypeError.
if (svgContainer2) {
  const infoElementId = "svg-first_steps_memory_profile-info";
  const typeLabels = {
    "memory-block": "Memory Block",
    "memory-block-updated": "Memory Block (updated)",
    stack: "Stack Segment",
    read: "Memory Read",
    write: "Memory Write",
    cache: "Cache Operation",
  };

  svgContainer2.addEventListener("mouseover", function (event) {
    const target = event.target;
    if (target.tagName.toLowerCase() !== "path" || !target.hasAttribute("data-element-id")) {
      return;
    }
    const info = document.getElementById(infoElementId);
    if (!info) {
      return;
    }
    const elementId = target.getAttribute("data-element-id");
    const elementType = target.getAttribute("data-element-type");
    // Fall back to the raw type when no human-readable label is defined.
    const label = typeLabels[elementType] || elementType;
    info.textContent = `Hovering over: ${label} (${elementId})`;
  });

  svgContainer2.addEventListener("mouseout", function () {
    const info = document.getElementById(infoElementId);
    if (info) {
      info.textContent = "Hover over the elements to see their details";
    }
  });
}
dist/assets/images/first_steps_simple_training.js CHANGED
@@ -1 +1 @@
1
- function enhanceSVGContent(t){const e=(new DOMParser).parseFromString(t,"image/svg+xml"),n=e.createElementNS("http://www.w3.org/2000/svg","style");return n.textContent='\n path[data-element-type="layer"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="layer"]:hover {\n fill: #b197fc !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="layer-updated"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n \n path[data-element-type="layer-updated"]:hover {\n fill:rgb(103, 56, 244) !important;\n transform: scale(1.02);\n transform: translate(0, -2px);\n }\n\n path[data-element-type="gradient"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="gradient"]:hover {\n fill: #f06595 !important;\n transform: translate(0, -2px);\n }\n\n path[data-element-type="forward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="forward"]:hover {\n stroke: #0c8599 !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="backward"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="backward"]:hover {\n stroke: #e8590c !important;\n stroke-width: 4 !important;\n }\n\n path[data-element-type="optimization"] {\n transition: all 0.3s;\n cursor: pointer;\n }\n path[data-element-type="optimization"]:hover {\n stroke: #087f5b !important;\n stroke-width: 4 !important;\n 
}\n',e.documentElement.insertBefore(n,e.documentElement.firstChild),e.querySelectorAll('path[fill="#d0bfff"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-${e}`),t.setAttribute("data-element-type","layer")})),e.querySelectorAll('path[fill="#9775fa"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`layer-updated-${e}`),t.setAttribute("data-element-type","layer-updated")})),e.querySelectorAll('path[fill="#f783ac"]').forEach(((t,e)=>{t.setAttribute("data-element-id",`gradient-${e}`),t.setAttribute("data-element-type","gradient")})),Object.entries({"#15aabf":"forward","#fd7e14":"backward","#099268":"optimization"}).forEach((([t,n])=>{e.querySelectorAll(`path[stroke="${t}"]`).forEach(((t,e)=>{t.setAttribute("data-element-id",`${n}-${e}`),t.setAttribute("data-element-type",n)}))})),e.documentElement.setAttribute("width","100%"),e.documentElement.setAttribute("height","100%"),e.documentElement.setAttribute("preserveAspectRatio","xMidYMid meet"),(new XMLSerializer).serializeToString(e)}async function loadSVG(t,e){try{const n=await fetch(t);if(!n.ok)throw new Error(`HTTP error! 
Status: ${n.status}`);const a=enhanceSVGContent(await n.text());document.getElementById(e).innerHTML=a}catch(t){console.error("Error loading SVG:",t),document.getElementById(e).innerHTML="<p>Error loading SVG.</p>"}}loadSVG("../assets/images/first_steps_simple_training.svg","svg-first_steps_simple_training");const svgContainer=document.getElementById("svg-first_steps_simple_training");svgContainer.addEventListener("mouseover",(function(t){const e=t.target;if("path"===e.tagName.toLowerCase()&&e.hasAttribute("data-element-id")){const t=e.getAttribute("data-element-id"),n=e.getAttribute("data-element-type"),a={layer:"Neural Network Layer","layer-updated":"Neural Network Layer (updated)",gradient:"Gradient Update Layer",forward:"Forward Pass Connection",backward:"Backward Pass Connection",optimization:"Optimization Step"}[n]||n;document.getElementById("info").textContent=`Hovering over: ${a} (${t})`}})),svgContainer.addEventListener("mouseout",(function(){document.getElementById("info").textContent="Hover over the network elements to see their details"}));
 
1
/*
 * Hover interactivity for the "first steps simple training" figure.
 *
 * Fetches the SVG, tags its <path> elements by their literal fill / stroke
 * colours, injects hover styles, inlines the result into
 * #svg-first_steps_simple_training and reports the hovered element in
 * #svg-first_steps_simple_training-info.
 */

/**
 * Parse raw SVG markup, inject hover styles, tag paths for interaction and
 * return the re-serialized markup.
 *
 * Paths are classified purely by the exact value of their `fill` or `stroke`
 * attribute; a path matching both a fill and a stroke rule ends up with the
 * stroke classification because strokes are tagged last.
 *
 * @param {string} svgContent Raw SVG markup.
 * @returns {string} SVG markup with a leading <style> element,
 *   `data-element-id` / `data-element-type` attributes on matched paths, and
 *   a root element sized to 100% x 100%.
 */
function enhanceSVGContent(svgContent) {
  const svgDoc = new DOMParser().parseFromString(svgContent, "image/svg+xml");
  const root = svgDoc.documentElement;

  const styleElement = svgDoc.createElementNS("http://www.w3.org/2000/svg", "style");
  // The "layer-updated" hover rule previously declared `transform` twice, so
  // the scale() was overridden by the translate(); both are now combined in a
  // single declaration.
  styleElement.textContent = `
    path[data-element-type="layer"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="layer"]:hover {
      fill: #b197fc !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="layer-updated"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="layer-updated"]:hover {
      fill: rgb(103, 56, 244) !important;
      transform: scale(1.02) translate(0, -2px);
    }

    path[data-element-type="gradient"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="gradient"]:hover {
      fill: #f06595 !important;
      transform: translate(0, -2px);
    }

    path[data-element-type="forward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="forward"]:hover {
      stroke: #0c8599 !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="backward"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="backward"]:hover {
      stroke: #e8590c !important;
      stroke-width: 4 !important;
    }

    path[data-element-type="optimization"] {
      transition: all 0.3s;
      cursor: pointer;
    }
    path[data-element-type="optimization"]:hover {
      stroke: #087f5b !important;
      stroke-width: 4 !important;
    }
  `;
  root.insertBefore(styleElement, root.firstChild);

  // Assign "<type>-<index>" ids to every path matched by the selector.
  const tagPaths = (selector, type) => {
    svgDoc.querySelectorAll(selector).forEach((path, index) => {
      path.setAttribute("data-element-id", `${type}-${index}`);
      path.setAttribute("data-element-type", type);
    });
  };

  // Fills first, strokes second: this order decides which type wins when a
  // path matches more than one rule.
  tagPaths('path[fill="#d0bfff"]', "layer");
  tagPaths('path[fill="#9775fa"]', "layer-updated");
  tagPaths('path[fill="#f783ac"]', "gradient");

  const strokeTypes = {
    "#15aabf": "forward",
    "#fd7e14": "backward",
    "#099268": "optimization",
  };
  Object.entries(strokeTypes).forEach(([color, type]) => {
    tagPaths(`path[stroke="${color}"]`, type);
  });

  // Let the SVG scale to whatever box the host container provides.
  root.setAttribute("width", "100%");
  root.setAttribute("height", "100%");
  root.setAttribute("preserveAspectRatio", "xMidYMid meet");

  return new XMLSerializer().serializeToString(svgDoc);
}

/**
 * Fetch an SVG file, enhance it and inline it into the given container.
 *
 * Failures (network error, non-2xx status, missing container) are logged to
 * the console; when the container exists it shows a short error message
 * instead of the figure. The returned promise never rejects.
 *
 * @param {string} url URL of the SVG file.
 * @param {string} containerId id of the element that receives the markup.
 * @returns {Promise<void>}
 */
async function loadSVG(url, containerId) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    const markup = enhanceSVGContent(await response.text());

    const container = document.getElementById(containerId);
    if (!container) {
      throw new Error(`No element with id "${containerId}"`);
    }
    container.innerHTML = markup;
  } catch (error) {
    console.error("Error loading SVG:", error);
    // The container may be the very thing that is missing, so guard here too
    // rather than throwing from inside the error handler.
    const container = document.getElementById(containerId);
    if (container) {
      container.innerHTML = "<p>Error loading SVG.</p>";
    }
  }
}

loadSVG("../assets/images/first_steps_simple_training.svg", "svg-first_steps_simple_training");

const svgContainer = document.getElementById("svg-first_steps_simple_training");
// Only wire up hover reporting when the figure container is actually on the
// page; otherwise the script would abort with a TypeError.
if (svgContainer) {
  const infoElementId = "svg-first_steps_simple_training-info";
  const typeLabels = {
    layer: "Neural Network Layer",
    "layer-updated": "Neural Network Layer (updated)",
    gradient: "Gradient Update Layer",
    forward: "Forward Pass Connection",
    backward: "Backward Pass Connection",
    optimization: "Optimization Step",
  };

  svgContainer.addEventListener("mouseover", function (event) {
    const target = event.target;
    if (target.tagName.toLowerCase() !== "path" || !target.hasAttribute("data-element-id")) {
      return;
    }
    const info = document.getElementById(infoElementId);
    if (!info) {
      return;
    }
    const elementId = target.getAttribute("data-element-id");
    const elementType = target.getAttribute("data-element-type");
    // Fall back to the raw type when no human-readable label is defined.
    const label = typeLabels[elementType] || elementType;
    info.textContent = `Hovering over: ${label} (${elementId})`;
  });

  svgContainer.addEventListener("mouseout", function () {
    const info = document.getElementById(infoElementId);
    if (info) {
      info.textContent = "Hover over the network elements to see their details";
    }
  });
}
dist/index.html CHANGED
@@ -234,7 +234,7 @@
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
- <div class="info" id="info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
@@ -297,8 +297,14 @@
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
 
300
  <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
  <script src="../assets/images/first_steps_memory_profile.js"></script>-->
 
 
 
 
 
302
 
303
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
304
 
@@ -353,7 +359,7 @@
353
  <div class="note-box">
354
  <p class="note-box-title">📝 Note</p>
355
  <p class="note-box-content">
356
- Some librarie store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
357
 
358
  </p>
359
  </div>
@@ -415,7 +421,15 @@
415
 
416
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
417
 
418
- <p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p>
 
 
 
 
 
 
 
 
419
 
420
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
421
 
@@ -429,7 +443,9 @@
429
 
430
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of the activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
431
 
432
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
 
 
433
 
434
  <p>There are several strategies to select key activations to store:</p>
435
 
 
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
+ <div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
 
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
301
  <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
302
  <script src="../assets/images/first_steps_memory_profile.js"></script>-->
304
+ <div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
305
+ <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
306
+ <script src="../assets/images/first_steps_memory_profile.js"></script>
308
 
309
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
310
 
 
359
  <div class="note-box">
360
  <p class="note-box-title">📝 Note</p>
361
  <p class="note-box-content">
362
+ Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
363
 
364
  </p>
365
  </div>
 
421
 
422
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
423
 
424
+ <!-- <p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p> -->
425
+ <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
426
+ <script>
427
+ window.addEventListener('load', function() {
428
+ const frame = document.getElementById('plotFrame2');
429
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px';
430
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px';
431
+ });
432
+ </script>
433
 
434
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
435
 
 
443
 
444
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of the activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
445
 
446
+ <div class="svg-container" id="svg-activation_recomputation"> </div>
447
+ <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
448
+ <script src="../assets/images/activation_recomputation.js"></script>
449
 
450
  <p>There are several strategies to select key activations to store:</p>
451
 
src/index.html CHANGED
@@ -234,7 +234,7 @@
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
- <div class="info" id="info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
@@ -297,8 +297,14 @@
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
 
300
  <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
301
  <script src="../assets/images/first_steps_memory_profile.js"></script>-->
 
 
 
 
 
302
 
303
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
304
 
@@ -353,7 +359,7 @@
353
  <div class="note-box">
354
  <p class="note-box-title">📝 Note</p>
355
  <p class="note-box-content">
356
- Some librarie store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
357
 
358
  </p>
359
  </div>
@@ -415,7 +421,14 @@
415
 
416
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
417
 
418
- <p><img alt="memusage_activations.svg" src="/assets/images/memusage_activations.svg" /></p>
 
 
 
 
 
 
 
419
 
420
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
421
 
@@ -429,7 +442,13 @@
429
 
430
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
431
 
 
432
  <p><img alt="image.png" src="/assets/images/activation_recomputation.png" /></p>
 
 
 
 
 
433
 
434
  <p>There are several strategies to select key activations to store:</p>
435
 
 
234
  <p>It looks generally like this: </p>
235
 
236
  <div class="svg-container" id="svg-first_steps_simple_training"> </div>
237
+ <div class="info" id="svg-first_steps_simple_training-info">Hover over the network elements to see their details</div>
238
  <script src="../assets/images/first_steps_simple_training.js"></script>
239
 
240
  <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>
 
297
 
298
  <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
299
 
300
+ <<<<<<< HEAD
301
  <!--<div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
302
  <script src="../assets/images/first_steps_memory_profile.js"></script>-->
303
+ =======
304
+ <div class="svg-container l-body-outset" id="svg-first_steps_memory_profile"> </div>
305
+ <div class="info" id="svg-first_steps_memory_profile-info">Hover over the elements to see their details</div>
306
+ <script src="../assets/images/first_steps_memory_profile.js"></script>
307
+ >>>>>>> a1429a9 (update)
308
 
309
  <iframe id="plotFrame" src="assets/data/benchmarks/memory-profile.html" height="520" width="1000" scrolling="no" frameborder="0"></iframe>
310
 
 
359
  <div class="note-box">
360
  <p class="note-box-title">📝 Note</p>
361
  <p class="note-box-content">
362
+ Some libraries store grads in fp32 which would require an additional <d-math>m_{params\_fp32} = 4 * N</d-math> memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
363
 
364
  </p>
365
  </div>
 
421
 
422
  <p>An interesting observation here is how the memory is not static for a given model but it scales linearly with both the sequence length and batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
423
 
424
+ <iframe class="l-body-outset" id="plotFrame2" src="assets/data/benchmarks/memusage_activations.html" width="90%" scrolling="no" frameborder="0"></iframe>
425
+ <script>
426
+ window.addEventListener('load', function() { // resize the activations plot iframe when the window's load event fires
427
+ const frame = document.getElementById('plotFrame2'); // the iframe embedding memusage_activations.html
428
+ frame.style.height = frame.contentWindow.document.documentElement.scrollHeight + 'px'; // height := scroll height of the embedded document's root element; NOTE(review): reading contentWindow.document assumes the iframe is same-origin and loaded — confirm
429
+ frame.style.width = frame.contentWindow.document.documentElement.scrollWidth + 'px'; // width := scroll width of the same root element; NOTE(review): this inline width presumably takes precedence over the iframe's width="90%" attribute — confirm that is intended
430
+ });
431
+ </script>
432
 
433
  <p>This graph tells a striking story: for short sequences (or similarly for small batch sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>
434
 
 
442
 
443
  <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
444
 
445
+ <<<<<<< HEAD
446
  <p><img alt="image.png" src="/assets/images/activation_recomputation.png" /></p>
447
+ =======
448
+ <div class="svg-container" id="svg-activation_recomputation"> </div>
449
+ <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
450
+ <script src="../assets/images/activation_recomputation.js"></script>
451
+ >>>>>>> a1429a9 (update)
452
 
453
  <p>There are several strategies to select key activations to store:</p>
454