import gradio as gr
import pandas as pd

col = ['Layer number', 'Hidden size', 'FFN Hidden size', 'Sequence length', 'Head number',
       'Group number', 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8',
       'Model parameters', 'Model_states', 'Activation', 'Total']

# # global data
# table_data = pd.DataFrame(columns=col)


def Get_GigaByte(memory):
    return memory / 1024**3


def Get_BillionParameter(parameter):
    return parameter / 1000**3


# model states:
def Compute_Parameters_input(hidden_size, vocab_size, tp):
    num_parameters_word_embedding = hidden_size * vocab_size / tp
    num_parameters_position_embedding = 0  # args.hidden_size * args.seq_length
    return num_parameters_word_embedding + num_parameters_position_embedding


def Compute_Parameters_output(hidden_size, vocab_size, tp):
    num_parameters_output_layernorm = 2 * hidden_size
    num_parameters_output_embedding = 0  # due to sharedWordEmbedding
    return num_parameters_output_layernorm + num_parameters_output_embedding


def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp):
    # attention:
    # layernorm: 2h
    num_parameters_attention = 2 * hidden_size
    # QKV weight: 3h*h/tp, bias: 3h/tp
    # output linear weight: h*h/tp, bias: h
    num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
    num_parameters_attention_KV_weight = 2 * kv_hidden_size * hidden_size / tp
    num_parameters_attention_Linear_weight = hidden_size * hidden_size / tp
    num_parameters_attention += (num_parameters_attention_Q_weight
                                 + num_parameters_attention_KV_weight
                                 + num_parameters_attention_Linear_weight)
    if is_bias == "True":
        num_parameters_attention += (hidden_size + 2 * kv_hidden_size) / tp + hidden_size

    return num_parameters_attention


def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
    # MLP:
    # layernorm: 2h
    num_parameters_mlp = 2 * hidden_size
    # mlp1 weight: h*ffn/tp, bias: ffn/tp
    # mlp2 weight: ffn*h/tp, bias: h
    if act_func == "True":
        num_parameters_mlp += hidden_size * ffn_size * 3 / tp
        if is_bias == "True":
            num_parameters_mlp += ffn_size * 2 / tp + hidden_size
    else:
        num_parameters_mlp += hidden_size * ffn_size * 2 / tp
        if is_bias == "True":
            num_parameters_mlp += ffn_size / tp + hidden_size

    return num_parameters_mlp


def Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num,
                       is_bias, act_func, head_num, tp, pp):
    if is_group_query == "False":
        group_query_num = head_num
    kv_hidden_size = hidden_size / head_num * group_query_num

    # input part
    num_parameters_input = Compute_Parameters_input(hidden_size, vocab_size, tp)

    # middle layers part
    num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, tp)
    num_parameters_mlp = Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp)
    num_parameters_in_single_layer = num_parameters_attention + num_parameters_mlp
    num_parameters_in_total_layers = num_parameters_in_single_layer * layer_num / pp

    # output part
    parameters_output = Compute_Parameters_output(hidden_size, vocab_size, tp)

    if pp == 1:
        num_parameters_total = (
            num_parameters_input
            + num_parameters_in_total_layers
            + parameters_output  # num_parameters_output_layernorm
        )
    else:
        num_parameters_total = (
            num_parameters_input
            + num_parameters_in_total_layers
        )

    return num_parameters_total
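
# Illustrative sketch: a parameter count for a hypothetical 7B-scale configuration
# (the config values below are assumptions for illustration, not defaults of this tool).
# The boolean-like flags are the "True"/"False" strings that the comparisons above expect.
#
#   params = Compute_Parameters(vocab_size=32000, layer_num=32, hidden_size=4096,
#                               ffn_size=11008, is_group_query="False", group_query_num=32,
#                               is_bias="False", act_func="True", head_num=32, tp=1, pp=1)
#   print(f"~{Get_BillionParameter(params):.2f} B parameters")
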
def Compute_Weight(numParametersTotal, is_fp8, is_fp8_init):
    if is_fp8 == "False":
        weight_memory = 2 * numParametersTotal
    elif is_fp8_init == "False":
        weight_memory = 4 * numParametersTotal
    else:
        weight_memory = 2 * numParametersTotal

    return weight_memory


def Compute_Gradient(numParametersTotal, g_ty):
    if g_ty == "FP32":
        gradient_memory = 4 * numParametersTotal
    elif g_ty == "BF16":
        gradient_memory = 2 * numParametersTotal

    return gradient_memory


def Compute_Optimizer_states(numParametersTotal, o_ty, is_dist_opt, dp, cp):
    if o_ty == "FP32":
        optimizer_memory = 4 * 2 * numParametersTotal
    elif o_ty == "BF16":
        optimizer_memory = 2 * 2 * numParametersTotal

    if is_dist_opt == "True":
        optimizer_memory = optimizer_memory / (dp * cp)

    return optimizer_memory


def Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp):
    master_weight_memory = 4 * numParametersTotal
    if is_dist_opt == "True":
        master_weight_memory = master_weight_memory / (dp * cp)

    return master_weight_memory


def Compute_Model_states(vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query, group_query_num,
                         is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty):
    numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query,
                                            group_query_num, is_bias, act_func, head_num, tp, pp)

    weight_memory = Compute_Weight(numParametersTotal, is_fp8, is_fp8_init)
    gradient_memory = Compute_Gradient(numParametersTotal, g_ty)
    optimizer_memory = Compute_Optimizer_states(numParametersTotal, o_ty, is_dist_opt, dp, cp)
    master_weight_memory = Compute_Master_weight(numParametersTotal, is_dist_opt, dp, cp)

    return numParametersTotal, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, \
        weight_memory + gradient_memory + optimizer_memory + master_weight_memory
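
# Illustrative sketch: model-state memory for the same hypothetical configuration.
# With a BF16 model, FP32 gradients and FP32 optimizer states (and no distributed optimizer)
# this comes to 2 + 4 + 8 + 4 = 18 bytes per parameter, as summarized in the formula string
# further below; FP8 training without FP8 init keeps a 4-byte weight copy (20 bytes in total).
#
#   _, weight, grad, opt, master, total = Compute_Model_states(
#       vocab_size=32000, layer_num=32, hidden_size=4096, ffn_size=11008, head_num=32,
#       is_group_query="False", group_query_num=32, is_bias="False", act_func="True",
#       dp=1, tp=1, pp=1, cp=1, is_dist_opt="False", is_fp8="False", is_fp8_init="False",
#       g_ty="FP32", o_ty="FP32")
#   print(f"model states ~{Get_GigaByte(total):.1f} GB per GPU")
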
# activation memory:
def compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size, kv_hidden_size, is_sp, tp):
    # LN 2bsh
    activation_mem_attn_ln = seq_length * b * hidden_size * 2
    if is_sp == "False":
        activation_mem_attn_ln *= tp

    # attention input X, qkv 2bsh/1bsh
    activation_mem_attn_qkv = seq_length * b * hidden_size * activation_dtype
    if is_sp == "False":
        activation_mem_attn_qkv *= tp

    # attention q 2bsh
    activation_mem_attn_q = seq_length * b * hidden_size * 2
    # attention k and v 4bsh
    activation_mem_attn_kv = seq_length * b * kv_hidden_size * 2 * 2
    # attention proj input 2bsh/1bsh
    activation_mem_attn_proj = seq_length * b * hidden_size * activation_dtype

    # dropout bsh
    activation_mem_attn_dropout = seq_length * b * hidden_size
    if is_sp == "False":
        activation_mem_attn_dropout *= tp

    # bf16: 2+2+2+4+2+1=13bsh
    # fp8: 2+1+2+4+1+1=11bsh
    activation_memory_attn = (
        activation_mem_attn_ln
        + activation_mem_attn_qkv
        + activation_mem_attn_q
        + activation_mem_attn_kv
        + activation_mem_attn_proj
        + activation_mem_attn_dropout
    )
    return activation_memory_attn


def compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size, ffn_size, act_func, is_sp, tp):
    # LN 2bsh
    activation_mem_mlp_ln = seq_length * b * hidden_size * 2
    if is_sp == "False":
        activation_mem_mlp_ln *= tp

    # FC1 2bsh/1bsh
    activation_mem_mlp_fc1 = seq_length * b * hidden_size * activation_dtype
    if is_sp == "False":
        activation_mem_mlp_fc1 *= tp

    # Act 8bsh
    if act_func == "Swiglu":
        activation_mem_mlp_act = seq_length * b * ffn_size * 2 * 2
    else:
        activation_mem_mlp_act = seq_length * b * ffn_size * 2

    # FC2 8bsh/4bsh
    activation_mem_mlp_fc2 = seq_length * b * ffn_size * activation_dtype

    # dropout bsh
    activation_mem_mlp_dropout = seq_length * b * hidden_size
    if is_sp == "False":
        activation_mem_mlp_dropout *= tp

    # bf16: 2+2+8+8+1=21
    # fp8: 2+1+8+4+1=16
    activation_memory_mlp = (
        activation_mem_mlp_ln
        + activation_mem_mlp_fc1
        + activation_mem_mlp_act
        + activation_mem_mlp_fc2
        + activation_mem_mlp_dropout
    )
    return activation_memory_mlp


def compute_activation_memory_input(seq_length, b, hidden_size, pp):
    # embedding + Dropout
    return 8 * seq_length * b * pp + seq_length * b * hidden_size * pp


def compute_activation_memory_output(seq_length, b, hidden_size, vocab_size):
    # Inputs to output layer and CE loss (bf16, fp32 * 2).
    return 2 * seq_length * b * hidden_size + (2 + 4 + 4) * seq_length * b * vocab_size


def compute_activation_memory_pp(activation_memory, is_ip, vp, pp, num_microbatches):
    # Multiply by interleaved PP memory factor.
    if is_ip == "True":
        interleaved_schedule_memory_penalty = 1 + (pp - 1) / (pp * vp)
        activation_memory *= interleaved_schedule_memory_penalty

    # If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size,
    # so discount accordingly.
    if is_ip == "False" and pp > 1:
        if num_microbatches > 1:
            activation_memory *= min(1, num_microbatches / pp)

    return activation_memory


def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size,
                              act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, is_ip, vp):
    # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
    # We are trying to compute the maximum activation footprint, so all calculations in this function
    # are for the first pipeline stage.

    # activation dataType
    if is_fp8 == "False":
        activation_dtype = 2
    else:
        activation_dtype = 1

    # kv_hidden_size
    if is_group_query == "False":
        group_query_num = head_num
    kv_hidden_size = hidden_size / head_num * group_query_num

    activation_memory_attn = compute_activation_memory_attention(activation_dtype, seq_length, b, hidden_size,
                                                                 kv_hidden_size, is_sp, tp)
    activation_memory_mlp = compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size, ffn_size,
                                                          act_func, is_sp, tp)
    activation_memory = activation_memory_attn + activation_memory_mlp
    activation_memory *= layer_num

    # Now add activation memory required for input embeddings, last LayerNorm and output layer.
    # Input to embedding (pp_size microbatches in flight).
    activation_memory_input = compute_activation_memory_input(seq_length, b, hidden_size, pp)
    activation_memory += activation_memory_input

    # get num_microbatches
    num_microbatches = b_global / b / dp / cp
    activation_memory = compute_activation_memory_pp(activation_memory, is_ip, vp, pp, num_microbatches)

    if pp == 1:
        # Inputs to output layer and CE loss (fp32).
        activation_memory_output = compute_activation_memory_output(seq_length, b, hidden_size, vocab_size)
        activation_memory += activation_memory_output
    elif pp > 1:
        # Sendrecv memory
        activation_memory += seq_length * b * hidden_size * 2

    # Activation memory is partitioned by TP size due to tensor and sequence model parallelism.
    return activation_memory / tp / cp
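
# Illustrative sketch: activation memory for the first pipeline stage of the same hypothetical
# configuration (sequence length 4096, micro batch 1, global batch 128, no model parallelism;
# all values are assumptions). Note that act_func is compared against "Swiglu" in the MLP
# activation path above, while the parameter-count path compares it against "True".
#
#   act = compute_activation_memory(
#       vocab_size=32000, seq_length=4096, layer_num=32, b=1, b_global=128, head_num=32,
#       hidden_size=4096, ffn_size=11008, act_func="Swiglu", is_fp8="False", is_sp="True",
#       is_group_query="False", group_query_num=32, tp=1, pp=1, dp=1, cp=1,
#       is_ip="False", vp=1)
#   print(f"activation ~{Get_GigaByte(act):.1f} GB per GPU")
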
# compute_btn.click.function
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query,
                             group_query_num, is_bias, act_func, dp, tp, pp, cp, is_sp, is_ip, vp, is_dist_opt,
                             b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
    # get model states
    numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = \
        Compute_Model_states(vocab_size, layer_num, hidden_size, ffn_size, head_num, is_group_query,
                             group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8,
                             is_fp8_init, g_ty, o_ty)

    # get activation memory
    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num,
                                                  hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query,
                                                  group_query_num, tp, pp, dp, cp, is_ip, vp)

    # get model parameters
    numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query,
                                            group_query_num, is_bias, act_func, head_num, 1, 1)

    # get gpu number
    gpu_num = dp * tp * pp * cp

    # get B/GB
    numParametersTotal = round(Get_BillionParameter(numParametersTotal), 3)
    numParameters = round(Get_BillionParameter(numParameters), 3)
    model_states_memory = round(Get_GigaByte(model_states_memory), 3)
    activation_memory = round(Get_GigaByte(activation_memory), 3)
    Total = round(model_states_memory + activation_memory, 3)

    # record
    new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num,
                             dp, tp, pp, cp, gpu_num, b, is_fp8, numParametersTotal, model_states_memory,
                             activation_memory, Total]], columns=col)
    if count == 1:
        record_df = new_row
    else:
        # use pd.concat to extend the record table (DataFrame.append was removed from pandas; _append is private)
        record_df = pd.concat([record_df, new_row], ignore_index=True)
    count = count + 1

    # return str(gpu_num), str(model_states) + " GB", str(activation) + " GB", str(total) + " GB", table_data
    return f"""
    GPU numbers = {str(gpu_num)}, \n
    Total model parameters = {str(numParametersTotal)} B, \n
    Model parameters = {str(numParameters)} B, \n
    Model_states = {str(model_states_memory)} GB, \n
    Activation = {str(activation_memory)} GB, \n
    Total memory consumption = {str(Total)} GB \n
    """, record_df, count


def generate_csv(record_df):
    # Save the DataFrame as a CSV file
    csv_filename = "data.csv"
    record_df.to_csv(csv_filename, index=False)
    # Return the CSV file path
    return csv_filename
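
# Illustrative sketch: calling the click handler directly (all values are assumptions).
# It returns the Markdown summary shown in the UI, the updated record table, and the
# incremented counter; generate_csv then writes that table to data.csv and returns its path.
#
#   summary, df, n = Compute_ALL_Model_memory(
#       vocab_size=32000, layer_num=32, hidden_size=4096, ffn_size=11008, seq_length=4096,
#       head_num=32, is_group_query="False", group_query_num=32, is_bias="False",
#       act_func="Swiglu", dp=8, tp=1, pp=1, cp=1, is_sp="True", is_ip="False", vp=1,
#       is_dist_opt="True", b=1, b_global=128, is_fp8="False", is_fp8_init="False",
#       g_ty="FP32", o_ty="FP32", record_df=pd.DataFrame(columns=col), count=1)
#   csv_path = generate_csv(df)
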
# formula string
formula = r"""
> **Note**🔑: In this formula, we assume LLM training with FP32 Gradient and Optimizer state, and bias = False, Zero1 = False, SP = True.

$$
P_{input} = \frac{HV}{tp}, \quad P_{output} = 2H \\\\
P_{attn} = 2H + \frac{2H^2 + 2H_{KV} \times H}{tp}, \quad P_{MLP} = 2H + \\begin{cases} \frac{3H \times FFN}{tp}, & \text{if }GLU\text{ is True} \\\\ \frac{2H \times FFN}{tp}, & \text{if }GLU\text{ is False} \\end{cases} \\\\
P_{middle} = \frac{(P_{attn} + P_{MLP}) \times L}{pp} \\\\
P = P_{input} + P_{middle} + \\begin{cases} P_{output}, & \text{if }pp = 1 \\\\ 0, & \text{if }pp > 1 \\end{cases} \\\\
{Total\ Model\ parameters} = \\begin{cases} P, & \text{set tp = 1, pp = 1} \\\\ 2HV + 2H + (4H + 2H^2 + 2H_{KV} \times H + 3FFN \times H) \times L, & \text{general formula} \\end{cases} \\\\
{Model\ states} = {Model\ weight} + {Gradient} + {Optimizer\ state} + {Master\ weight} = \\begin{cases} 18P, & \text{BF16 training} \\\\ 18P, & \text{FP8 training with FP8 Init} \\\\ 20P, & \text{FP8 training w/o FP8 Init} \\end{cases} \\\\
$$

***

$$
A_{input} = (8SB + SBH) \times pp, \quad A_{output} = 2SBH + \\begin{cases} 10SBV, & \text{if }pp\text{ = 1} \\\\ 0, & \text{if }pp\text{ > 1} \\end{cases} \\\\
A_{attn} = 5SBH + 4SB \times H_{KV} + \\begin{cases} 2SBH, & \text{if } FP8 \text{ is True} \\\\ 4SBH, & \text{if } FP8 \text{ is False} \\end{cases} \\\\
A_{MLP} = 3SBH + \\begin{cases} SBH + SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is True} \\\\ 2SBH + 2SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is True} \\\\ SBH + SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is False} \\\\ 2SBH + 2SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is False} \\end{cases} \\\\
A_{middle} = (A_{attn} + A_{MLP}) \times L \\\\
A_{ip} = (A_{input} + A_{middle}) \times \\begin{cases} (1 + \frac{pp - 1}{pp \times vp}), & \text{if } Interleaved\ Pipeline \text{ is True} \\\\ min(1, \frac{microbatch}{pp}), & \text{if } Interleaved\ Pipeline \text{ is False and pp > 1} \\\\ 1, & \text{other} \\end{cases} \\\\
Activation = \\begin{cases} \frac{A_{ip} + A_{output}}{tp \times cp}, & \text{if pp = 1} \\\\ \frac{A_{ip} + 2BSH}{tp \times cp}, & \text{if pp > 1} \\end{cases}
$$

***

$$
\\begin{gather}
{GPU\ numbers} = tp \times pp \times dp \times cp \\\\
{Total\ memory\ consumption} = {Model\ states} + Activation
\\end{gather}
$$
"""

with gr.Blocks() as demo:
    with gr.Row():
        # Text
        gr.Markdown(
            """
Here's a GPU memory calculator that helps you estimate memory consumption in LLM training.