import torch
import streamlit as st
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from transformers import AutoTokenizer, T5ForConditionalGeneration, GenerationConfig


def top_token_ids(outputs, threshold=-np.inf):
    """Return, for each output step, the ids of the tokens whose score exceeds
    `threshold`, sorted by ascending score."""
    indexes = []
    for tensor in outputs['scores']:
        scores = tensor[0]  # scores for the first (and only) sequence in the batch
        # flatten() guarantees a 1-D tensor even when a single token qualifies
        candidates = torch.argwhere(scores > threshold).flatten()
        # Order candidates by score so each bar chart reads low-to-high
        candidates = candidates[torch.argsort(scores[candidates])]
        indexes.append(candidates.cpu().numpy())
    return indexes


def plot_word_scores(token_ids, outputs, tokenizer, width=600):
    """Draw one horizontal bar chart per generation step, showing the score of
    every candidate token at that step."""
    fig = make_subplots(rows=len(token_ids), cols=1)
    for step, candidates in enumerate(token_ids):
        tokens = tokenizer.convert_ids_to_tokens(candidates.tolist())
        fig.add_trace(
            go.Bar(
                # Strip the SentencePiece word-boundary marker for readability
                y=[t[1:] if t.startswith('▁') else t for t in tokens],
                x=outputs['scores'][step][0][candidates].cpu().numpy(),
                orientation='h'
            ),
            row=step + 1, col=1
        )
    fig.update_layout(
        width=width,
        height=300 * len(token_ids),
        showlegend=False
    )
    return fig


st.title('How do LLMs choose their words?')

instruction = st.text_area(label='Write an instruction:', placeholder='Where is Venice located?')

col1, col2 = st.columns(2)
with col1:
    model_checkpoint = st.selectbox(
        "Model:",
        ("google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl")
    )
with col2:
    temperature = st.slider('Temperature:', min_value=0.0, max_value=1.0, value=0.5)
    top_p = st.slider('Top p:', min_value=0.5, max_value=1.0, value=0.99)
    # max_tokens = st.number_input('Max output length:', min_value=1, max_value=64, format='%i')
    max_tokens = st.slider('Max output length:', min_value=1, max_value=64, value=32)
    # threshold = st.number_input('Min token score:', value=-10.0)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(
    model_checkpoint,
    device_map="auto",
    offload_folder="offload"
)

# Alpaca-style instruction prompt
prompts = [
    f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
]

inputs = tokenizer(prompts[0], return_tensors="pt")
input_ids = inputs["input_ids"]  # .to("cuda") if running on a GPU

generation_config = GenerationConfig(
    do_sample=True,
    temperature=temperature,
    top_p=top_p,
    top_k=100,
    repetition_penalty=1.5,
    max_new_tokens=max_tokens,
)

if instruction:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"],
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True  # keep the per-step scores so we can plot them
        )
    output_text = tokenizer.decode(
        outputs['sequences'][0],
        skip_special_tokens=False
    ).strip()
    st.write(output_text)

    fig = plot_word_scores(top_token_ids(outputs, threshold=-10.0), outputs, tokenizer)
    st.plotly_chart(fig, theme=None, use_container_width=False)
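
# ---------------------------------------------------------------------------
# Usage note: launch the app with (assuming this file is saved as app.py)
#
#   streamlit run app.py
#
# Streamlit re-executes the whole script on every widget interaction, so the
# checkpoint above is re-loaded on each rerun. A minimal sketch of how the
# loading could be cached instead, using st.cache_resource (available in
# Streamlit >= 1.18) -- the load_model helper is hypothetical, not part of
# the app above:
#
#   @st.cache_resource
#   def load_model(checkpoint):
#       tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#       model = T5ForConditionalGeneration.from_pretrained(
#           checkpoint, device_map="auto", offload_folder="offload"
#       )
#       return tokenizer, model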