# Text-to-Embedding demo (Hugging Face Space)
# Standard library.
import json

# Third-party: Gradio for the web UI, NumPy for averaging embeddings,
# Sentence-Transformers for the embedding model.
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer

# Load a small general-purpose sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Cap the tokenized input length; longer texts are chunked before encoding
# (see text_to_embedding below).
model.max_seq_length = 256
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes NumPy arrays as plain nested lists."""

    def default(self, obj):
        # Convert ndarray to a (possibly nested) Python list; defer every
        # other type to the base class, which raises TypeError as usual.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)
def text_to_embedding(text):
    """Embed *text* with the sentence-transformer model; return a JSON string.

    Texts whose token count exceeds ``model.max_seq_length`` are split into
    windows of at most that many tokens; each window is encoded separately
    and the window embeddings are averaged element-wise. The resulting
    vector is serialized to JSON via NumpyEncoder.
    """
    # NOTE(review): assumes model.tokenize(text) returns a flat token
    # sequence (older sentence-transformers behavior); newer versions return
    # a dict of tensors — confirm against the pinned library version.
    tokens = model.tokenize(text)
    if len(tokens) > model.max_seq_length:
        # Split into max_seq_length-sized token windows and render each
        # window back into a string the encoder can consume.
        chunks = []
        for start in range(0, len(tokens), model.max_seq_length):
            window = tokens[start:start + model.max_seq_length]
            chunks.append(model.tokenizer.convert_tokens_to_string(window))
        # Encode every chunk, then average element-wise into one vector.
        embeddings = [model.encode(chunk) for chunk in chunks]
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        # Short enough to encode in a single pass.
        avg_embedding = model.encode(text)
    return json.dumps(avg_embedding, cls=NumpyEncoder)
# Wire the embedding function into a one-textbox-in, one-textbox-out UI.
# NOTE(review): gr.inputs / gr.outputs are the legacy (pre-3.x) Gradio API;
# on Gradio >= 3 these would be gr.Textbox(value=...) / gr.Textbox().
inputs = gr.inputs.Textbox(default="Type text here.")
outputs = gr.outputs.Textbox()
app = gr.Interface(fn=text_to_embedding, inputs=inputs, outputs=outputs, title="Text to Embedding")
app.launch()