Spaces:
Sleeping
Sleeping
File size: 1,522 Bytes
bcf7ef1 d1a1982 9a1bb4c bcf7ef1 f9e6213 d1a1982 bcf7ef1 f6866c2 bcf7ef1 d1afd99 bcf7ef1 ef4c9c2 bcf7ef1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import gradio as gr
import json
import numpy as np
from sentence_transformers import SentenceTransformer
# Load a small, fast sentence-embedding model (384-dim output).
model = SentenceTransformer('all-MiniLM-L6-v2')
# Cap the encoder's input at 256 tokens; longer inputs are chunked by
# text_to_embedding() below rather than silently truncated.
model.max_seq_length = 256
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy arrays as plain Python lists."""

    def default(self, obj):
        # ndarray -> (nested) list; anything else falls through to the
        # base class, which raises TypeError for unsupported types.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
def text_to_embedding(text):
    """Embed *text* with the sentence-transformer model.

    Texts longer than the model's maximum sequence length are split into
    token chunks, each chunk is embedded separately, and the chunk
    embeddings are averaged into a single vector.

    Returns the embedding serialized as a JSON array string.
    """
    # BUG FIX: SentenceTransformer.tokenize() expects a batch of texts and
    # returns a dict of tensors, so len()/slicing on its result was wrong.
    # The underlying HF tokenizer returns a flat list of token strings,
    # which convert_tokens_to_string() can reassemble into text.
    tokens = model.tokenizer.tokenize(text)
    if len(tokens) > model.max_seq_length:
        # Rebuild text chunks of at most max_seq_length tokens each.
        # NOTE(review): encode() re-adds special tokens ([CLS]/[SEP]), so a
        # full chunk may be truncated by a couple of tokens — acceptable here.
        chunks = [
            model.tokenizer.convert_tokens_to_string(tokens[i:i + model.max_seq_length])
            for i in range(0, len(tokens), model.max_seq_length)
        ]
        # Encode all chunks in one batched call, then average the rows.
        avg_embedding = np.mean(model.encode(chunks), axis=0)
    else:
        # Within the limit: encode the text directly.
        avg_embedding = model.encode(text)
    return json.dumps(avg_embedding, cls=NumpyEncoder)
# Build and launch the web UI.  The gr.inputs / gr.outputs namespaces and
# the `default=` kwarg were deprecated in Gradio 2.x and removed in 3.0+;
# components are created directly and `value=` sets the initial text.
inputs = gr.Textbox(value="Type text here.")
outputs = gr.Textbox()
app = gr.Interface(fn=text_to_embedding, inputs=inputs, outputs=outputs, title="Text to Embedding")
app.launch()
|