import queue
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import streamlit as st

from models import chat_with_model, embed
from prompts import create_gen_prompt, create_judge_prompt


def generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key, temperature, top_p):
    """Generates an answer to a question using the specified language model."""
    gen_prompt = create_gen_prompt(question, previous_answers)
    try:
        new_answer = chat_with_model(prompt=gen_prompt, model=model_name, open_router_key=open_router_key,
                                     openai_api_key=openai_api_key, temperature=temperature, top_p=top_p)
        return new_answer
    except Exception as e:
        st.error(f"Error generating answer: {str(e)}")
        return None


def evaluate_answer(question, new_answer, open_router_key, openai_api_key, judge_model_name, temperature, top_p):
    """Evaluates the coherence of an answer using the judge model."""
    judge_prompt = create_judge_prompt(question, new_answer)
    try:
        judge_response = chat_with_model(prompt=judge_prompt, model=judge_model_name, open_router_key=open_router_key,
                                         openai_api_key=openai_api_key, temperature=temperature, top_p=top_p)
        coherence_score = int(judge_response.split("<coherence_score>")[1].split("</coherence_score>")[0])
        return coherence_score
    except Exception as e:
        st.error(f"Error getting judge response: {str(e)}")
        return None
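
# Note: the judge model's reply is expected to wrap its score in tags that
# evaluate_answer can parse, for example (illustrative only):
#   "... reasoning about the answer ... <coherence_score>8</coherence_score>"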


def process_question(question, model_name, open_router_key, openai_api_key, result_queue, judge_model_name,
                     coherence_threshold, novelty_threshold, temperature, top_p):
    """Generates answers for a single question until coherence or novelty falls below its
    threshold, yielding each accepted answer as a result dict."""
    start_time = time.time()
    previous_answers = []
    question_novelty = 0
    coherence_score = None  # Initialized so the final return is valid even if no answer is generated

    try:
        while True:
            new_answer = generate_answer(question, previous_answers, model_name, open_router_key, openai_api_key,
                                         temperature, top_p)
            if new_answer is None:
                break

            coherence_score = evaluate_answer(question, new_answer, open_router_key, openai_api_key,
                                              judge_model_name, temperature, top_p)
            if coherence_score is None:
                break
            if coherence_score <= coherence_threshold:
                break

            novelty_score = get_novelty_score(new_answer, previous_answers, openai_api_key)
            if novelty_score < novelty_threshold:
                break

            result_dict = {
                "type": "answer",
                "question": question,
                "answer": new_answer,
                "coherence_score": coherence_score,
                "novelty_score": novelty_score,
                "results": [
                    {
                        "question": question,
                        "answers": previous_answers.copy() + [new_answer],
                        "coherence_score": coherence_score,
                        "novelty_score": question_novelty + novelty_score
                    }
                ]
            }
            if result_queue is not None:  # Multithreaded mode also reports through the queue
                result_queue.put(result_dict)
            yield result_dict  # Yield each accepted answer as soon as it is available

            previous_answers.append(new_answer)
            question_novelty += novelty_score

    except Exception as e:
        error = {"type": "error", "message": str(e)}
        if result_queue is not None:
            result_queue.put(error)
        else:
            yield error  # Sequential mode has no queue, so report the error directly

    time_taken = time.time() - start_time
    summary = {
        "type": "summary",
        "question": question,
        "total_novelty": question_novelty,
        "time_taken": time_taken
    }
    if result_queue is not None:
        result_queue.put(summary)
    else:
        yield summary  # Sequential mode has no queue, so report the summary directly

    return question_novelty, [
        {
            "question": question,
            "answers": previous_answers,
            "coherence_score": coherence_score,
            "novelty_score": question_novelty
        }
    ]


def get_novelty_score(new_answer: str, previous_answers: list, openai_api_key):
    """Scores how different an answer is from all previous answers via embedding similarity."""
    new_embedding = embed(new_answer, openai_api_key)

    # If there are no previous answers, return maximum novelty
    if not previous_answers:
        return 1.0

    previous_embeddings = [embed(answer, openai_api_key) for answer in previous_answers]

    similarities = [
        np.dot(new_embedding, prev_embedding) /
        (np.linalg.norm(new_embedding) * np.linalg.norm(prev_embedding))
        for prev_embedding in previous_embeddings
    ]

    max_similarity = max(similarities)
    novelty = 1 - max_similarity
    return novelty
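
# Illustration (not part of the pipeline): novelty is 1 minus the highest cosine
# similarity to any previous answer, so a near-duplicate answer scores close to 0
# while a semantically unrelated one scores close to 1.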


def benchmark_model_multithreaded(model_name, questions, open_router_key, openai_api_key, max_threads=None,
                                  judge_model_name=None, coherence_threshold=None, novelty_threshold=None,
                                  temperature=0, top_p=0):
    """Benchmarks the model on all questions concurrently, one worker per question."""
    novelty_score = 0
    results = []
    result_queue = queue.Queue()  # Workers report summaries and errors through this queue

    # Use max_threads if provided, otherwise default to the number of questions
    if max_threads is None:
        max_workers = len(questions)
    else:
        max_workers = max_threads

    def run_question(q):
        # Fully consume the generator inside the worker thread so questions actually run in parallel
        return list(process_question(q, model_name, open_router_key, openai_api_key, result_queue,
                                     judge_model_name, coherence_threshold, novelty_threshold,
                                     temperature, top_p))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks to the thread pool
        future_to_question = {
            executor.submit(run_question, question): question
            for question in questions
        }

        # Display answer results as each question's worker finishes
        for future in as_completed(future_to_question):
            for result in future.result():
                if result["type"] == "answer":
                    st.write(f"**Question:** {result['question']}")
                    st.write(f"**New Answer:**\n{result['answer']}")
                    st.success(f"Coherence Score: {result['coherence_score']}")
                    st.write(f"**Novelty Score:** {result['novelty_score']}")
                    results.extend(result["results"])
                    novelty_score += result["novelty_score"]
                    st.info(f"Total novelty score across all questions (so far): {novelty_score}")

        # Summaries and errors are reported by the workers through the queue;
        # answer dicts in the queue are skipped because they were already handled above.
        while not result_queue.empty():
            result = result_queue.get()
            if result["type"] == "summary":
                st.info(f"Total novelty score for question '{result['question']}': {result['total_novelty']}")
                st.info(f"Time taken: {result['time_taken']} seconds")
            elif result["type"] == "error":
                st.error(f"Error in thread: {result['message']}")

    st.info(f"Final total novelty score across all questions: {novelty_score}")
    return results


def benchmark_model_sequential(model_name, questions, open_router_key, openai_api_key, judge_model_name,
                               coherence_threshold, novelty_threshold, temperature, top_p):
    """Benchmarks the model on each question in turn, displaying results as they are generated."""
    novelty_score = 0
    results = []

    for question in questions:
        for result in process_question(question, model_name, open_router_key, openai_api_key, None,
                                       judge_model_name, coherence_threshold, novelty_threshold,
                                       temperature, top_p):
            if result["type"] == "answer":
                st.write(f"**Question:** {result['question']}")
                st.write(f"**New Answer:**\n{result['answer']}")
                st.success(f"Coherence Score: {result['coherence_score']}")
                st.write(f"**Novelty Score:** {result['novelty_score']}")
                results.extend(result["results"])
                novelty_score += result["novelty_score"]
            elif result["type"] == "summary":
                st.info(f"Total novelty score for question '{result['question']}': {result['total_novelty']}")
                st.info(f"Time taken: {result['time_taken']} seconds")
            elif result["type"] == "error":
                st.error(f"Error: {result['message']}")

    st.info(f"Final total novelty score across all questions: {novelty_score}")
    return results
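

if __name__ == "__main__":
    # Minimal usage sketch. The model names, API keys, questions, and threshold
    # values below are hypothetical placeholders; this module is normally driven
    # from a Streamlit app (e.g. via `streamlit run`), where the st.* calls above
    # render into the page.
    example_questions = [
        "What is a novel way to reduce urban traffic congestion?",
    ]
    benchmark_model_sequential(
        model_name="openai/gpt-4o-mini",        # hypothetical model identifier
        questions=example_questions,
        open_router_key="YOUR_OPENROUTER_KEY",  # placeholder
        openai_api_key="YOUR_OPENAI_KEY",       # placeholder
        judge_model_name="openai/gpt-4o-mini",  # hypothetical judge model
        coherence_threshold=3,                  # illustrative threshold
        novelty_threshold=0.1,                  # illustrative threshold
        temperature=0.7,
        top_p=1.0,
    )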