daniel-cerebras committed on
Commit
8e74de2
1 Parent(s): a836b76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -32
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- import time
3
  import json
4
  from cerebras.cloud.sdk import Cerebras
5
  from typing import List, Dict, Tuple, Any, Generator
@@ -24,14 +24,25 @@ def make_api_call(api_key: str, messages: List[Dict[str, str]], max_tokens: int,
24
 
25
  content = json.loads(response.choices[0].message.content)
26
 
27
- # Calculate tokens per second
28
- total_tokens = response.usage.total_tokens
29
- elapsed_time = end_time - start_time
30
- tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
 
 
 
 
 
 
 
31
 
32
  content['token_info'] = {
33
- 'total_tokens': total_tokens,
34
- 'tokens_per_second': tokens_per_second
 
 
 
 
35
  }
36
 
37
  return content
@@ -55,51 +66,50 @@ def generate_response(api_key: str, prompt: str) -> Generator[Tuple[List[Tuple[s
55
  ]
56
 
57
  steps = []
58
- step_count = 1
59
  total_thinking_time = 0
60
- total_tokens = 0
61
- total_tokens_per_second = 0
62
 
63
  while True:
64
- start_time = time.time()
65
  step_data = make_api_call(api_key, messages, 300)
66
- thinking_time = time.time() - start_time
67
- total_thinking_time += thinking_time
68
 
69
- token_info = step_data.pop('token_info', {'total_tokens': 0, 'tokens_per_second': 0})
70
- total_tokens += token_info['total_tokens']
71
- total_tokens_per_second += token_info['tokens_per_second']
72
 
 
73
  step_title = f"Step {step_count}: {step_data['title']}"
74
- step_content = f"{step_data['content']}\n\n**Cerebras LLM Call Duration: {thinking_time:.2f} seconds**\n**Tokens: {token_info['total_tokens']}, Tokens/s: {token_info['tokens_per_second']:.2f}**"
75
  steps.append((step_title, step_content))
76
  messages.append({"role": "assistant", "content": json.dumps(step_data)})
77
 
78
- # Yield the current conversation, total thinking time, total tokens, and average tokens per second
79
- yield steps, total_thinking_time, total_tokens, total_tokens_per_second / step_count if step_count > 0 else 0
 
 
 
80
 
81
  if step_data.get('next_action') == 'final_answer':
82
  break
83
-
84
- step_count += 1
85
 
86
  # Request the final answer
87
  messages.append({"role": "user", "content": "Please provide the final answer based on your reasoning above."})
88
 
89
- start_time = time.time()
90
  final_data = make_api_call(api_key, messages, 200, is_final_answer=True)
91
- thinking_time = time.time() - start_time
92
- total_thinking_time += thinking_time
93
 
94
- token_info = final_data.pop('token_info', {'total_tokens': 0, 'tokens_per_second': 0})
95
- total_tokens += token_info['total_tokens']
96
- total_tokens_per_second += token_info['tokens_per_second']
97
 
98
- final_content = f"{final_data.get('content', 'No final answer provided.')}\n\n**Final answer thinking time: {thinking_time:.2f} seconds**\n**Tokens: {token_info['total_tokens']}, Tokens/s: {token_info['tokens_per_second']:.2f}**"
99
  steps.append(("Final Answer", final_content))
100
 
101
- # Yield the final conversation, total thinking time, total tokens, and average tokens per second
102
- yield steps, total_thinking_time, total_tokens, total_tokens_per_second / (step_count + 1)
 
 
 
103
 
104
  def respond(api_key: str, message: str, history: List[Tuple[str, str]]) -> Generator[Tuple[List[Tuple[str, str]], str], None, None]:
105
  """
@@ -112,14 +122,14 @@ def respond(api_key: str, message: str, history: List[Tuple[str, str]]) -> Gener
112
  # Initialize the generator
113
  response_generator = generate_response(api_key, message)
114
 
115
- for steps, total_time, total_tokens, avg_tokens_per_second in response_generator:
116
  conversation = history.copy()
117
  for title, content in steps[len(conversation):]:
118
  if title.startswith("Step") or title == "Final Answer":
119
  conversation.append((title, content))
120
  else:
121
  conversation.append((title, content))
122
- yield conversation, f"**Total thinking time:** {total_time:.2f} seconds\n**Total tokens:** {total_tokens}\n**Average tokens/s:** {avg_tokens_per_second:.2f}"
123
 
124
  def main():
125
  with gr.Blocks() as demo:
 
1
  import gradio as gr
2
+ import time
3
  import json
4
  from cerebras.cloud.sdk import Cerebras
5
  from typing import List, Dict, Tuple, Any, Generator
 
24
 
25
  content = json.loads(response.choices[0].message.content)
26
 
27
+ # Access time_info attributes directly
28
+ queue_time = response.time_info.queue_time
29
+ prompt_time = response.time_info.prompt_time
30
+ completion_time = response.time_info.completion_time
31
+ total_time = response.time_info.total_time
32
+
33
+ # Use the provided usage information
34
+ completion_tokens = response.usage.completion_tokens
35
+
36
+ # Calculate tokens per second using completion tokens
37
+ tokens_per_second = completion_tokens / total_time if total_time > 0 else 0
38
 
39
  content['token_info'] = {
40
+ 'completion_tokens': completion_tokens,
41
+ 'tokens_per_second': tokens_per_second,
42
+ 'queue_time': queue_time,
43
+ 'prompt_time': prompt_time,
44
+ 'completion_time': completion_time,
45
+ 'total_time': total_time # Use total_time as the 'duration'
46
  }
47
 
48
  return content
 
66
  ]
67
 
68
  steps = []
69
+ step_count = 0
70
  total_thinking_time = 0
71
+ total_completion_tokens = 0
 
72
 
73
  while True:
 
74
  step_data = make_api_call(api_key, messages, 300)
75
+ token_info = step_data.pop('token_info', {'completion_tokens': 0, 'tokens_per_second': 0, 'duration': step_data.get('total_time', 0)})
 
76
 
77
+ # Use total_time from token_info as the duration
78
+ total_thinking_time += token_info.get('total_time', 0)
79
+ total_completion_tokens += token_info['completion_tokens']
80
 
81
+ step_count += 1
82
  step_title = f"Step {step_count}: {step_data['title']}"
83
+ step_content = f"{step_data['content']}\n\n**API Call Duration: {token_info['total_time']:.2f} seconds**\n**Completion Tokens: {token_info['completion_tokens']}, Tokens/s: {token_info['tokens_per_second']:.2f}**"
84
  steps.append((step_title, step_content))
85
  messages.append({"role": "assistant", "content": json.dumps(step_data)})
86
 
87
+ # Calculate the overall average tokens per second using completion tokens
88
+ overall_tokens_per_second = total_completion_tokens / total_thinking_time if total_thinking_time > 0 else 0
89
+
90
+ # Yield the current conversation, total thinking time, total completion tokens, and overall average tokens per second
91
+ yield steps, total_thinking_time, total_completion_tokens, overall_tokens_per_second
92
 
93
  if step_data.get('next_action') == 'final_answer':
94
  break
 
 
95
 
96
  # Request the final answer
97
  messages.append({"role": "user", "content": "Please provide the final answer based on your reasoning above."})
98
 
 
99
  final_data = make_api_call(api_key, messages, 200, is_final_answer=True)
100
+ token_info = final_data.pop('token_info', {'completion_tokens': 0, 'tokens_per_second': 0, 'duration': final_data.get('total_time', 0)})
 
101
 
102
+ total_thinking_time += token_info.get('total_time', 0)
103
+ total_completion_tokens += token_info['completion_tokens']
 
104
 
105
+ final_content = f"{final_data.get('content', 'No final answer provided.')}\n\n**Final answer API call duration: {token_info['total_time']:.2f} seconds**\n**Completion Tokens: {token_info['completion_tokens']}, Tokens/s: {token_info['tokens_per_second']:.2f}**"
106
  steps.append(("Final Answer", final_content))
107
 
108
+ # Calculate the final overall average tokens per second using completion tokens
109
+ overall_tokens_per_second = total_completion_tokens / total_thinking_time if total_thinking_time > 0 else 0
110
+
111
+ # Yield the final conversation, total thinking time, total completion tokens, and overall average tokens per second
112
+ yield steps, total_thinking_time, total_completion_tokens, overall_tokens_per_second
113
 
114
  def respond(api_key: str, message: str, history: List[Tuple[str, str]]) -> Generator[Tuple[List[Tuple[str, str]], str], None, None]:
115
  """
 
122
  # Initialize the generator
123
  response_generator = generate_response(api_key, message)
124
 
125
+ for steps, total_time, total_completion_tokens, avg_tokens_per_second in response_generator:
126
  conversation = history.copy()
127
  for title, content in steps[len(conversation):]:
128
  if title.startswith("Step") or title == "Final Answer":
129
  conversation.append((title, content))
130
  else:
131
  conversation.append((title, content))
132
+ yield conversation, f"**Total API call time:** {total_time:.2f} seconds\n**Completion tokens:** {total_completion_tokens}\n**Overall average tokens/s:** {avg_tokens_per_second:.2f}"
133
 
134
  def main():
135
  with gr.Blocks() as demo: