File size: 4,821 Bytes
0f55eee
 
 
 
 
 
 
 
 
 
 
d6042ff
0f55eee
 
 
 
 
 
d6042ff
0f55eee
 
 
 
d6042ff
 
 
 
 
 
 
 
 
0f55eee
d6042ff
 
 
0f55eee
 
 
 
 
d6042ff
0f55eee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6042ff
0f55eee
d6042ff
0f55eee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6042ff
 
0f55eee
 
 
 
d6042ff
 
0f55eee
 
d6042ff
0f55eee
d6042ff
0f55eee
 
 
 
 
 
 
 
d6042ff
 
 
 
 
0f55eee
 
 
 
d6042ff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import shutil
import gradio as gr
from transformers import ReactCodeAgent, HfEngine, Tool
import pandas as pd

from gradio import Chatbot
from test_streaming import stream_to_gradio
from huggingface_hub import login
from gradio.data_classes import FileData

#login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

llm_engine = HfEngine("meta-llama/Meta-Llama-3.1-70B-Instruct")

agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=["numpy", "pandas", "matplotlib", "seaborn","scipy"],
    max_iterations=10,
)

base_prompt = """You are an expert data analyst.
You are given a data file and the data structure below.
The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!

When importing packages use this format: from package import module
For example: from matplotlib import pyplot as plt
Not: import matplotlib.pyplot as plt

As you work, check for NoneType values and convert to NAN.

Use the data file to answer the question or solve a problem given below.

In your final answer: summarize your findings
After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".

Structure of the data:
{structure_notes}

Question/Problem:
"""

example_notes="""This data is about the Titanic wreck in 1912.
The target figure is the survival of passengers, notes by 'Survived'
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them."""

def get_images_in_directory(directory):
    image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}

    image_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if os.path.splitext(file)[1].lower() in image_extensions:
                image_files.append(os.path.join(root, file))
    return image_files

def interact_with_agent(file_input, additional_notes):
    shutil.rmtree("./figures")
    os.makedirs("./figures")

    data_file = pd.read_csv(file_input)
    data_structure_notes = f"""- Description (output of .describe()):
    {data_file.describe()}
    - Columns with dtypes:
    {data_file.dtypes}"""

    prompt = base_prompt.format(structure_notes=data_structure_notes)

    if additional_notes and len(additional_notes) > 0:
        prompt += additional_notes

    messages = [gr.ChatMessage(role="user", content=additional_notes)]
    yield messages + [
        gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")
    ]

    plot_image_paths = {}
    for msg in stream_to_gradio(agent, prompt, data_file=data_file):
        messages.append(msg)
        for image_path in get_images_in_directory("./figures"):
            if image_path not in plot_image_paths:
                image_message = gr.ChatMessage(
                    role="assistant",
                    content=FileData(path=image_path, mime_type="image/png"),
                )
                plot_image_paths[image_path] = True
                messages.append(image_message)
        yield messages + [
            gr.ChatMessage(role="assistant", content="⏳ _Still processing..._")
        ]
    yield messages


with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.yellow,
    )
) as demo:
    gr.Markdown("""# Llama-3.1 Data analyst 📊🤔

Drop a `.csv` file below and ask a question about your data. 
**Llama-3.1-70B will analyze and answer.**""")
    file_input = gr.File(label="Your file to analyze")
    text_input = gr.Textbox(
        label="Ask a question about your data?"
    )
    submit = gr.Button("Run", variant="primary")
    chatbot = gr.Chatbot(
        label="Data Analyst Agent",
        type="messages",
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )
    # gr.Examples(
    #     examples=[["./example/titanic.csv", example_notes]],
    #     inputs=[file_input, text_input],
    #     cache_examples=False
    # )

    submit.click(interact_with_agent, [file_input, text_input], [chatbot])

if __name__ == "__main__":
    demo.launch(server_port=7861)