Spaces:

suryadev1
/

astra

Running

File size: 18,286 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "960bac80-51c7-4e9f-ad2d-84cd6c710f98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score,auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a34f21d0-0854-4a54-8f93-67718b2f969e",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_path = \"roc_data2.pkl\"\n",
    "\n",
    "# Open and load the pickle file\n",
    "with open(file_path, 'rb') as file:\n",
    "    data = pickle.load(file)\n",
    "\n",
    "\n",
    "# Print or use the data\n",
    "# data[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f9febed4-ce50-4e30-96ea-4b538ce2f9a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "inc_slider=1\n",
    "parent_location=\"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location=parent_location+\"fullTest/test_info.txt\"\n",
    "test_location=parent_location+\"fullTest/test.txt\"\n",
    "test_info = pd.read_csv(test_info_location, sep=',', header=None, engine='python')\n",
    "grad_rate_data = pd.DataFrame(pd.read_pickle('school_grduation_rate.pkl'),columns=['school_number','grad_rate'])  # Load the grad_rate data\n",
    "\n",
    "# Step 1: Extract unique school numbers from test_info\n",
    "unique_schools = test_info[0].unique()\n",
    "\n",
    "# Step 2: Filter the grad_rate_data using the unique school numbers\n",
    "schools = grad_rate_data[grad_rate_data['school_number'].isin(unique_schools)]\n",
    "\n",
    "# Define a threshold for high and low graduation rates (adjust as needed)\n",
    "grad_rate_threshold = 0.9  \n",
    "\n",
    "# Step 4: Divide schools into high and low graduation rate groups\n",
    "high_grad_schools = schools[schools['grad_rate'] >= grad_rate_threshold]['school_number'].unique()\n",
    "low_grad_schools = schools[schools['grad_rate'] < grad_rate_threshold]['school_number'].unique()\n",
    "\n",
    "# Step 5: Sample percentage of schools from each group\n",
    "high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()\n",
    "low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()\n",
    "\n",
    "# Step 6: Combine the sampled schools\n",
    "random_schools = high_sample + low_sample\n",
    "\n",
    "# Step 7: Get indices for the sampled schools\n",
    "indices = test_info[test_info[0].isin(random_schools)].index.tolist()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fdfdf4b6-2752-4a21-9880-869af69f20cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "high_indices = test_info[(test_info[0].isin(high_sample))].index.tolist()\n",
    "low_indices = test_info[(test_info[0].isin(low_sample))].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a79a4598-5702-4cc8-9f07-8e18fdda648b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(high_indices)+len(low_indices)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4707f3e6-2f44-46d8-ad8c-b6c244f693af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5342</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5343</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5344</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5345</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5346</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113359</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113360</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113361</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113362</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113363</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>997 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0\n",
       "5342    PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...\n",
       "5343    PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5344    PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5345    PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "5346    PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...\n",
       "...                                                   ...\n",
       "113359  PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "113360  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113361  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113362  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113363  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "\n",
       "[997 rows x 1 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the test file and select rows based on indices\n",
    "test = pd.read_csv(test_location, sep=',', header=None, engine='python')\n",
    "selected_rows_df2 = test.loc[indices]\n",
    "selected_rows_df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1d0c3d49-061f-486b-9c19-cf20945f3207",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "graduation_groups = [\n",
    "    'high' if idx in high_indices else 'low' for idx in selected_rows_df2.index\n",
    "]\n",
    "# graduation_groups\n",
    "len(graduation_groups)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d2508a0f-e5ca-432e-b99b-481ea4536d4d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "opt_task_groups = ['opt_task1' if test_info.loc[idx, 6] == 0 else 'opt_task2' for idx in selected_rows_df2.index]\n",
    "len(opt_task_groups)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ad0ce4a1-27fa-4867-8061-4054dbb340df",
   "metadata": {},
   "outputs": [],
   "source": [
    "t_label=data[0]\n",
    "p_label=data[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a4f4a2b9-3134-42ac-871b-4e117098cd0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 1: Align graduation_group, t_label, and p_label\n",
    "aligned_labels = list(zip(graduation_groups, t_label, p_label))\n",
    "opt_task_aligned = list(zip(opt_task_groups, t_label, p_label))\n",
    "# Step 2: Separate the labels for high and low groups\n",
    "high_t_labels = [t for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_t_labels = [t for grad, t, p in aligned_labels if grad == 'low']\n",
    "\n",
    "high_p_labels = [p for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_p_labels = [p for grad, t, p in aligned_labels if grad == 'low']\n",
    "\n",
    "\n",
    "opt_task1_t_labels = [t for task, t, p in opt_task_aligned if task == 'opt_task1']\n",
    "opt_task1_p_labels = [p for task, t, p in opt_task_aligned if task == 'opt_task1']\n",
    "\n",
    "opt_task2_t_labels = [t for task, t, p in opt_task_aligned if task == 'opt_task2']\n",
    "opt_task2_p_labels = [p for task, t, p in opt_task_aligned if task == 'opt_task2']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "74cda932-ce98-4ad5-9c29-a54bdc4ee086",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "opt_task1 ROC-AUC: 0.7592686234399062\n",
      "opt_task2 ROC-AUC: 0.7268598353289777\n"
     ]
    }
   ],
   "source": [
    "\n",
    "opt_task1_roc_auc = roc_auc_score(opt_task1_t_labels, opt_task1_p_labels) if len(set(opt_task1_t_labels)) > 1 else None\n",
    "opt_task2_roc_auc = roc_auc_score(opt_task2_t_labels, opt_task2_p_labels) if len(set(opt_task2_t_labels)) > 1 else None\n",
    "\n",
    "print(f\"opt_task1 ROC-AUC: {opt_task1_roc_auc}\")\n",
    "print(f\"opt_task2 ROC-AUC: {opt_task2_roc_auc}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "c8e34660-83d0-46a1-a218-95d609e11729",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(low_t_labels)+len(high_t_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c11050db-2636-4c50-9cd4-b9943e5cee83",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e1309e93-7063-4f48-bbc7-11a0d449c34e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC Score for High Graduation Rate Group: 0.675\n",
      "ROC-AUC Score for Low Graduation Rate Group: 0.7489795918367347\n"
     ]
    }
   ],
   "source": [
    "high_roc_auc = roc_auc_score(high_t_labels, high_p_labels) if len(set(high_t_labels)) > 1 else None\n",
    "low_roc_auc = roc_auc_score(low_t_labels, low_p_labels) if len(set(low_t_labels)) > 1 else None\n",
    "\n",
    "print(\"ROC-AUC Score for High Graduation Rate Group:\", high_roc_auc)\n",
    "print(\"ROC-AUC Score for Low Graduation Rate Group:\", low_roc_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a99e7812-817d-4f9f-b6fa-1a58aa3a34dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Task Analysis Summary:\n",
      "-----------------------\n",
      "Ideal Task = OptionalTask_1:\n",
      "  Only OptionalTask_1 done: 22501\n",
      "  Only OptionalTask_2 done: 20014\n",
      "  Both done: 24854\n",
      "  None done: 38\n",
      "Ideal Task = OptionalTask_2:\n",
      "  Only OptionalTask_1 done: 12588\n",
      "  Only OptionalTask_2 done: 18942\n",
      "  Both done: 15147\n",
      "  None done: 78\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def analyze_row(row):\n",
    "    # Split the row into fields\n",
    "    fields = row.split(\"\\t\")\n",
    "\n",
    "    # Define tasks for OptionalTask_1, OptionalTask_2, and FinalAnswer\n",
    "    optional_task_1_subtasks = [\"DenominatorFactor\", \"NumeratorFactor\", \"EquationAnswer\"]\n",
    "    optional_task_2_subtasks = [\n",
    "        \"FirstRow2:1\", \"FirstRow2:2\", \"FirstRow1:1\", \"FirstRow1:2\", \n",
    "        \"SecondRow\", \"ThirdRow\"\n",
    "    ]\n",
    "\n",
    "    # Helper function to evaluate task attempts\n",
    "    def evaluate_tasks(fields, tasks):\n",
    "        task_status = {}\n",
    "        for task in tasks:\n",
    "            relevant_attempts = [f for f in fields if task in f]\n",
    "            if any(\"OK\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (Successful)\"\n",
    "            elif any(\"ERROR\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (Error)\"\n",
    "            elif any(\"JIT\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (JIT)\"\n",
    "            else:\n",
    "                task_status[task] = \"Unattempted\"\n",
    "        return task_status\n",
    "\n",
    "    # Evaluate tasks for each category\n",
    "    optional_task_1_status = evaluate_tasks(fields, optional_task_1_subtasks)\n",
    "    optional_task_2_status = evaluate_tasks(fields, optional_task_2_subtasks)\n",
    "\n",
    "    # Check if tasks have any successful attempt\n",
    "    opt1_done = any(status == \"Attempted (Successful)\" for status in optional_task_1_status.values())\n",
    "    opt2_done = any(status == \"Attempted (Successful)\" for status in optional_task_2_status.values())\n",
    "\n",
    "    return opt1_done, opt2_done\n",
    "\n",
    "# Read data from test_info.txt\n",
    "# Read data from test_info.txt\n",
    "with open(test_info_location, \"r\") as file:\n",
    "    data = file.readlines()\n",
    "\n",
    "# Assuming test_info[7] is a list with ideal tasks for each instance\n",
    "ideal_tasks = test_info[6]  # A list where each element is either 1 or 2\n",
    "\n",
    "# Initialize counters\n",
    "task_counts = {\n",
    "    1: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0,\"none\":0},\n",
    "    2: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0,\"none\":0}\n",
    "}\n",
    "\n",
    "# Analyze rows\n",
    "for i, row in enumerate(data):\n",
    "    row = row.strip()\n",
    "    if not row:\n",
    "        continue\n",
    "\n",
    "    ideal_task = ideal_tasks[i]  # Get the ideal task for the current row\n",
    "    opt1_done, opt2_done = analyze_row(row)\n",
    "\n",
    "    if ideal_task == 0:\n",
    "        if opt1_done and not opt2_done:\n",
    "            task_counts[1][\"only_opt1\"] += 1\n",
    "        elif not opt1_done and opt2_done:\n",
    "            task_counts[1][\"only_opt2\"] += 1\n",
    "        elif opt1_done and opt2_done:\n",
    "            task_counts[1][\"both\"] += 1\n",
    "        else:\n",
    "            task_counts[1][\"none\"] +=1\n",
    "    elif ideal_task == 1:\n",
    "        if opt1_done and not opt2_done:\n",
    "            task_counts[2][\"only_opt1\"] += 1\n",
    "        elif not opt1_done and opt2_done:\n",
    "            task_counts[2][\"only_opt2\"] += 1\n",
    "        elif opt1_done and opt2_done:\n",
    "            task_counts[2][\"both\"] += 1\n",
    "        else:\n",
    "            task_counts[2][\"none\"] +=1\n",
    "\n",
    "# Create a string output for results\n",
    "output_summary = \"Task Analysis Summary:\\n\"\n",
    "output_summary += \"-----------------------\\n\"\n",
    "\n",
    "for ideal_task, counts in task_counts.items():\n",
    "    output_summary += f\"Ideal Task = OptionalTask_{ideal_task}:\\n\"\n",
    "    output_summary += f\"  Only OptionalTask_1 done: {counts['only_opt1']}\\n\"\n",
    "    output_summary += f\"  Only OptionalTask_2 done: {counts['only_opt2']}\\n\"\n",
    "    output_summary += f\"  Both done: {counts['both']}\\n\"\n",
    "    output_summary += f\"  None done: {counts['none']}\\n\"\n",
    "\n",
    "print(output_summary)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "3630406c-859a-43ab-a569-67d577cc9bf6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gradio as gr\n",
    "from matplotlib.figure import Figure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "99833638-882d-4c75-bcc3-031e39cfb5a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"roc_data.pkl\", \"rb\") as f:\n",
    "        fpr, tpr, _ = pickle.load(f)\n",
    "roc_auc = auc(fpr, tpr)\n",
    "\n",
    "# Create a matplotlib figure\n",
    "fig = Figure()\n",
    "ax = fig.add_subplot(1, 1, 1)\n",
    "ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')\n",
    "ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
    "ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title=f'Receiver Operating Curve (ROC)')\n",
    "ax.legend(loc=\"lower right\")\n",
    "ax.grid()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6eb3dece-5b33-4223-af9a-6b999bb2305b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}