Spaces:

suryadev1
/

astra

Running

File size: 18,568 Bytes

a326aed

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "960bac80-51c7-4e9f-ad2d-84cd6c710f98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a34f21d0-0854-4a54-8f93-67718b2f969e",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_path = \"roc_data2.pkl\"\n",
    "\n",
    "# Deserialize the saved object from disk.\n",
    "# NOTE(review): pickle.load can execute arbitrary code during deserialization;\n",
    "# only open pickle files that come from a trusted source.\n",
    "with open(file_path, 'rb') as file:\n",
    "    data = pickle.load(file)\n",
    "\n",
    "\n",
    "# Later cells index the loaded object as data[0] and data[1].\n",
    "# data[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f9febed4-ce50-4e30-96ea-4b538ce2f9a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of schools to draw from each graduation-rate group.\n",
    "inc_slider = 1\n",
    "\n",
    "parent_location = \"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location = f\"{parent_location}fullTest/test_info.txt\"\n",
    "test_location = f\"{parent_location}fullTest/test.txt\"\n",
    "\n",
    "test_info = pd.read_csv(test_info_location, header=None, sep=',', engine='python')\n",
    "grad_rate_data = pd.DataFrame(\n",
    "    pd.read_pickle('school_grduation_rate.pkl'),\n",
    "    columns=['school_number', 'grad_rate'],\n",
    ")\n",
    "\n",
    "# Restrict the graduation-rate table to schools that appear in column 0 of test_info.\n",
    "unique_schools = test_info[0].unique()\n",
    "in_test_info = grad_rate_data['school_number'].isin(unique_schools)\n",
    "schools = grad_rate_data[in_test_info]\n",
    "\n",
    "# Schools at or above the threshold are 'high', strictly below are 'low'.\n",
    "# Two separate comparisons are used so rows with a missing rate fall in neither group.\n",
    "grad_rate_threshold = 0.9\n",
    "is_high = schools['grad_rate'] >= grad_rate_threshold\n",
    "is_low = schools['grad_rate'] < grad_rate_threshold\n",
    "high_grad_schools = schools.loc[is_high, 'school_number'].unique()\n",
    "low_grad_schools = schools.loc[is_low, 'school_number'].unique()\n",
    "\n",
    "# Draw the same fraction from each group with a fixed seed for repeatability.\n",
    "sample_fraction = inc_slider / 100\n",
    "high_sample = pd.Series(high_grad_schools).sample(frac=sample_fraction, random_state=1).tolist()\n",
    "low_sample = pd.Series(low_grad_schools).sample(frac=sample_fraction, random_state=1).tolist()\n",
    "\n",
    "# Row labels of every test_info entry that belongs to a sampled school.\n",
    "random_schools = [*high_sample, *low_sample]\n",
    "in_sample = test_info[0].isin(random_schools)\n",
    "indices = test_info.loc[in_sample].index.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fdfdf4b6-2752-4a21-9880-869af69f20cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row labels of test_info for each sampled group, kept separately.\n",
    "in_high_sample = test_info[0].isin(high_sample)\n",
    "in_low_sample = test_info[0].isin(low_sample)\n",
    "high_indices = test_info.loc[in_high_sample].index.tolist()\n",
    "low_indices = test_info.loc[in_low_sample].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a79a4598-5702-4cc8-9f07-8e18fdda648b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Combined number of row labels across the high and low sampled groups.\n",
    "len(high_indices)+len(low_indices)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4707f3e6-2f44-46d8-ad8c-b6c244f693af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5342</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5343</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5344</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5345</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5346</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113359</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113360</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113361</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113362</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113363</th>\n",
       "      <td>PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>997 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0\n",
       "5342    PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...\n",
       "5343    PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5344    PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5345    PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "5346    PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...\n",
       "...                                                   ...\n",
       "113359  PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "113360  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113361  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113362  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113363  PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "\n",
       "[997 rows x 1 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the test file (comma-separated, no header row) and keep only the rows\n",
    "# whose labels were collected in `indices`; the selection is shown as the cell output.\n",
    "test = pd.read_csv(test_location, header=None, sep=',', engine='python')\n",
    "selected_rows_df2 = test.loc[indices, :]\n",
    "selected_rows_df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1d0c3d49-061f-486b-9c19-cf20945f3207",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Testing membership against a list scans it for every row; a set keeps the\n",
    "# whole labelling pass linear in the number of selected rows.\n",
    "high_index_set = set(high_indices)\n",
    "\n",
    "# Label each selected row 'high' when its row label is in the high group,\n",
    "# and 'low' otherwise.\n",
    "graduation_groups = [\n",
    "    'high' if idx in high_index_set else 'low' for idx in selected_rows_df2.index\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "ad0ce4a1-27fa-4867-8061-4054dbb340df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First two entries of the loaded pickle; later passed to roc_auc_score as the\n",
    "# true labels and the predicted scores respectively.\n",
    "t_label, p_label = data[0], data[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "a4f4a2b9-3134-42ac-871b-4e117098cd0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# zip() silently stops at the shortest input, which would pair labels with the\n",
    "# wrong rows; fail loudly if the three sequences do not have the same length.\n",
    "if not (len(graduation_groups) == len(t_label) == len(p_label)):\n",
    "    raise ValueError(\n",
    "        \"graduation_groups, t_label and p_label must have the same length, got \"\n",
    "        f\"{len(graduation_groups)}, {len(t_label)} and {len(p_label)}\"\n",
    "    )\n",
    "\n",
    "# Step 1: Align graduation_group, t_label, and p_label position by position\n",
    "aligned_labels = list(zip(graduation_groups, t_label, p_label))\n",
    "\n",
    "# Step 2: Separate the true labels (t) and predictions (p) for the high and low groups\n",
    "high_t_labels = [t for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_t_labels = [t for grad, t, p in aligned_labels if grad == 'low']\n",
    "\n",
    "high_p_labels = [p for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_p_labels = [p for grad, t, p in aligned_labels if grad == 'low']\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "c8e34660-83d0-46a1-a218-95d609e11729",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Combined number of true labels across the low and high groups.\n",
    "len(low_t_labels)+len(high_t_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "c11050db-2636-4c50-9cd4-b9943e5cee83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e1309e93-7063-4f48-bbc7-11a0d449c34e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC Score for High Graduation Rate Group: 0.675\n",
      "ROC-AUC Score for Low Graduation Rate Group: 0.7489795918367347\n"
     ]
    }
   ],
   "source": [
    "# Only score a group whose true labels contain more than one distinct value;\n",
    "# otherwise report None for that group.\n",
    "if len(set(high_t_labels)) > 1:\n",
    "    high_roc_auc = roc_auc_score(high_t_labels, high_p_labels)\n",
    "else:\n",
    "    high_roc_auc = None\n",
    "\n",
    "if len(set(low_t_labels)) > 1:\n",
    "    low_roc_auc = roc_auc_score(low_t_labels, low_p_labels)\n",
    "else:\n",
    "    low_roc_auc = None\n",
    "\n",
    "print(\"ROC-AUC Score for High Graduation Rate Group:\", high_roc_auc)\n",
    "print(\"ROC-AUC Score for Low Graduation Rate Group:\", low_roc_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a99e7812-817d-4f9f-b6fa-1a58aa3a34dc",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "cannot convert the series to <class 'int'>",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[4], line 47\u001b[0m\n\u001b[0;32m     44\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(test_info_location, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[0;32m     45\u001b[0m     data \u001b[38;5;241m=\u001b[39m file\u001b[38;5;241m.\u001b[39mreadlines()\n\u001b[1;32m---> 47\u001b[0m ideal_opt_task \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtest_info\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m7\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m  \u001b[38;5;66;03m# Assuming test_info[7] is accessible and holds the ideal task (1 or 2)\u001b[39;00m\n\u001b[0;32m     49\u001b[0m \u001b[38;5;66;03m# Initialize counters\u001b[39;00m\n\u001b[0;32m     50\u001b[0m task_counts \u001b[38;5;241m=\u001b[39m {\n\u001b[0;32m     51\u001b[0m     \u001b[38;5;241m1\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly_opt1\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly_opt2\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboth\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m},\n\u001b[0;32m     52\u001b[0m     \u001b[38;5;241m2\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly_opt1\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124monly_opt2\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mboth\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m}\n\u001b[0;32m     53\u001b[0m }\n",
      "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\series.py:230\u001b[0m, in \u001b[0;36m_coerce_method.<locals>.wrapper\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    222\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    223\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCalling \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconverter\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m on a single element Series is \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    224\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdeprecated and will raise a TypeError in the future. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    227\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    228\u001b[0m     )\n\u001b[0;32m    229\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m converter(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miloc[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m--> 230\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcannot convert the series to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconverter\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[1;31mTypeError\u001b[0m: cannot convert the series to <class 'int'>"
     ]
    }
   ],
   "source": [
    "# Locations of the full test set and its per-row info file.\n",
    "parent_location = \"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location = f\"{parent_location}fullTest/test_info.txt\"\n",
    "test_location = f\"{parent_location}fullTest/test.txt\"\n",
    "# Parsed as comma-separated with no header row, so columns are addressed by integer position.\n",
    "test_info = pd.read_csv(test_info_location, header=None, sep=',', engine='python')\n",
    "\n",
    "def analyze_row(row, ideal_opt_task):\n",
    "    # Split the row into fields\n",
    "    fields = row.split(\"\\t\")\n",
    "\n",
    "    # Define tasks for OptionalTask_1, OptionalTask_2, and FinalAnswer\n",
    "    optional_task_1_subtasks = [\"DenominatorFactor\", \"NumeratorFactor\", \"EquationAnswer\"]\n",
    "    optional_task_2_subtasks = [\n",
    "        \"FirstRow2:1\", \"FirstRow2:2\", \"FirstRow1:1\", \"FirstRow1:2\", \n",
    "        \"SecondRow\", \"ThirdRow\"\n",
    "    ]\n",
    "    final_answer_tasks = [\"FinalAnswer\"]\n",
    "\n",
    "    # Helper function to evaluate task attempts\n",
    "    def evaluate_tasks(fields, tasks):\n",
    "        task_status = {}\n",
    "        for task in tasks:\n",
    "            relevant_attempts = [f for f in fields if task in f]\n",
    "            if any(\"OK\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (Successful)\"\n",
    "            elif any(\"ERROR\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (Error)\"\n",
    "            elif any(\"JIT\" in attempt for attempt in relevant_attempts):\n",
    "                task_status[task] = \"Attempted (JIT)\"\n",
    "            else:\n",
    "                task_status[task] = \"Unattempted\"\n",
    "        return task_status\n",
    "\n",
    "    # Evaluate tasks for each category\n",
    "    optional_task_1_status = evaluate_tasks(fields, optional_task_1_subtasks)\n",
    "    optional_task_2_status = evaluate_tasks(fields, optional_task_2_subtasks)\n",
    "\n",
    "    # Check if tasks have any successful attempt\n",
    "    opt1_done = any(status == \"Attempted (Successful)\" for status in optional_task_1_status.values())\n",
    "    opt2_done = any(status == \"Attempted (Successful)\" for status in optional_task_2_status.values())\n",
    "\n",
    "    return opt1_done, opt2_done\n",
    "\n",
    "# Read the raw lines of test_info.txt, dropping blank ones so they pair up with\n",
    "# the rows pandas parsed into `test_info` (read_csv skips blank lines by default).\n",
    "# Stored under its own name so the pickle contents held in `data` are not overwritten.\n",
    "with open(test_info_location, \"r\") as file:\n",
    "    raw_rows = [line.strip() for line in file if line.strip()]\n",
    "\n",
    "# int() cannot convert a whole Series (it raised TypeError), so the ideal task is\n",
    "# taken row by row instead.\n",
    "# NOTE(review): column 6 presumably holds the ideal optional task encoded as 0/1,\n",
    "# matching the values the counting below checks for - confirm against the file format.\n",
    "ideal_opt_tasks = test_info[6].astype(int).tolist()\n",
    "\n",
    "# Pairing lines with parsed rows is only valid when both have the same length.\n",
    "if len(raw_rows) != len(ideal_opt_tasks):\n",
    "    raise ValueError(\n",
    "        f\"test_info.txt has {len(raw_rows)} non-blank lines but {len(ideal_opt_tasks)} parsed rows\"\n",
    "    )\n",
    "\n",
    "# Initialize counters\n",
    "task_counts = {\n",
    "    1: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0},\n",
    "    2: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0}\n",
    "}\n",
    "\n",
    "for row, ideal_opt_task in zip(raw_rows, ideal_opt_tasks):\n",
    "    opt1_done, opt2_done = analyze_row(row, ideal_opt_task)\n",
    "\n",
    "    # Ideal task 0 is tallied under key 1 and ideal task 1 under key 2;\n",
    "    # rows with any other value are not counted.\n",
    "    counts = task_counts.get(ideal_opt_task + 1)\n",
    "    if counts is None:\n",
    "        continue\n",
    "\n",
    "    # Rows where neither optional task succeeded are left out of every counter.\n",
    "    if opt1_done and opt2_done:\n",
    "        counts[\"both\"] += 1\n",
    "    elif opt1_done:\n",
    "        counts[\"only_opt1\"] += 1\n",
    "    elif opt2_done:\n",
    "        counts[\"only_opt2\"] += 1\n",
    "\n",
    "# Create a string output for results\n",
    "output_summary = \"Task Analysis Summary:\\n\"\n",
    "output_summary += \"-----------------------\\n\"\n",
    "\n",
    "for ideal_task, counts in task_counts.items():\n",
    "    output_summary += f\"Ideal Task = OptionalTask_{ideal_task}:\\n\"\n",
    "    output_summary += f\"  Only OptionalTask_1 done: {counts['only_opt1']}\\n\"\n",
    "    output_summary += f\"  Only OptionalTask_2 done: {counts['only_opt2']}\\n\"\n",
    "    output_summary += f\"  Both done: {counts['both']}\\n\"\n",
    "\n",
    "print(output_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65ad9383-741f-44eb-8e8f-853ee7bc52a2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}