{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "960bac80-51c7-4e9f-ad2d-84cd6c710f98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a34f21d0-0854-4a54-8f93-67718b2f969e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.\n",
    "file_path = \"roc_data2.pkl\"\n",
    "\n",
    "# Deserialize the pickle payload into `data`.\n",
    "with open(file_path, mode='rb') as file:\n",
    "    data = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f9febed4-ce50-4e30-96ea-4b538ce2f9a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Settings\n",
    "inc_slider = 1  # percent of schools sampled from each graduation-rate group\n",
    "grad_rate_threshold = 0.9  # rates at or above this value form the high group\n",
    "parent_location = \"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location = parent_location + \"fullTest/test_info.txt\"\n",
    "test_location = parent_location + \"fullTest/test.txt\"\n",
    "\n",
    "# Inputs: header-less test metadata and the per-school graduation rates\n",
    "test_info = pd.read_csv(test_info_location, sep=',', header=None, engine='python')\n",
    "grad_rate_data = pd.DataFrame(\n",
    "    pd.read_pickle('school_grduation_rate.pkl'),\n",
    "    columns=['school_number', 'grad_rate'],\n",
    ")\n",
    "\n",
    "# School numbers that occur in column 0 of test_info\n",
    "unique_schools = test_info[0].unique()\n",
    "\n",
    "# Graduation-rate rows restricted to those schools\n",
    "schools = grad_rate_data.loc[grad_rate_data['school_number'].isin(unique_schools)]\n",
    "\n",
    "# Split the school numbers at the threshold (>= is high, < is low)\n",
    "high_grad_schools = schools.loc[schools['grad_rate'] >= grad_rate_threshold, 'school_number'].unique()\n",
    "low_grad_schools = schools.loc[schools['grad_rate'] < grad_rate_threshold, 'school_number'].unique()\n",
    "\n",
    "# Draw the same fraction from each group with a fixed seed\n",
    "high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider / 100, random_state=1).tolist()\n",
    "low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider / 100, random_state=1).tolist()\n",
    "random_schools = high_sample + low_sample\n",
    "\n",
    "# Row labels of test_info that belong to any sampled school\n",
    "indices = test_info.index[test_info[0].isin(random_schools)].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "fdfdf4b6-2752-4a21-9880-869af69f20cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row labels of test_info split by the group of the sampled school\n",
    "high_indices = test_info.index[test_info[0].isin(high_sample)].tolist()\n",
    "low_indices = test_info.index[test_info[0].isin(low_sample)].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a79a4598-5702-4cc8-9f07-8e18fdda648b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(high_indices) + len(low_indices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4707f3e6-2f44-46d8-ad8c-b6c244f693af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
5342PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...
5343PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
5344PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
5345PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...
5346PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...
......
113359PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...
113360PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
113361PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
113362PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
113363PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...
\n", "

997 rows × 1 columns

\n", "
" ],
      "text/plain": [
       " 0\n",
       "5342 PercentChange-0\\tNumeratorQuantity1-0\\tNumerat...\n",
       "5343 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5344 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "5345 PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "5346 PercentChange-0\\tNumeratorQuantity2-0\\tDenomin...\n",
       "... ...\n",
       "113359 PercentChange-0\\tNumeratorQuantity2-2\\tNumerat...\n",
       "113360 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113361 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113362 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "113363 PercentChange-0\\tNumeratorQuantity2-0\\tNumerat...\n",
       "\n",
       "[997 rows x 1 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the test file and select rows based on indices\n",
    "test = pd.read_csv(test_location, sep=',', header=None, engine='python')\n",
    "selected_rows_df2 = test.loc[indices]\n",
    "selected_rows_df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1d0c3d49-061f-486b-9c19-cf20945f3207",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label every selected row 'high' or 'low'. A set makes each membership test\n",
    "# constant-time instead of scanning the high_indices list once per row.\n",
    "high_index_set = set(high_indices)\n",
    "graduation_groups = [\n",
    "    'high' if idx in high_index_set else 'low' for idx in selected_rows_df2.index\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "ad0ce4a1-27fa-4867-8061-4054dbb340df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): presumably data[0] holds the true labels and data[1] the predicted scores - confirm.\n",
    "t_label = data[0]\n",
    "p_label = data[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "a4f4a2b9-3134-42ac-871b-4e117098cd0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# zip() stops at the shortest input, so a length mismatch would silently drop\n",
    "# rows and could pair labels with the wrong group. Report it instead of hiding it.\n",
    "if not (len(graduation_groups) == len(t_label) == len(p_label)):\n",
    "    print(\n",
    "        \"WARNING: length mismatch - graduation_groups:\", len(graduation_groups),\n",
    "        \"t_label:\", len(t_label), \"p_label:\", len(p_label),\n",
    "    )\n",
    "\n",
    "# Pair each selected row's group with its true and predicted label\n",
    "aligned_labels = list(zip(graduation_groups, t_label, p_label))\n",
    "\n",
    "# Separate the true labels by group\n",
    "high_t_labels = [t for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_t_labels = [t for grad, t, p in aligned_labels if grad == 'low']\n",
    "\n",
    "# Separate the predicted labels by group\n",
    "high_p_labels = [p for grad, t, p in aligned_labels if grad == 'high']\n",
    "low_p_labels = [p for grad, t, p in aligned_labels if grad == 'low']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "c8e34660-83d0-46a1-a218-95d609e11729",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(low_t_labels)+len(high_t_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "c11050db-2636-4c50-9cd4-b9943e5cee83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "e1309e93-7063-4f48-bbc7-11a0d449c34e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ROC-AUC Score for High Graduation Rate Group: 0.675\n",
      "ROC-AUC Score for Low Graduation Rate Group: 0.7489795918367347\n"
     ]
    }
   ],
   "source": [
    "def group_roc_auc(true_labels, scores):\n",
    "    \"\"\"Return the ROC-AUC for one group, or None when fewer than two distinct true labels are present.\"\"\"\n",
    "    if len(set(true_labels)) < 2:\n",
    "        return None\n",
    "    return roc_auc_score(true_labels, scores)\n",
    "\n",
    "\n",
    "high_roc_auc = group_roc_auc(high_t_labels, high_p_labels)\n",
    "low_roc_auc = group_roc_auc(low_t_labels, low_p_labels)\n",
    "\n",
    "print(\"ROC-AUC Score for High Graduation Rate Group:\", high_roc_auc)\n",
    "print(\"ROC-AUC Score for Low Graduation Rate Group:\", low_roc_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a99e7812-817d-4f9f-b6fa-1a58aa3a34dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths and the metadata frame are set up again so this cell also runs on its own.\n",
    "parent_location = \"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location = parent_location + \"fullTest/test_info.txt\"\n",
    "test_location = parent_location + \"fullTest/test.txt\"\n",
    "test_info = pd.read_csv(test_info_location, sep=',', header=None, engine='python')\n",
    "\n",
    "# Column of test_info that holds each row's ideal optional task.\n",
    "# NOTE(review): an earlier run of this cell read column 7 - confirm which column is correct.\n",
    "IDEAL_TASK_COLUMN = 6\n",
    "\n",
    "# The counting below treats 0 as \"OptionalTask_1 is ideal\" and 1 as \"OptionalTask_2 is ideal\".\n",
    "# NOTE(review): the previous comment described the values as 1 or 2 - confirm the encoding.\n",
    "IDEAL_TASK_TO_OPTION = {0: 1, 1: 2}\n",
    "\n",
    "OPTIONAL_TASK_1_SUBTASKS = [\"DenominatorFactor\", \"NumeratorFactor\", \"EquationAnswer\"]\n",
    "OPTIONAL_TASK_2_SUBTASKS = [\n",
    "    \"FirstRow2:1\", \"FirstRow2:2\", \"FirstRow1:1\", \"FirstRow1:2\",\n",
    "    \"SecondRow\", \"ThirdRow\",\n",
    "]\n",
    "\n",
    "\n",
    "def _evaluate_tasks(fields, tasks):\n",
    "    \"\"\"Map each task name to a status derived from the fields that mention it.\n",
    "\n",
    "    A field mentions a task when the task name is a substring of the field.\n",
    "    \"OK\" takes precedence over \"ERROR\", which takes precedence over \"JIT\";\n",
    "    a task with none of these markers is reported as \"Unattempted\".\n",
    "    \"\"\"\n",
    "    task_status = {}\n",
    "    for task in tasks:\n",
    "        relevant_attempts = [f for f in fields if task in f]\n",
    "        if any(\"OK\" in attempt for attempt in relevant_attempts):\n",
    "            task_status[task] = \"Attempted (Successful)\"\n",
    "        elif any(\"ERROR\" in attempt for attempt in relevant_attempts):\n",
    "            task_status[task] = \"Attempted (Error)\"\n",
    "        elif any(\"JIT\" in attempt for attempt in relevant_attempts):\n",
    "            task_status[task] = \"Attempted (JIT)\"\n",
    "        else:\n",
    "            task_status[task] = \"Unattempted\"\n",
    "    return task_status\n",
    "\n",
    "\n",
    "def analyze_row(row, ideal_opt_task=None):\n",
    "    \"\"\"Report whether each optional task has a successful subtask in `row`.\n",
    "\n",
    "    Args:\n",
    "        row: one line of text; it is split on tab characters.\n",
    "        ideal_opt_task: not used by the analysis; kept so existing calls keep working.\n",
    "\n",
    "    Returns:\n",
    "        (opt1_done, opt2_done): True when at least one subtask of the respective\n",
    "        optional task has the status \"Attempted (Successful)\".\n",
    "    \"\"\"\n",
    "    fields = row.split(\"\\t\")\n",
    "    opt1_status = _evaluate_tasks(fields, OPTIONAL_TASK_1_SUBTASKS)\n",
    "    opt2_status = _evaluate_tasks(fields, OPTIONAL_TASK_2_SUBTASKS)\n",
    "    opt1_done = \"Attempted (Successful)\" in opt1_status.values()\n",
    "    opt2_done = \"Attempted (Successful)\" in opt2_status.values()\n",
    "    return opt1_done, opt2_done\n",
    "\n",
    "\n",
    "# Read the raw lines under their own name so the pickle payload in `data` is not overwritten.\n",
    "with open(test_info_location, \"r\") as file:\n",
    "    info_rows = [line.strip() for line in file if line.strip()]\n",
    "\n",
    "# One ideal task per row. Calling int() on the whole column raises TypeError,\n",
    "# so the column is converted element-wise and paired with the raw lines.\n",
    "ideal_tasks = test_info[IDEAL_TASK_COLUMN].astype(int).tolist()\n",
    "if len(info_rows) != len(ideal_tasks):\n",
    "    raise ValueError(\n",
    "        f\"test_info.txt has {len(info_rows)} non-empty lines but {len(ideal_tasks)} \"\n",
    "        \"parsed rows; cannot pair each line with its ideal task\"\n",
    "    )\n",
    "\n",
    "# Counters keyed by the ideal optional task (1 or 2)\n",
    "task_counts = {\n",
    "    1: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0},\n",
    "    2: {\"only_opt1\": 0, \"only_opt2\": 0, \"both\": 0},\n",
    "}\n",
    "\n",
    "for row, ideal_opt_task in zip(info_rows, ideal_tasks):\n",
    "    option = IDEAL_TASK_TO_OPTION.get(ideal_opt_task)\n",
    "    if option is None:\n",
    "        # Rows with any other ideal-task value are not counted.\n",
    "        continue\n",
    "    opt1_done, opt2_done = analyze_row(row, ideal_opt_task)\n",
    "    if opt1_done and opt2_done:\n",
    "        task_counts[option][\"both\"] += 1\n",
    "    elif opt1_done:\n",
    "        task_counts[option][\"only_opt1\"] += 1\n",
    "    elif opt2_done:\n",
    "        task_counts[option][\"only_opt2\"] += 1\n",
    "\n",
    "# Build the text summary\n",
    "summary_parts = [\"Task Analysis Summary:\\n\", \"-----------------------\\n\"]\n",
    "for ideal_task, counts in task_counts.items():\n",
    "    summary_parts.append(f\"Ideal Task = OptionalTask_{ideal_task}:\\n\")\n",
    "    summary_parts.append(f\" Only OptionalTask_1 done: {counts['only_opt1']}\\n\")\n",
    "    summary_parts.append(f\" Only OptionalTask_2 done: {counts['only_opt2']}\\n\")\n",
    "    summary_parts.append(f\" Both done: {counts['both']}\\n\")\n",
    "output_summary = \"\".join(summary_parts)\n",
    "\n",
    "print(output_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65ad9383-741f-44eb-8e8f-853ee7bc52a2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}