{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "960bac80-51c7-4e9f-ad2d-84cd6c710f98",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import pandas as pd\n",
    "from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score,auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a34f21d0-0854-4a54-8f93-67718b2f969e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pickle file whose contents are loaded into `data`.\n",
    "file_path = \"roc_data2.pkl\"\n",
    "\n",
    "# NOTE(review): pickle.load can execute arbitrary code, so only point this at a trusted file.\n",
    "with open(file_path, mode='rb') as file:\n",
    "    data = pickle.load(file)\n",
    "\n",
    "# To inspect the loaded object, evaluate e.g. data[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f9febed4-ce50-4e30-96ea-4b538ce2f9a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of schools drawn from each graduation-rate group (divided by 100 when sampling).\n",
    "inc_slider = 1\n",
    "\n",
    "# Both test files are addressed relative to parent_location.\n",
    "parent_location = \"ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/\"\n",
    "test_info_location = f\"{parent_location}fullTest/test_info.txt\"\n",
    "test_location = f\"{parent_location}fullTest/test.txt\"\n",
    "\n",
    "# Read without a header row, so the columns are addressed by position (0 is used below).\n",
    "test_info = pd.read_csv(test_info_location, sep=',', header=None, engine='python')\n",
    "\n",
    "# Graduation-rate data, exposed as the two columns school_number and grad_rate.\n",
    "grad_rate_data = pd.DataFrame(\n",
    "    pd.read_pickle('school_grduation_rate.pkl'),\n",
    "    columns=['school_number', 'grad_rate'],\n",
    ")\n",
    "\n",
    "# 1. Distinct school numbers present in test_info (column 0).\n",
    "unique_schools = test_info[0].unique()\n",
    "\n",
    "# 2. Keep only the graduation-rate rows for those schools.\n",
    "schools = grad_rate_data.loc[grad_rate_data['school_number'].isin(unique_schools)]\n",
    "\n",
    "# 3. Cut-off between high and low graduation rates (adjust as needed).\n",
    "grad_rate_threshold = 0.9\n",
    "\n",
    "# 4. Split at the threshold: >= goes to the high group, < goes to the low group.\n",
    "high_grad_schools = schools.loc[schools['grad_rate'] >= grad_rate_threshold, 'school_number'].unique()\n",
    "low_grad_schools = schools.loc[schools['grad_rate'] < grad_rate_threshold, 'school_number'].unique()\n",
    "\n",
    "# 5. Draw inc_slider percent of each group; the fixed random_state keeps the draw repeatable.\n",
    "high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider / 100, random_state=1).tolist()\n",
    "low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider / 100, random_state=1).tolist()\n",
    "\n",
    "# 6. Pool the two draws into a single list of selected schools.\n",
    "random_schools = [*high_sample, *low_sample]\n",
    "\n",
    "# 7. Index labels of the test_info rows that belong to a selected school.\n",
    "indices = test_info.loc[test_info[0].isin(random_schools)].index.tolist()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fdfdf4b6-2752-4a21-9880-869af69f20cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index labels of the selected test_info rows, kept separately for each graduation-rate group.\n",
    "high_indices = test_info.loc[test_info[0].isin(high_sample)].index.tolist()\n",
    "low_indices = test_info.loc[test_info[0].isin(low_sample)].index.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a79a4598-5702-4cc8-9f07-8e18fdda648b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "997"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of selected test rows across both groups.\n",
    "len(high_indices) + len(low_indices)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4707f3e6-2f44-46d8-ad8c-b6c244f693af",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", " | 0 | \n", "
---|---|
5342 | \n", "PercentChange-0\\tNumeratorQuantity1-0\\tNumerat... | \n", "
5343 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
5344 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
5345 | \n", "PercentChange-0\\tNumeratorQuantity2-2\\tNumerat... | \n", "
5346 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tDenomin... | \n", "
... | \n", "... | \n", "
113359 | \n", "PercentChange-0\\tNumeratorQuantity2-2\\tNumerat... | \n", "
113360 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
113361 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
113362 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
113363 | \n", "PercentChange-0\\tNumeratorQuantity2-0\\tNumerat... | \n", "
997 rows × 1 columns
\n", "