{ "cells": [ { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "import sys\n", "sys.path.insert(0, \"..\")\n", "import vaex\n", "from vaex.ml import LabelEncoder\n", "import spacy\n", "import pandas as pd\n", "from tqdm import tqdm\n", "import os\n", "import multiprocessing as mp\n", "from src.preprocessing import PreprocessingPipeline, encode\n", "from src.wordifier import ModelConfigs\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "pipe = PreprocessingPipeline(\n", " language=\"English\",\n", " pre_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n", " lemmatization_step=list(PreprocessingPipeline.lemmatization_component().keys())[1],\n", " post_steps=list(PreprocessingPipeline.pipeline_components().keys()),\n", ")" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "def fn(t):\n", " return pipe.post(pipe.lemma(pipe.nlp(pipe.pre(t))))" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "vdf = vaex.from_pandas(df)\n", "vdf[\"processed_text\"] = vdf.apply(fn, arguments=[vdf[\"text\"]], vectorize=False)\n", "df = vdf.to_pandas_df()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-11-28 17:01:36.883 \n", " \u001b[33m\u001b[1mWarning:\u001b[0m to view this Streamlit app on a browser, run it with the following\n", " command:\n", "\n", " streamlit run /Users/pietrolesci/miniconda3/envs/wordify/lib/python3.7/site-packages/ipykernel_launcher.py [ARGUMENTS]\n" ] } ], "source": [ "import streamlit as st\n", "pbar = st.progress(0)\n", "N = 100\n", "for i, _ in enumerate(range(N)):\n", " if i % N == 0:\n", " pbar.progress(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "configs = ModelConfigs\n", "clf = Pipeline(\n", " [\n", " (\"tfidf\", TfidfVectorizer()),\n", " (\n", " \"classifier\",\n", " LogisticRegression(\n", " penalty=\"l1\",\n", " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n", " solver=\"liblinear\",\n", " multi_class=\"auto\",\n", " max_iter=500,\n", " class_weight=\"balanced\",\n", " ),\n", " ),\n", " ]\n", ")\n" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Pipeline(steps=[('tfidf', TfidfVectorizer()),\n", " ('classifier',\n", " LogisticRegression(C=1, class_weight='balanced', max_iter=500,\n", " penalty='l1', solver='liblinear'))])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.fit(df[\"text\"], df[\"label\"])" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['00', '000', '00001', ..., 'ís', 'über', 'überwoman'], dtype=object)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "def wordifier(df, text_col, label_col, configs=ModelConfigs):\n", "\n", " n_instances, n_features = X.shape\n", " n_classes = np.unique(y)\n", "\n", " # NOTE: the * 
10 / 10 trick is to have \"nice\" round-ups\n", " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n", "\n", " sample_size = min(\n", " # this is the maximum supported\n", " configs.MAX_SELECTION.value,\n", " # at minimum you want MIN_SELECTION but in general you want\n", " # n_instances * sample_fraction\n", " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n", " # however if previous one is bigger the the available instances take\n", " # the number of available instances\n", " n_instances,\n", " )\n", "\n", " # TODO: might want to try out something to subsample features at each iteration\n", "\n", " # initialize coefficient matrices\n", " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n", " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n", "\n", " for _ in range(configs.NUM_ITERS.value):\n", "\n", " # run randomized regression\n", " clf = Pipeline([\n", " ('tfidf', TfidfVectorizer()), \n", " ('classifier', LogisticRegression(\n", " penalty=\"l1\",\n", " C=configs.PENALTIES.value[\n", " np.random.randint(len(configs.PENALTIES.value))\n", " ],\n", " solver=\"liblinear\",\n", " multi_class=\"auto\",\n", " max_iter=500,\n", " class_weight=\"balanced\",\n", " ))]\n", " )\n", "\n", " # sample indices to subsample matrix\n", " selection = resample(\n", " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n", " )\n", "\n", " # fit\n", " try:\n", " clf.fit(X[selection], y[selection])\n", " except ValueError:\n", " continue\n", "\n", " # record coefficients\n", " if n_classes == 2:\n", " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n", " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n", " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n", " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n", " else:\n", " pos_scores += clf.coef_ > 0\n", " neg_scores += clf.coef_ < 0\n", "\n", "\n", " # normalize\n", " pos_scores = pos_scores / configs.NUM_ITERS.value\n", " neg_scores = neg_scores / configs.NUM_ITERS.value\n", "\n", " # get only active features\n", " pos_positions = np.where(\n", " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n", " )\n", " neg_positions = np.where(\n", " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n", " )\n", "\n", " # prepare DataFrame\n", " X_names = clf.steps[0][1].get_feature_names_out()\n", " pos = [\n", " (X_names[i], pos_scores[c, i], y_names[c])\n", " for c, i in zip(*pos_positions.nonzero())\n", " ]\n", " neg = [\n", " (X_names[i], neg_scores[c, i], y_names[c])\n", " for c, i in zip(*neg_positions.nonzero())\n", " ]\n", "\n", " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n", " [\"label\", \"score\"], ascending=False\n", " )\n", " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n", " [\"label\", \"score\"], ascending=False\n", " )\n", "\n", " return posdf, negdf" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "res = vdf.apply(wordifier, arguments=[vdf.processed_text, vdf.encoded_label], vectorize=False)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "from vaex.ml.sklearn import Predictor" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "clf = Pipeline(\n", " [\n", " (\n", " \"tfidf\",\n", " TfidfVectorizer(\n", " input=\"content\", # default: file already in memory\n", " encoding=\"utf-8\", # default\n", " decode_error=\"strict\", # default\n", 
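"                # NOTE (editor): the options below deliberately disable sklearn's own\n", "                # preprocessing, since the text has already been cleaned upstream by\n", "                # PreprocessingPipeline; only word n-gram extraction and\n", "                # document-frequency pruning remain active.\n",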
" strip_accents=None, # do nothing\n", " lowercase=False, # do nothing\n", " preprocessor=None, # do nothing - default\n", " tokenizer=None, # default\n", " stop_words=None, # do nothing\n", " analyzer=\"word\",\n", " ngram_range=(1, 3), # maximum 3-ngrams\n", " min_df=0.001,\n", " max_df=0.75,\n", " sublinear_tf=True,\n", " ),\n", " ),\n", " (\n", " \"classifier\",\n", " LogisticRegression(\n", " penalty=\"l1\",\n", " C=configs.PENALTIES.value[np.random.randint(len(configs.PENALTIES.value))],\n", " solver=\"liblinear\",\n", " multi_class=\"auto\",\n", " max_iter=500,\n", " class_weight=\"balanced\",\n", " ),\n", " ),\n", " ]\n", ")\n", "\n", "vaex_model = Predictor(\n", " features=[\"processed_text\"],\n", " target=\"encoded_label\",\n", " model=clf,\n", " prediction_name=\"prediction\",\n", ")\n" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "unhashable type: 'list'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m 
\"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'" ] } ], "source": [ "vaex_model.fit(vdf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'\\x80\\x03c__main__\\nwordifier\\nq\\x00.'" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pickle\n", 
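"# editor note: a module-level function pickles by reference -- the bytes below\n", "# are just the qualified name __main__.wordifier -- which is what lets vaex\n", "# ship `wordifier` to its worker processes\n",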
"pickle.dumps(wordifier)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "ename": "TypeError", "evalue": "unhashable type: 'list'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/b_/m81mmt0s6gv48kdvk44n2l740000gn/T/ipykernel_52217/687453386.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvaex_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/ml/sklearn.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, df, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m '''\n\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mvalues\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 6897\u001b[0m \u001b[0mIf\u001b[0m \u001b[0many\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[0mcontain\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0mare\u001b[0m \u001b[0mignored\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mmasked\u001b[0m \u001b[0melements\u001b[0m \u001b[0mare\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwell\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6898\u001b[0m \"\"\"\n\u001b[0;32m-> 6899\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__array__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m__array__\u001b[0;34m(self, dtype, parallel)\u001b[0m\n\u001b[1;32m 5989\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolumn_type\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5990\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot cast %r (of type %r) to %r\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5991\u001b[0;31m \u001b[0mchunks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'numpy'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5992\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misMaskedArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mchunk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mchunks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5993\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36mevaluate\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, progress)\u001b[0m\n\u001b[1;32m 2962\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate_iterator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2963\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2964\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_evaluate_implementation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpression\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mi1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mi2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mselection\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mselection\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiltered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfiltered\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marray_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparallel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparallel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mchunk_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprogress\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2966\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mdocsubst\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/wordify/lib/python3.7/site-packages/vaex/dataframe.py\u001b[0m in \u001b[0;36m_evaluate_implementation\u001b[0;34m(self, expression, i1, i2, out, selection, filtered, array_type, parallel, chunk_size, raw, progress)\u001b[0m\n\u001b[1;32m 6207\u001b[0m \u001b[0;31m# TODO: For NEP branch: dtype -> dtype_evaluate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6208\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6209\u001b[0;31m \u001b[0mexpression_to_evaluate\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# lets assume we have to do them all\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6210\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6211\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mexpression\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpressions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mTypeError\u001b[0m: unhashable type: 'list'" ] } ], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res = []\n", "with tqdm(total=len(df)) as pbar:\n", " for doc in tqdm(nlp.pipe(df[\"text\"].values, batch_size=500, n_process=n_cpus)):\n", " res.append([i.lemma_ for i in doc])\n", " pbar.update(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fn(t):\n", " return " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%timeit\n", "with mp.Pool(mp.cpu_count()) as pool:\n", " new_s = pool.map(nlp, df[\"text\"].values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from typing import List\n", "import numpy as np\n", "import pandas 
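{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Editor's sketch (not part of the original run): the wordifier below draws a\n", "# stratified bootstrap with sklearn.utils.resample, i.e. n_samples indices\n", "# drawn with replacement while preserving the class proportions given via\n", "# `stratify`. The toy labels here are illustrative only.\n", "import numpy as np\n", "from sklearn.utils import resample\n", "\n", "y_demo = np.array([0] * 8 + [1] * 2)\n", "idx = resample(np.arange(10), replace=True, stratify=y_demo, n_samples=5)\n", "print(idx, y_demo[idx])  # ~4 zeros and ~1 one, mirroring the 80/20 split\n", "\n", "# The sample size uses the * 10 / 10 trick: e.g. 1500 features over 10000\n", "# instances gives 1500/10000 = 0.15 -> ceil(1.5)/10 = 0.2, rounded up to\n", "# one decimal place.\n", "print(np.ceil((1500 / 10000) * 10) / 10)" ] },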
as pd\n", "import streamlit as st\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.utils import resample\n", "\n", "from src.configs import ModelConfigs\n", "\n", "\n", "def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):\n", "\n", " n_instances, n_features = X.shape\n", " n_classes = len(y_names)\n", "\n", " # NOTE: the * 10 / 10 trick is to have \"nice\" round-ups\n", " sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n", "\n", " sample_size = min(\n", " # this is the maximum supported\n", " configs.MAX_SELECTION.value,\n", " # at minimum you want MIN_SELECTION but in general you want\n", " # n_instances * sample_fraction\n", " max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n", " # however if previous one is bigger the the available instances take\n", " # the number of available instances\n", " n_instances,\n", " )\n", "\n", " # TODO: might want to try out something to subsample features at each iteration\n", "\n", " # initialize coefficient matrices\n", " pos_scores = np.zeros((n_classes, n_features), dtype=int)\n", " neg_scores = np.zeros((n_classes, n_features), dtype=int)\n", "\n", " with st.spinner(\"Wordifying!\"):\n", " pbar = st.progress(0)\n", "\n", " for i, _ in enumerate(range(configs.NUM_ITERS.value)):\n", "\n", " # run randomized regression\n", " clf = LogisticRegression(\n", " penalty=\"l1\",\n", " C=configs.PENALTIES.value[\n", " np.random.randint(len(configs.PENALTIES.value))\n", " ],\n", " solver=\"liblinear\",\n", " multi_class=\"auto\",\n", " max_iter=500,\n", " class_weight=\"balanced\",\n", " )\n", "\n", " # sample indices to subsample matrix\n", " selection = resample(\n", " np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size\n", " )\n", "\n", " # fit\n", " try:\n", " clf.fit(X[selection], y[selection])\n", " except ValueError:\n", " continue\n", "\n", " # record coefficients\n", " if n_classes == 2:\n", " pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n", " neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n", " pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n", " neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n", " else:\n", " pos_scores += clf.coef_ > 0\n", " neg_scores += clf.coef_ < 0\n", "\n", " pbar.progress(i + 1)\n", "\n", " # normalize\n", " pos_scores = pos_scores / configs.NUM_ITERS.value\n", " neg_scores = neg_scores / configs.NUM_ITERS.value\n", "\n", " # get only active features\n", " pos_positions = np.where(\n", " pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0\n", " )\n", " neg_positions = np.where(\n", " neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0\n", " )\n", "\n", " # prepare DataFrame\n", " pos = [\n", " (X_names[i], pos_scores[c, i], y_names[c])\n", " for c, i in zip(*pos_positions.nonzero())\n", " ]\n", " neg = [\n", " (X_names[i], neg_scores[c, i], y_names[c])\n", " for c, i in zip(*neg_positions.nonzero())\n", " ]\n", "\n", " posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values(\n", " [\"label\", \"score\"], ascending=False\n", " )\n", " negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values(\n", " [\"label\", \"score\"], ascending=False\n", " )\n", "\n", " return posdf, negdf\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = \"../../../../Downloads/wordify_10000_copy.xlsx\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = \"../../../../Downloads/wordify_10000_copy.xlsx\"" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_excel(path, dtype=str).dropna()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df = pd.read_excel(\"../data/test_de.xlsx\")\n", "# mdf = mpd.read_csv(\"../data/test_en.csv\")\n", "from src.configs import Languages  # assumption: the Languages enum lives beside ModelConfigs\n", "\n", "language = \"English\"\n", "nlp = spacy.load(Languages[language].value, exclude=[\"parser\", \"ner\", \"pos\", \"tok2vec\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from src.preprocessing import TextPreprocessor  # assumption: defined alongside PreprocessingPipeline\n", "\n", "prep = TextPreprocessor(\n", "    language=\"English\",\n", "    cleaning_steps=list(TextPreprocessor._cleaning_options().keys()),\n", "    lemmatizer_when=None,\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"p_text\"] = prep.fit_transform(df[\"text\"])" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X, y, X_names, y_names = encode(df[\"p_text\"], df[\"label\"]).values()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "clf = LogisticRegression(\n", "    penalty=\"l1\",\n", "    C=0.05,  # ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))]\n", "    solver=\"liblinear\",\n", "    multi_class=\"auto\",\n", "    max_iter=500,\n", "    class_weight=\"balanced\",\n", ")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%time\n", "clf.fit(X, y)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from tqdm import trange\n", "\n", "n_instances, n_features = X.shape\n", "n_classes = len(y_names)\n", "\n", "# NOTE: the * 10 / 10 trick rounds the fraction up to one decimal place\n", "sample_fraction = np.ceil((n_features / n_instances) * 10) / 10\n", "\n", "sample_size = min(\n", "    # this is the maximum supported\n", "    ModelConfigs.MAX_SELECTION.value,\n", "    # at minimum you want MIN_SELECTION but in general you want\n", "    # n_instances * sample_fraction\n", "    max(ModelConfigs.MIN_SELECTION.value, int(n_instances * sample_fraction)),\n", "    # however, if the previous one is bigger than the available\n", "    # instances, take the number of available instances\n", "    n_instances,\n", ")\n", "\n", "# TODO: might want to try out something to subsample features at each iteration\n", "\n", "# initialize coefficient matrices\n", "pos_scores = np.zeros((n_classes, n_features), dtype=int)\n", "neg_scores = np.zeros((n_classes, n_features), dtype=int)\n", "\n", "for _ in trange(ModelConfigs.NUM_ITERS.value):\n", "\n", "    # run randomized regression\n", "    clf = LogisticRegression(\n", "        penalty=\"l1\",\n", "        C=ModelConfigs.PENALTIES.value[np.random.randint(len(ModelConfigs.PENALTIES.value))],\n", "        solver=\"liblinear\",\n", "        multi_class=\"auto\",\n", "        max_iter=500,\n", "        class_weight=\"balanced\",\n", "    )\n", "\n", "    # sample indices to subsample matrix\n", "    selection = resample(np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size)\n", "\n", "    # fit\n", "    try:\n", "        clf.fit(X[selection], y[selection])\n", "    except ValueError:\n", "        continue\n", "\n", "    # record coefficients\n", "    if n_classes == 2:\n", "        pos_scores[1] = pos_scores[1] + (clf.coef_ > 0.0)\n", "        neg_scores[1] = neg_scores[1] + (clf.coef_ < 0.0)\n", "        pos_scores[0] = pos_scores[0] + (clf.coef_ < 0.0)\n", "        neg_scores[0] = neg_scores[0] + (clf.coef_ > 0.0)\n", "    else:\n", "        pos_scores += clf.coef_ > 0\n", "        neg_scores += clf.coef_ < 0" ] }, { "cell_type": "code", "execution_count": 
null, "metadata": {}, "outputs": [], "source": [ "# normalize\n", "pos_scores = pos_scores / ModelConfigs.NUM_ITERS.value\n", "neg_scores = neg_scores / ModelConfigs.NUM_ITERS.value\n", "\n", "# get only active features\n", "pos_positions = np.where(pos_scores >= ModelConfigs.SELECTION_THRESHOLD.value, pos_scores, 0)\n", "neg_positions = np.where(neg_scores >= ModelConfigs.SELECTION_THRESHOLD.value, neg_scores, 0)\n", "\n", "# prepare DataFrame\n", "pos = [(X_names[i], pos_scores[c, i], y_names[c]) for c, i in zip(*pos_positions.nonzero())]\n", "neg = [(X_names[i], neg_scores[c, i], y_names[c]) for c, i in zip(*neg_positions.nonzero())]\n", "\n", "posdf = pd.DataFrame(pos, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)\n", "negdf = pd.DataFrame(neg, columns=\"word score label\".split()).sort_values([\"label\", \"score\"], ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "aa7efd0b3ada76bb0689aa8ed0b61d7de788847e3d11d2d142fc5800c765982f" }, "kernelspec": { "display_name": "Python 3.8.3 64-bit ('py38': conda)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.11" }, "orig_nbformat": 2 }, "nbformat": 4, "nbformat_minor": 2 }