Spaces:
Build error
Build error
File size: 3,463 Bytes
02c2d7e 51cab9d 02c2d7e 51cab9d 02c2d7e 51cab9d 02c2d7e 51cab9d 02c2d7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
from typing import List
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from stqdm import stqdm
from .configs import ModelConfigs
# Module-level side effect: hook stqdm into pandas. Presumably this mirrors
# `tqdm.pandas()` and registers the `progress_apply` family of methods --
# confirm against the stqdm docs. Nothing in the code visible here uses it.
stqdm.pandas()
def _scores_to_frame(scores, threshold, X_names, y_names):
    # Turn a (label, feature) score matrix into a sorted word/score/label table,
    # keeping only the non-zero entries that reach `threshold`.
    kept = np.where(scores >= threshold, scores, 0)
    rows = [
        (X_names[i], scores[c, i], y_names[c]) for c, i in zip(*kept.nonzero())
    ]
    return pd.DataFrame(rows, columns=["word", "score", "label"]).sort_values(
        ["label", "score"], ascending=False
    )


def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
    """Score how consistently each feature is selected for each label.

    For ``configs.NUM_ITERS`` rounds, an L1-penalised logistic regression
    (liblinear solver, balanced class weights) is fitted on a stratified
    bootstrap subsample of the rows of ``X``, with ``C`` drawn at random from
    ``configs.PENALTIES``. For every (label, feature) pair the function counts
    how often the fitted coefficient is strictly positive and how often it is
    strictly negative. Counts are divided by ``NUM_ITERS`` and only entries
    reaching ``configs.SELECTION_THRESHOLD`` are reported.

    Args:
        X: 2-D feature matrix exposing ``.shape`` and supporting row selection
            with an integer index array (presumably a NumPy array or SciPy
            sparse matrix -- confirm against the caller).
        y: Labels aligned with the rows of ``X``; must support indexing with
            an integer array. Row ``c`` of the fitted coefficients is reported
            as ``y_names[c]``, so ``y`` presumably holds integer codes
            ``0..len(y_names) - 1`` -- confirm against the caller.
        X_names: Feature names, indexed by column of ``X``.
        y_names: Label names, indexed by class.
        configs: Object whose members ``NUM_ITERS``, ``PENALTIES``,
            ``MIN_SELECTION``, ``MAX_SELECTION`` and ``SELECTION_THRESHOLD``
            each expose a ``.value`` attribute.

    Returns:
        A ``(posdf, negdf)`` tuple of DataFrames with columns ``word``,
        ``score`` and ``label``, sorted by label and then score in descending
        order. ``posdf`` holds the positive-coefficient selection rates and
        ``negdf`` the negative-coefficient ones.

    Raises:
        ZeroDivisionError: If ``X`` has no rows and its shape entries are
            plain ints (the sampling fraction divides by the row count).
    """
    n_instances, n_features = X.shape
    n_classes = len(y_names)

    # read the settings once instead of on every iteration
    num_iters = configs.NUM_ITERS.value
    penalties = configs.PENALTIES.value

    # NOTE: the * 10 / 10 trick is to have "nice" round-ups
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10
    sample_size = min(
        # this is the maximum supported
        configs.MAX_SELECTION.value,
        # at minimum you want MIN_SELECTION but in general you want
        # n_instances * sample_fraction
        max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction)),
        # however, if the previous one is larger than the number of available
        # instances, take the number of available instances
        n_instances,
    )

    # TODO: might want to try out something to subsample features at each iteration

    # initialize coefficient matrices
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    # A two-class fit produces a single coefficient row; otherwise the code
    # below relies on one row per entry of `y_names`.
    expected_rows = 1 if n_classes == 2 else n_classes

    with st.spinner("Wordifying!"):
        for _ in stqdm(range(num_iters)):
            # run randomized regression
            # `multi_class` is deliberately not passed: the default behaves
            # like the former explicit "auto", while passing it explicitly is
            # deprecated since scikit-learn 1.5 and scheduled for removal.
            clf = LogisticRegression(
                penalty="l1",
                C=penalties[np.random.randint(len(penalties))],
                solver="liblinear",
                max_iter=500,
                class_weight="balanced",
            )

            # sample indices to subsample matrix
            selection = resample(
                np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
            )

            # fit; a failed fit is skipped on a best-effort basis
            try:
                clf.fit(X[selection], y[selection])
            except ValueError:
                continue

            coef = clf.coef_
            if coef.shape[0] != expected_rows:
                # The subsample did not contain the expected set of classes.
                # A single row would silently broadcast into every class row
                # and any other mismatch would raise, so treat this iteration
                # like a failed fit.
                continue

            # record coefficients
            if n_classes == 2:
                # the single row describes class 1; class 0 is its mirror image
                positive = coef[0] > 0.0
                negative = coef[0] < 0.0
                pos_scores[1] += positive
                neg_scores[1] += negative
                pos_scores[0] += negative
                neg_scores[0] += positive
            else:
                pos_scores += coef > 0
                neg_scores += coef < 0

    # normalize by the number of requested iterations, so skipped iterations
    # count as "feature not selected"
    pos_scores = pos_scores / num_iters
    neg_scores = neg_scores / num_iters

    # keep only active features and prepare the DataFrames
    threshold = configs.SELECTION_THRESHOLD.value
    posdf = _scores_to_frame(pos_scores, threshold, X_names, y_names)
    negdf = _scores_to_frame(neg_scores, threshold, X_names, y_names)

    return posdf, negdf
|