File size: 3,463 Bytes
02c2d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51cab9d
 
 
02c2d7e
 
 
 
 
 
 
51cab9d
 
 
02c2d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51cab9d
 
 
 
 
 
02c2d7e
 
51cab9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02c2d7e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from typing import List
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from stqdm import stqdm

from .configs import ModelConfigs

stqdm.pandas()


def wordifier(X, y, X_names: List[str], y_names: List[str], configs=ModelConfigs):
    """Identify class-indicative words via repeated L1-penalized logistic fits.

    Runs ``configs.NUM_ITERS`` randomized logistic regressions on stratified
    bootstrap subsamples of (X, y), counts how often each feature receives a
    positive / negative coefficient per class, and keeps features whose
    selection frequency clears ``configs.SELECTION_THRESHOLD``.

    Args:
        X: feature matrix of shape (n_instances, n_features).
        y: class labels aligned with the rows of X.
        X_names: feature (word) names, one per column of X.
        y_names: class label names, one per class.
        configs: enum-like settings object providing MAX_SELECTION,
            MIN_SELECTION, NUM_ITERS, PENALTIES, SELECTION_THRESHOLD.

    Returns:
        (posdf, negdf): DataFrames with columns word/score/label, sorted by
        label then score descending, for positively and negatively
        indicative words respectively.
    """
    n_instances, n_features = X.shape
    n_classes = len(y_names)

    # Round the feature/instance ratio up to one decimal place
    # (the * 10 / 10 trick yields "nice" fractions).
    sample_fraction = np.ceil((n_features / n_instances) * 10) / 10

    # Clamp the per-iteration subsample size: at least MIN_SELECTION,
    # ideally n_instances * sample_fraction, capped by MAX_SELECTION and
    # never more than the instances actually available.
    desired = max(configs.MIN_SELECTION.value, int(n_instances * sample_fraction))
    sample_size = min(configs.MAX_SELECTION.value, desired, n_instances)

    # TODO: might want to try out something to subsample features at each iteration

    # Per-class counters of how often each feature got a +/- coefficient.
    pos_scores = np.zeros((n_classes, n_features), dtype=int)
    neg_scores = np.zeros((n_classes, n_features), dtype=int)

    with st.spinner("Wordifying!"):

        for _ in stqdm(range(configs.NUM_ITERS.value)):

            # Randomized regression: draw a fresh L1 penalty strength C
            # from the configured pool each round.
            penalty_idx = np.random.randint(len(configs.PENALTIES.value))
            clf = LogisticRegression(
                penalty="l1",
                C=configs.PENALTIES.value[penalty_idx],
                solver="liblinear",
                multi_class="auto",
                max_iter=500,
                class_weight="balanced",
            )

            # Stratified bootstrap of row indices.
            selection = resample(
                np.arange(n_instances), replace=True, stratify=y, n_samples=sample_size
            )

            try:
                clf.fit(X[selection], y[selection])
            except ValueError:
                # Degenerate subsample (e.g. a class went missing) — skip.
                # NOTE(review): skipped rounds still count in the NUM_ITERS
                # normalization below — presumably intentional; confirm.
                continue

            # Record coefficient signs.
            if n_classes == 2:
                # liblinear emits a single coefficient row for the positive
                # class; mirror the signs to score class 0.
                positive = clf.coef_ > 0.0
                negative = clf.coef_ < 0.0
                pos_scores[1] = pos_scores[1] + positive
                neg_scores[1] = neg_scores[1] + negative
                pos_scores[0] = pos_scores[0] + negative
                neg_scores[0] = neg_scores[0] + positive
            else:
                pos_scores += clf.coef_ > 0
                neg_scores += clf.coef_ < 0

        # Turn raw counts into selection frequencies over all iterations.
        pos_scores = pos_scores / configs.NUM_ITERS.value
        neg_scores = neg_scores / configs.NUM_ITERS.value

        # Zero out features selected less often than the threshold.
        pos_positions = np.where(
            pos_scores >= configs.SELECTION_THRESHOLD.value, pos_scores, 0
        )
        neg_positions = np.where(
            neg_scores >= configs.SELECTION_THRESHOLD.value, neg_scores, 0
        )

        # Collect (word, frequency, label) triples for surviving features.
        pos = [
            (X_names[i], pos_scores[c, i], y_names[c])
            for c, i in zip(*pos_positions.nonzero())
        ]
        neg = [
            (X_names[i], neg_scores[c, i], y_names[c])
            for c, i in zip(*neg_positions.nonzero())
        ]

    columns = ["word", "score", "label"]
    posdf = pd.DataFrame(pos, columns=columns).sort_values(
        ["label", "score"], ascending=False
    )
    negdf = pd.DataFrame(neg, columns=columns).sort_values(
        ["label", "score"], ascending=False
    )

    return posdf, negdf