|
import gradio as gr |
|
from time import time |
|
from scipy import sparse |
|
from scipy import linalg |
|
|
|
from sklearn.datasets import make_regression |
|
from sklearn.linear_model import Lasso |
|
|
|
|
|
def load_dataset(n_samples=200, n_features=5000, random_state=0):
    """Create a regression problem suited to the Lasso (n_features >> n_samples).

    Args:
        n_samples: number of generated samples.
        n_features: number of generated features (kept much larger than
            ``n_samples`` by default so the Lasso is appropriate).
        random_state: seed for reproducible data generation.

    Returns:
        tuple: ``(X, X_sp, y)`` where ``X`` is the dense design matrix,
        ``X_sp`` is the same data as a SciPy COO sparse matrix, and
        ``y`` is the target vector.
    """
    X, y = make_regression(
        n_samples=n_samples, n_features=n_features, random_state=random_state
    )

    # Keep a sparse copy of the identical data so both code paths
    # (dense and sparse) can be benchmarked on the same problem.
    X_sp = sparse.coo_matrix(X)
    return X, X_sp, y
|
|
|
def compare_lasso_dense():
    """Fit a Lasso on the same (dense) data in dense and sparse format.

    Times both fits and verifies that the two code paths learn the same
    model by measuring the Euclidean distance between their coefficient
    vectors. Reads the module-level globals ``X``, ``X_sp`` and ``y``.

    Returns:
        str: human-readable report with both timings and the distance.
    """
    # A single alpha is used for BOTH estimators: with different alphas the
    # two models would legitimately learn different coefficients and the
    # distance check below would be meaningless.
    alpha = 1
    sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)
    dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)

    t0 = time()
    sparse_lasso.fit(X_sp, y)
    elapse1 = time() - t0

    t1 = time()
    dense_lasso.fit(X, y)
    elapse2 = time() - t1

    # Same data, same alpha -> the distance should be ~0 (numerical noise).
    coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)

    return (
        f"Sparse Lasso done in {elapse1:.3f}s\t\n"
        + f"Dense Lasso done in {elapse2:.3f}s\t\n"
        + f"Distance between coefficients : {coeff_diff:.2e}\t\n"
    )
|
|
|
def compare_lasso_sparse():
    """Sparsify the data, then fit a Lasso on dense and sparse versions.

    Makes the problem genuinely sparse by zeroing out all small entries,
    then repeats the dense-vs-sparse timing comparison. Because the data
    is now mostly zeros, the sparse code path is expected to be faster.
    Reads the module-level globals ``X`` and ``y``.

    Returns:
        str: human-readable report with the matrix density, both timings
        and the distance between the learned coefficient vectors.
    """
    # Replace small values with 0 so the matrix is truly sparse.
    Xs = X.copy()
    Xs[Xs < 2.5] = 0.0

    # CSC is the sparse format the coordinate-descent solver works with.
    Xs_sp = sparse.coo_matrix(Xs).tocsc()

    # Compute the density once and reuse it for both the log and the report.
    matrix_density = Xs_sp.nnz / float(X.size) * 100
    print(f"Matrix density : {matrix_density:.3f}%")

    # A single alpha for BOTH estimators so the coefficient comparison
    # below is meaningful (different alphas yield different models).
    alpha = 1
    sparse_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)
    dense_lasso = Lasso(alpha=alpha, fit_intercept=False, max_iter=1000)

    t0 = time()
    sparse_lasso.fit(Xs_sp, y)
    elapses1 = time() - t0
    # Print the same value that is returned, so log and UI agree.
    print(f"Sparse Lasso done in {elapses1:.3f}s")

    t1 = time()
    dense_lasso.fit(Xs, y)
    elapses2 = time() - t1
    print(f"Dense Lasso done in {elapses2:.3f}s")

    # Same data, same alpha -> the distance should be ~0 (numerical noise).
    coeff_diff = linalg.norm(sparse_lasso.coef_ - dense_lasso.coef_)
    print(f"Distance between coefficients : {coeff_diff:.2e}")

    return (
        f"Matrix density : {matrix_density:.3f}%\t\n"
        + f"Sparse Lasso done in {elapses1:.3f}s\t\n"
        + f"Dense Lasso done in {elapses2:.3f}s\t\n"
        + f"Distance between coefficients : {coeff_diff:.2e}\t\n"
    )
|
|
|
|
|
# Build the dataset once at import time; the Gradio callbacks read these
# module-level globals (X: dense matrix, X_sp: sparse copy, y: targets).
X,X_sp,y = load_dataset()
|
|
|
|
|
|
|
|
|
|
|
# --- UI copy rendered as Markdown in the Gradio interface ---

title = " Lasso on Dense and Sparse data "

info = '''**Comparing the two Lasso implementations on Dense data**

We create a linear regression problem that is suitable for the Lasso, that is to say, with more features than samples.

We then store the data matrix in both dense (the usual) and sparse format, and train a Lasso on each. We compute the

runtime of both and check that they learned the same model by

computing the Euclidean norm of the difference between the coefficients they learned.

Because the data is dense, we expect better runtime with a dense data format.

'''

info2='''***Comparing the two Lasso implementations on Sparse data***

We make the previous problem sparse by replacing all small values with 0

and run the same comparisons as above. Because the data is now sparse,

we expect the implementation that uses the sparse data format to be faster.

'''

# The stray trailing "**" in the original ("improved**.") produced broken
# Markdown; it is removed here so the bold markers are balanced.
conclusion = '''**Conclusion**

We show that linear_model.Lasso provides the same results for dense and sparse data and that in the case of sparse data the speed is improved.

'''
|
with gr.Blocks() as demo:
    # Page header and introductory explanation.
    gr.Markdown(f"# {title}")
    gr.Markdown(info)

    # Dense-data comparison: clicking the button runs the fit and writes
    # the resulting report into the textbox.
    dense_box = gr.Textbox(value="", label="Dense Lasso comparison")
    dense_btn = gr.Button(value="Dense Lasso comparison")
    dense_btn.click(compare_lasso_dense, outputs=[dense_box])

    gr.Markdown(info2)

    # Sparse-data comparison, wired the same way.
    sparse_box = gr.Textbox(value="", label="Sparse Lasso comparison")
    sparse_btn = gr.Button(value="Sparse Lasso comparison")
    sparse_btn.click(compare_lasso_sparse, outputs=[sparse_box])

    gr.Markdown(conclusion)
|
|
|
|
|
# Launch the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":

    demo.launch()
|
|