File size: 4,672 Bytes
4bb7f61
 
 
e43c159
4bb7f61
1f1c873
 
 
 
 
 
 
4bb7f61
be7919b
 
4bb7f61
 
 
 
e04e4a9
4bb7f61
 
 
 
 
 
 
 
 
 
 
 
0120ffb
 
4bb7f61
 
 
 
 
e25275e
 
 
 
 
 
0c84a8d
e25275e
 
6d3dcaa
 
26648f7
e25275e
fce03ed
 
1dd8382
798b1e0
26648f7
fce03ed
e25275e
 
4bb7f61
 
 
 
e25275e
30210d3
4bb7f61
 
 
 
 
 
ae4476a
 
29235ac
ae4476a
 
 
29235ac
 
 
 
 
9790fd3
29235ac
 
 
 
ae4476a
 
 
29235ac
 
ae4476a
29235ac
 
 
 
 
 
 
 
 
ae4476a
 
 
 
 
2c4e31b
8cd5122
bbbe59d
ab32001
4bb7f61
165f08b
 
 
 
 
 
 
 
 
 
c36b21a
165f08b
c9896ad
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from fpdf import FPDF

# User interface: browser tab title, tab icon and wide page layout.
st.set_page_config(
    layout="wide",
    page_icon="📝",
    page_title="Traduction d'une phrase en pictogrammes ARASAAC",
)

# Load the model and the tokenizer.
# Alternative checkpoint: "Propicto/t2p-t5-large-orfeo"
checkpoint = "Propicto/t2p-nllb-200-distilled-600M-all"


@st.cache_resource
def load_model_and_tokenizer(checkpoint_name):
    """Load the tokenizer and the seq2seq model for ``checkpoint_name``.

    Streamlit re-executes this script on every user interaction. Caching the
    loaded objects as a resource keeps a single copy per checkpoint name
    instead of reloading them on each rerun.

    Args:
        checkpoint_name: identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
    return loaded_tokenizer, loaded_model


tokenizer, model = load_model_and_tokenizer(checkpoint)

# Read the lexicon
@st.cache_data
def read_lexicon(lexicon):
    """Load the tab-separated lexicon and add a ``keyword_no_cat`` column.

    ``keyword_no_cat`` is the ``lemma`` column with everything from the first
    " #" onwards removed, surrounding whitespace stripped, and remaining
    spaces replaced by underscores.

    Args:
        lexicon: path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The DataFrame with the extra ``keyword_no_cat`` column.
    """
    table = pd.read_csv(lexicon, sep='\t')
    # Keep only the part of the lemma that precedes the " #" marker.
    before_marker = table['lemma'].str.split(' #').str[0]
    trimmed = before_marker.str.strip()
    table['keyword_no_cat'] = trimmed.str.replace(' ', '_')
    return table


lexicon = read_lexicon("lexicon.csv")

# Post-processing of the translation output
def process_output_trad(pred):
    """Split the predicted string into its whitespace-separated tokens.

    Args:
        pred: decoded model prediction.

    Returns:
        List of tokens; empty when ``pred`` is empty or only whitespace.
    """
    tokens = pred.split()
    return tokens

def get_id_picto_from_predicted_lemma(df_lexicon, lemma):
    """Look up the pictogram id of a predicted lemma in the lexicon.

    A single trailing "!" is removed from the lemma before the lookup.

    Args:
        df_lexicon: DataFrame with ``keyword_no_cat`` and ``id_picto`` columns.
        lemma: predicted lemma.

    Returns:
        ``(id_picto, lemma)`` using the first matching row, or ``(0, lemma)``
        when no row matches. The returned lemma is the one without the
        trailing "!".
    """
    key = lemma[:-1] if lemma.endswith("!") else lemma
    is_match = df_lexicon['keyword_no_cat'] == key
    matching_ids = df_lexicon.loc[is_match, 'id_picto'].tolist()
    if not matching_ids:
        # 0 is the "no pictogram" marker the rendering functions skip.
        return (0, key)
    return (matching_ids[0], key)

# Build the HTML content that displays the pictograms
def generate_html(ids):
    """Return an HTML document showing one captioned figure per pictogram.

    Args:
        ids: iterable of ``(picto_id, lemma)`` pairs. Pairs whose ``picto_id``
            is 0 are skipped.

    Returns:
        The HTML document as a string. Lemmas are HTML-escaped before being
        placed in the ``alt`` attribute and the caption, so characters such
        as ``<``, ``&`` or ``"`` cannot break the markup.
    """
    # Imported locally so the file's import header does not need to change.
    from html import escape

    parts = [
        '<html><head><style>',
        '''
        figure {
            display: inline-block;
            text-align: center;
            font-family: Arial, sans-serif;
            margin: 0;
        }
        figcaption {
            color: black;
            background-color: white;
            border-radius: 5px;
        }
        img {
            background-color: white;
            margin: 0;
            padding: 0;
            border-radius: 6px;
        }
    ''',
        '</style></head><body>',
    ]
    for picto_id, lemma in ids:
        if picto_id == 0:  # ignore invalid IDs
            continue
        img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
        safe_lemma = escape(str(lemma))
        parts.append(f'''
            <figure>
                <img src="{img_url}" alt="{safe_lemma}" width="100" height="100"/>
                <figcaption>{safe_lemma}</figcaption>
            </figure>
            ''')
    parts.append('</body></html>')
    return ''.join(parts)


def generate_pdf(ids):
    """Write the pictograms and their captions to a landscape A4 PDF.

    Pictograms are laid out on a grid, left to right and top to bottom. A new
    page is started when the next row (image plus caption) would not fit on
    the current one.

    Args:
        ids: iterable of ``(picto_id, lemma)`` pairs. Pairs whose ``picto_id``
            is 0 are skipped.

    Returns:
        Path of the written file (``"pictograms.pdf"``).
    """
    pdf = FPDF(orientation='L', unit='mm', format='A4')  # 'L' for landscape orientation
    pdf.add_page()
    # Every image and caption is placed at explicit coordinates, so page
    # breaks are decided per grid row below. An automatic break triggered by
    # a caption cell would put the caption on a different page than its image.
    pdf.set_auto_page_break(auto=False)
    pdf.set_font("Arial", size=12)

    # Grid geometry, in millimetres
    x_start = 10
    y_start = 10
    img_width = 50
    img_height = 50
    spacing = 1
    caption_offset = 5   # gap between the image and its caption
    caption_height = 10
    max_width = 297   # A4 landscape width in mm
    max_height = 210  # A4 landscape height in mm
    # Vertical space one picture plus its caption occupies.
    row_height = img_height + caption_offset + caption_height
    current_x = x_start
    current_y = y_start

    for picto_id, lemma in ids:
        if picto_id == 0:  # ignore invalid IDs
            continue

        # Move to the next row if the image would exceed the page width
        if current_x + img_width > max_width:
            current_x = x_start
            current_y += row_height + spacing
        # Start a new page if the row would exceed the page height
        if current_y + row_height > max_height:
            pdf.add_page()
            current_x = x_start
            current_y = y_start

        # NOTE(review): the built-in Arial font is expected to accept only
        # Latin-1 text - confirm against the installed FPDF version. The French
        # "oe" ligature is spelled out, and any other unsupported character
        # becomes "?" instead of aborting the export.
        caption = str(lemma).replace("œ", "oe").replace("Œ", "OE")
        caption = caption.encode("latin-1", "replace").decode("latin-1")

        img_url = f"https://static.arasaac.org/pictograms/{picto_id}/{picto_id}_500.png"
        pdf.image(img_url, x=current_x, y=current_y, w=img_width, h=img_height)
        pdf.set_xy(current_x, current_y + img_height + caption_offset)
        pdf.cell(img_width, caption_height, txt=caption, ln=1, align='C')

        current_x += img_width + spacing

    # NOTE(review): the file name is fixed, so concurrent sessions write to
    # the same file - consider a per-session path if that becomes a problem.
    pdf_path = "pictograms.pdf"
    pdf.output(pdf_path)
    return pdf_path


st.title("Traduction d'une phrase en pictogrammes ARASAAC")

st.info("Text-to-Pictograms traduit une phrase en français en pictogrammes ARASAAC. Renseignez une phrase, puis validez. Vous pouvez sauvegarder la traduction au format PDF en cliquant sur le bouton en bas de page.", icon='ℹ️')
pictogram_ids = []
sentence = st.text_input("Entrez une phrase en français:")
if sentence:
    with st.spinner("Affichage des pictogrammes..."):
        # Streamlit reruns the whole script on every widget interaction,
        # including a click on the download button below. Generation samples
        # (do_sample=True), so re-running it would replace the translation on
        # screen with a different one. The prediction is therefore kept in the
        # session and only recomputed when the sentence changes.
        cached = st.session_state.get("last_translation")
        if cached is None or cached[0] != sentence:
            inputs = tokenizer(sentence, return_tensors="pt").input_ids
            outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
            cached = (sentence, tokenizer.decode(outputs[0], skip_special_tokens=True))
            st.session_state["last_translation"] = cached
        pred = cached[1]

        # Map every predicted lemma to its (pictogram id, lemma) pair.
        sentence_to_map = process_output_trad(pred)
        pictogram_ids = [get_id_picto_from_predicted_lemma(lexicon, lemma) for lemma in sentence_to_map]

        html = generate_html(pictogram_ids)
        st.components.v1.html(html, height=200, scrolling=True)

if pictogram_ids:
    # Build the PDF for the current translation and offer it for download.
    pdf_path = generate_pdf(pictogram_ids)
    with open(pdf_path, "rb") as pdf_file:
        st.download_button(label="Télécharger la traduction en PDF", data=pdf_file, file_name="pictograms.pdf", mime="application/pdf")