File size: 2,757 Bytes
d14409f
9eda247
d14409f
 
 
 
 
3037d70
33901fb
5c9ffe3
 
 
 
 
 
 
 
 
 
3037d70
 
 
 
 
 
d14409f
3037d70
 
d14409f
3037d70
 
9eda247
d14409f
3037d70
 
d14409f
 
33901fb
9eda247
d14409f
 
 
0b65d75
3037d70
ea34410
 
 
 
 
 
 
 
d14409f
a1b3563
d14409f
3037d70
d14409f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
from hazm import Normalizer, word_tokenize, Lemmatizer, Chunker

# Shared Hazm components, built once at import time and reused by every request.
_CHUNKER_MODEL_PATH = 'resources/chunker.model'  # trained chunker model file

chunker = Chunker(model=_CHUNKER_MODEL_PATH)
lemmatizer = Lemmatizer()

# Model file for the POS tagger that feeds the "chunk" operation.
# NOTE(review): this path assumes the tagger model ships next to
# 'resources/chunker.model' under Hazm's usual file name -- confirm on deploy.
_POS_TAGGER_MODEL_PATH = 'resources/pos_tagger.model'

# Created on first use so that a missing tagger model can only break the
# "chunk" operation, never the import of this module.
_pos_tagger = None


def _get_pos_tagger():
    """Return the shared Hazm POSTagger, loading its model on the first call."""
    global _pos_tagger
    if _pos_tagger is None:
        from hazm import POSTagger  # local import: only "chunk" needs it

        _pos_tagger = POSTagger(model=_POS_TAGGER_MODEL_PATH)
    return _pos_tagger


def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
    """Run a single Hazm operation on ``text`` and return the result as a string.

    Args:
        text: Input text. Empty input short-circuits to an empty result.
        operation: One of "normalize", "tokenize", "lemmatize" or "chunk".
            Any other value (including ``None``) produces an empty string.
        correct_spacing, remove_diacritics, remove_specials_chars,
        decrease_repeated_chars, persian_style, persian_numbers,
        unicodes_replacement, seperate_mi: Flags forwarded unchanged to the
            same-named keyword arguments of ``Normalizer``. They only affect
            the "normalize" operation; the other operations work on the raw
            input text.

    Returns:
        The normalized text, the space-joined tokens, the space-joined
        lemmas, or the string form of the chunk tree, depending on
        ``operation``.
    """
    if not text:
        return ""

    if operation == "normalize":
        # Built only here: no other operation uses the normalizer.
        normalizer = Normalizer(
            correct_spacing=correct_spacing,
            remove_diacritics=remove_diacritics,
            remove_specials_chars=remove_specials_chars,
            decrease_repeated_chars=decrease_repeated_chars,
            persian_style=persian_style,
            persian_numbers=persian_numbers,
            unicodes_replacement=unicodes_replacement,
            seperate_mi=seperate_mi,
        )
        return normalizer.normalize(text)

    if operation not in ("tokenize", "lemmatize", "chunk"):
        return ""

    tokens = word_tokenize(text)

    if operation == "tokenize":
        return " ".join(tokens)  # Show tokens as a space-separated string

    if operation == "lemmatize":
        # Show lemmas as a space-separated string
        return " ".join(lemmatizer.lemmatize(token) for token in tokens)

    # operation == "chunk"
    if not tokens:
        return ""
    # Per the Hazm documentation the chunker consumes POS-tagged
    # (word, tag) pairs, so the bare tokens are tagged first.
    tagged_tokens = _get_pos_tagger().tag(tokens)
    return str(chunker.parse(tagged_tokens))  # Show chunks as text

# Operations offered in the UI; each name matches a branch in process_text.
operations = ['normalize', 'tokenize', 'lemmatize', 'chunk']

# Captions of the normalization checkboxes, listed in the same order as the
# normalization parameters of process_text.
_option_labels = [
    "Correct Spacing",
    "Remove Diacritics",
    "Remove Special Characters",
    "Decrease Repeated Characters",
    "Persian Style",
    "Persian Numbers",
    "Unicodes Replacement",
    "Separate 'می'",
]

# Input widgets: the text box, the operation selector (one operation at a
# time), then one pre-ticked checkbox per normalization option.
_input_widgets = [
    gr.Textbox(lines=10, label="Input Text"),
    gr.Radio(operations, label="Select Operation", type="value"),
]
_input_widgets.extend(
    gr.Checkbox(value=True, label=caption, interactive=True)
    for caption in _option_labels
)

# Read-only result box with a copy button.
_output_widget = gr.Textbox(
    label="Processed Text",
    interactive=False,
    lines=10,
    show_copy_button=True,
    show_label=True,
)

iface = gr.Interface(
    fn=process_text,
    inputs=_input_widgets,
    outputs=_output_widget,
    title="Persian Text Processor with Hazm",
    description="Select an operation and normalization parameters to process the input text using Hazm.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()