Create app.py
app.py ADDED
@@ -0,0 +1,192 @@
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fastapi.middleware.cors import CORSMiddleware
import torch
import os
import yaml
import transformers

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust this as needed
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("EzekielMW/Eksl_dataset")
model = AutoModelForSeq2SeqLM.from_pretrained("EzekielMW/Eksl_dataset")

# Where should output files be stored locally
drive_folder = "./serverlogs"

if not os.path.exists(drive_folder):
    os.makedirs(drive_folder)


# Large batch sizes generally give good results for translation
effective_train_batch_size = 480
train_batch_size = 6
eval_batch_size = train_batch_size

gradient_accumulation_steps = int(effective_train_batch_size / train_batch_size)
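# With the values above this works out to 480 / 6 = 80 accumulation steps,
# so each optimiser step sees an effective batch of 480 examples.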

# Everything in one yaml string, so that it can all be logged.
yaml_config = '''
training_args:
    output_dir: "{drive_folder}"
    eval_strategy: steps
    eval_steps: 100
    save_steps: 100
    gradient_accumulation_steps: {gradient_accumulation_steps}
    learning_rate: 3.0e-4  # Include decimal point to parse as float
    # optim: adafactor
    per_device_train_batch_size: {train_batch_size}
    per_device_eval_batch_size: {eval_batch_size}
    weight_decay: 0.01
    save_total_limit: 3
    max_steps: 500
    predict_with_generate: True
    fp16: True
    logging_dir: "{drive_folder}"
    load_best_model_at_end: True
    metric_for_best_model: loss
    seed: 123
    push_to_hub: False

max_input_length: 128
eval_pretrained_model: False
early_stopping_patience: 4
data_dir: .

# Use a 600M parameter model here, which is easier to train on a free Colab
# instance. Bigger models work better, however: results will be improved
# if you are able to train on nllb-200-1.3B instead.
model_checkpoint: facebook/nllb-200-distilled-600M

datasets:
    train:
        huggingface_load:
            # We will load two datasets here: English/KSL Gloss, and also SALT
            # Swahili/English, so that we can try out multi-way translation.

            - path: EzekielMW/Eksl_dataset
              split: train[:-1000]
            - path: sunbird/salt
              name: text-all
              split: train
        source:
            # This is a text translation only, no audio.
            type: text
            # The source text can be any of English, KSL or Swahili.
            language: [eng,ksl,swa]
            preprocessing:
                # The models are case sensitive, so if the training text is all
                # capitals, then it will only learn to translate capital letters and
                # won't understand lower case. Make everything lower case for now.
                - lower_case
                # We can also augment the spelling of the input text, which makes the
                # model more robust to spelling errors.
                - augment_characters
        target:
            type: text
            # The target text can be any of English, KSL or Swahili.
            language: [eng,ksl,swa]
            # The models are case sensitive: make everything lower case for now.
            preprocessing:
                - lower_case

        shuffle: True
        allow_same_src_and_tgt_language: False

    validation:
        huggingface_load:
            # Use the last 1000 of the KSL examples for validation.
            - path: EzekielMW/Eksl_dataset
              split: train[-1000:]
            # Add some Swahili validation text.
            - path: sunbird/salt
              name: text-all
              split: dev
        source:
            type: text
            language: [swa,ksl,eng]
            preprocessing:
                - lower_case
        target:
            type: text
            language: [swa,ksl,eng]
            preprocessing:
                - lower_case
        allow_same_src_and_tgt_language: False
'''

yaml_config = yaml_config.format(
    drive_folder=drive_folder,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
)

config = yaml.safe_load(yaml_config)

training_settings = transformers.Seq2SeqTrainingArguments(
    **config["training_args"])
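# Note: training_settings is built from the config above but is not used by the
# inference endpoint below, which only needs the tokenizer and model.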
# The pre-trained model that we use has support for some African languages, but
# we need to adapt the tokenizer to languages that it wasn't trained with,
# such as KSL. Here we reuse the token from a different language.
LANGUAGE_CODES = ["eng", "swa", "ksl"]

code_mapping = {
    # Exact/close mapping
    'eng': 'eng_Latn',
    'swa': 'swh_Latn',
    # Random mapping
    'ksl': 'ace_Latn',
}
tokenizer = transformers.NllbTokenizer.from_pretrained(
    config['model_checkpoint'],
    src_lang='eng_Latn',
    tgt_lang='eng_Latn')

offset = tokenizer.sp_model_size + tokenizer.fairseq_offset

for code in LANGUAGE_CODES:
    i = tokenizer.convert_tokens_to_ids(code_mapping[code])
    tokenizer._added_tokens_encoder[code] = i

# Define a translation function
def translate(text, source_language, target_language):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text.lower(), return_tensors="pt").to(device)
    inputs['input_ids'][0][0] = tokenizer.convert_tokens_to_ids(source_language)
    translated_tokens = model.to(device).generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language),
        max_length=100,
        num_beams=5,
    )
    result = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

    if target_language == 'ksl':
        result = result.upper()

    return result
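# Illustrative call: translate("Good morning", "eng", "ksl") lower-cases the
# input, decodes with beam search, and returns the output upper-cased because
# the target language is 'ksl'.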

@app.post("/translate")
async def translate_text(request: Request):
    data = await request.json()
    text = data.get("text")
    source_language = data.get("source_language")
    target_language = data.get("target_language")

    translation = translate(text, source_language, target_language)
    return {"translation": translation}
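A minimal client sketch for the /translate endpoint above (illustrative only: the base URL is an assumption; locally the app would typically be served with uvicorn, for example "uvicorn app:app --host 0.0.0.0 --port 7860", while a deployed Space exposes its own URL):

import requests

# Hypothetical base URL; replace with the actual server address.
BASE_URL = "http://localhost:7860"

response = requests.post(
    f"{BASE_URL}/translate",
    json={
        "text": "Good morning",
        "source_language": "eng",
        "target_language": "ksl",
    },
)
print(response.json()["translation"])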