{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"},"kaggle":{"accelerator":"none","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# Install necessary libraries\n!pip install transformers pandas datasets accelerate","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Additional installations for PyTorch and CUDA\n!pip install torch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import re\nimport pandas as pd\nimport torch\nfrom datasets import Dataset\nfrom transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup\nfrom accelerate import Accelerator","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Text cleaning functions\ndef fix_text(text):\n text = text.replace('&', '&')\n text = text.replace('<', '<')\n text = text.replace('>', '>')\n return text\n\ndef clean_tweet(tweet, allow_new_lines=False):\n bad_start = ['http:', 'https:']\n for w in bad_start:\n tweet = re.sub(f\" {w}\\\\S+\", \"\", tweet) # removes white space before url\n tweet = re.sub(f\"{w}\\\\S+ \", \"\", tweet) # in case a tweet starts with a url\n tweet = re.sub(f\"\\n{w}\\\\S+ \", \"\", tweet) # in case the url is on a new line\n tweet = re.sub(f\"\\n{w}\\\\S+\", \"\", tweet) # in case the url is alone on a new line\n tweet = re.sub(f\"{w}\\\\S+\", \"\", tweet) # any other case?\n tweet = re.sub(' +', ' ', tweet) # replace multiple spaces with one space\n if not allow_new_lines: # remove new lines\n tweet = ' '.join(tweet.split())\n return tweet.strip()\n\ndef boring_tweet(tweet):\n \"Check if this is a boring tweet\"\n boring_stuff = ['http', '@', '#']\n not_boring_words = len([None for w in tweet.split() if all(bs not in w.lower() for bs in boring_stuff)])\n return not_boring_words < 3","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Load and filter the dataset for a specified party\ndef load_and_filter_data(party):\n curated_tweets = pd.read_csv('/kaggle/input/curated-tweets/curated_tweets.csv')\n data = curated_tweets[curated_tweets.Partei == party][['text']].astype(str)\n return data","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Initialize tokenizer\ndef initialize_tokenizer():\n tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='', eos_token='', pad_token='')\n return tokenizer","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Prepare dataset for training\ndef prepare_dataset(data, tokenizer):\n training_examples = f' ' + data['text'] + ''\n task_df = pd.DataFrame({'text': training_examples})\n tweet_data = Dataset.from_pandas(task_df)\n\n def preprocess(example):\n return tokenizer(example['text'], truncation=True)\n\n tweet_data = tweet_data.map(preprocess, batched=False)\n tweet_data = tweet_data.train_test_split(train_size=.8)\n return 
tweet_data","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Initialize model and related components\ndef initialize_model_and_components(tokenizer):\n model = GPT2LMHeadModel.from_pretrained('gpt2')\n model.resize_token_embeddings(len(tokenizer))\n data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n return model, data_collator","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Set training arguments\ndef set_training_arguments():\n training_args = TrainingArguments(\n output_dir=\"/kaggle/working/tweets\",\n overwrite_output_dir=True,\n num_train_epochs=3,\n per_device_train_batch_size=6,\n per_device_eval_batch_size=6,\n load_best_model_at_end=True,\n log_level='info',\n evaluation_strategy='epoch',\n save_strategy='epoch',\n learning_rate=2e-4,\n warmup_steps=1e2,\n seed=38,\n report_to=\"none\",\n )\n return training_args","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Train and evaluate the model\ndef train_and_evaluate_model(model, training_args, tweet_data, data_collator):\n optimizer = AdamW(model.parameters(), lr=2e-4, eps=1e-8)\n total_steps = len(tweet_data[\"train\"]) * training_args.num_train_epochs\n scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1e2, num_training_steps=total_steps)\n\n trainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=tweet_data[\"train\"],\n eval_dataset=tweet_data[\"test\"],\n data_collator=data_collator,\n optimizers=(optimizer, scheduler),\n )\n trainer.train()\n trainer.evaluate()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Generate text using the fine-tuned model\ndef generate_text(model, tokenizer, prompt):\n device = torch.device(\"cuda\")\n model.eval()\n generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)\n\n sample_outputs = model.generate(\n generated,\n do_sample=True,\n top_k=20,\n max_length=70,\n top_p=0.98,\n num_return_sequences=10,\n temperature=0.95\n )\n\n for i, sample_output in enumerate(sample_outputs):\n print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Save the fine-tuned model\ndef save_model(model, tokenizer, output_dir):\n model.save_pretrained(output_dir)\n tokenizer.save_pretrained(output_dir)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Main function to execute the workflow\ndef main(party):\n data = load_and_filter_data(party)\n tokenizer = initialize_tokenizer()\n tweet_data = prepare_dataset(data, tokenizer)\n model, data_collator = initialize_model_and_components(tokenizer)\n training_args = set_training_arguments()\n train_and_evaluate_model(model, training_args, tweet_data, data_collator)\n\n # Generate some example text\n prompt = \"Die Deutsche Kultur\"\n generate_text(model, tokenizer, prompt)\n\n # Save the model\n save_model(model, tokenizer, \"/kaggle/working/{}_gpt2-finetuned\".format(party))","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Train your desired model\nparty = \"Die Linke\" # Parties available for training: AfD, FDP, Fraktionslos, SPD, Bündnis 90/Die Grünen, CDU, CSU, Die Linke\nmain(party)","metadata":{},"execution_count":null,"outputs":[]}]}