File size: 3,412 Bytes
5a43ec1 07df5e5 5a43ec1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import re
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
from transformers import AutoTokenizer
# Hugging Face Hub checkpoint shared by the tokenizer and the NER pipeline.
model_checkpoint = "Pclanglais/French-TV-transcript-NER"

# Standalone tokenizer, used elsewhere in this file to count tokens.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Token-classification pipeline; "simple" aggregation groups sub-word tokens
# into entity spans.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)
# Single whitespace character, used to find word boundaries when cutting text.
_WHITESPACE_RE = re.compile(r"\s")


def _find_split_point(text):
    """Return a character index near the middle of *text* at which to cut it."""
    middle = len(text) // 2
    # Preferred cut: the first whitespace at or after the midpoint.
    found = _WHITESPACE_RE.search(text, middle)
    if found:
        return found.start()
    # Otherwise the closest whitespace before the midpoint. Index 0 is
    # skipped because cutting there would leave an empty left half.
    for index in range(middle - 1, 0, -1):
        if _WHITESPACE_RE.match(text, index):
            return index
    # No usable whitespace at all: fall back to a hard cut in the middle.
    return middle


def _split_oversized(text, max_tokens, count_tokens):
    """Halve *text* repeatedly until every piece fits within *max_tokens*."""
    pieces = []
    pending = [text]  # stack; the left half is always processed first
    while pending:
        piece = pending.pop()
        if count_tokens(piece) <= max_tokens:
            if piece:
                pieces.append(piece)
            continue
        cut = _find_split_point(piece)
        halves = [half for half in (piece[:cut].strip(), piece[cut:].strip()) if half]
        if any(len(half) >= len(piece) for half in halves):
            # The piece cannot be shortened any further (e.g. one character);
            # keep it as-is rather than looping forever.
            pieces.append(piece)
            continue
        # Push right before left so that pieces come out in reading order.
        pending.extend(reversed(halves))
    return pieces


def split_text(text, max_tokens=500, count_tokens=None):
    """Split *text* into chunks of at most *max_tokens* tokens each.

    Newline-separated parts are packed greedily: consecutive parts are joined
    with a newline for as long as the joined chunk stays within the limit.
    Any chunk that is still too long (a single over-long part) is then cut in
    half at a whitespace character near its midpoint, repeatedly, until every
    piece fits. Pieces produced by that cutting are stripped of surrounding
    whitespace.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens allowed per chunk.
        count_tokens: Optional callable mapping a string to its token count.
            Defaults to counting the tokens produced by the module-level
            ``tokenizer``.

    Returns:
        A list of non-empty chunks in their original order. A piece that
        cannot be shortened any further is returned even if it is over the
        limit.
    """
    if count_tokens is None:
        def count_tokens(piece):
            return len(tokenizer.tokenize(piece))

    packed = []
    current_chunk = ""
    for part in text.split("\n"):
        candidate = current_chunk + "\n" + part if current_chunk else part
        if count_tokens(candidate) <= max_tokens:
            current_chunk = candidate
            continue
        # The candidate is too long: close the current chunk and start a new
        # one with this part (which may itself be over the limit).
        if current_chunk:
            packed.append(current_chunk)
        current_chunk = part
    if current_chunk:
        packed.append(current_chunk)

    # Every over-long chunk is split further, not only a lone one.
    chunks = []
    for chunk in packed:
        chunks.extend(_split_oversized(chunk, max_tokens, count_tokens))
    return chunks
# Driver script: read the transcripts, cut over-long texts into chunks, run
# the NER pipeline on every chunk and write all detected entities to a TSV.

# Texts whose token count is above this limit are chunked before classification.
MAX_TOKENS = 500

complete_data = pd.read_parquet("[file with transcripts]")
print(complete_data)

# Parallel lists: one entry per text chunk sent to the classifier.
list_prompt = []  # chunk text
list_file = []    # "identifier" of the row the chunk came from
list_id = []      # 1-based number of the row the chunk came from

rows = zip(complete_data["corrected_text"], complete_data["identifier"])
for text_id, (raw_text, current_file) in enumerate(rows, start=1):
    # Newlines are replaced by a pilcrow marker before tokenizing, so the
    # text handed to split_text never contains "\n".
    # NOTE(review): presumably the model expects " ¶ " for line breaks - confirm.
    prompt = str(raw_text).replace("\n", " ¶ ")

    if len(tokenizer.tokenize(prompt)) > MAX_TOKENS:
        pieces = split_text(prompt, max_tokens=MAX_TOKENS)
    else:
        pieces = [prompt]

    for piece in pieces:
        list_file.append(current_file)
        list_prompt.append(piece)
        list_id.append(text_id)

batch_size = 4
# The pipeline yields one result per input text (not one per batch), so the
# progress bar total is the number of prompts.
full_classification = list(
    tqdm(token_classifier(list_prompt, batch_size=batch_size), total=len(list_prompt))
)

classified_list = []
for classification, identifier, chunk_text_id in zip(
    full_classification, list_file, list_id
):
    # A chunk with no detected entity gives an empty result and therefore a
    # frame without a "word" column; there is nothing to record for it.
    if not classification:
        continue
    try:
        df = pd.DataFrame(classification)
        # Turn the pilcrow marker back into a newline in the entity text.
        df['word'] = df['word'].replace(' ¶ ', ' \n ', regex=True)
    except (KeyError, ValueError) as error:
        # Stay best-effort on unexpected result shapes, but say so.
        print(f"Skipping classification for {identifier!r}: {error!r}")
        continue
    df["identifier"] = identifier
    df["text_id"] = chunk_text_id
    print(df)
    classified_list.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# entity was found in any chunk.
if classified_list:
    classified_list = pd.concat(classified_list)
else:
    classified_list = pd.DataFrame()

# Display the DataFrame
print(classified_list)
classified_list.to_csv("result_transcripts.tsv", sep="\t")
|