spidder / preprocessing.py
khulnasoft's picture
Update preprocessing.py
3b7d5e3 verified
raw
history blame contribute delete
432 Bytes
import re
def preprocess_text(text: str) -> str:
    """Replace punctuation and whitespace runs in *text* with single spaces.

    Every maximal run of non-word characters (anything other than letters,
    digits and the underscore) is replaced by one space, and leading and
    trailing spaces are then stripped.  Digits and underscores are word
    characters for ``\\W`` and are therefore kept.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if *text* contains no word characters.

    >>> preprocess_text("Hello, World! 123")
    'Hello World 123'
    """
    # Whitespace is itself matched by \W, so a single ``\W+`` substitution
    # both replaces special characters and collapses repeated whitespace.
    return re.sub(r'\W+', ' ', text).strip()
# Read the raw input one line per record.  The encoding is given explicitly
# so the result does not depend on the platform's default locale.
with open("data.csv", "r", encoding="utf-8") as infile:
    data = infile.readlines()

# Clean each line independently.  NOTE: commas are non-word characters, so
# the fields of a CSV row end up merged into one space-separated string.
cleaned_data = [preprocess_text(line) for line in data]

# Save the cleaned data, one entry per output line (a line with no word
# characters is written as an empty line).
with open("cleaned_data.txt", "w", encoding="utf-8") as outfile:
    outfile.writelines(entry + "\n" for entry in cleaned_data)