|
import csv
import hashlib
import io
import os
|
|
|
|
|
def hash_md5(filepath):
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is opened in binary mode and consumed in chunks of
    ``128 * block_size`` bytes, so it is never loaded into memory whole.

    Args:
        filepath: Path of the file to hash; passed directly to ``open``.

    Returns:
        The hex digest string on success. If any exception is raised while
        opening or reading the file, the exception's message (``str(e)``)
        is returned instead of a digest.
    """
    md5_hash = hashlib.md5()
    # Loop-invariant read size, computed once instead of on every read.
    chunk_size = 128 * md5_hash.block_size
    try:
        with open(filepath, "rb") as file:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: file.read(chunk_size), b""):
                md5_hash.update(chunk)
        return md5_hash.hexdigest()
    except Exception as e:
        # NOTE(review): the error text is returned in place of a digest, so
        # callers cannot tell failure from success by type alone. Kept as-is
        # to preserve the existing contract.
        return str(e)
|
# Default CSV file name. None of the functions visible here read this
# constant; they all receive the CSV path as an argument.
CSV_FILE_PATH = "audio_plus_hash_uniq_07102024.csv"
|
|
|
def update_csv_with_files(
    csv_file_path, audio_old_path, audio_16000_path, new_transcription
):
    """Hash two audio files and record the result in the CSV.

    Computes the MD5 of both files with ``hash_md5`` and delegates to
    ``update_csv``: the hash of ``audio_old_path`` is the search key, the
    hash of ``audio_16000_path`` becomes the ``hash_16000`` value, and
    ``audio_old_path`` is the file path used if a new row is appended.

    Args:
        csv_file_path: Path of the CSV file to update.
        audio_old_path: Path of the first audio file; its hash is the
            lookup key.
        audio_16000_path: Path of the second audio file; its hash is
            stored in the ``hash_16000`` column.
        new_transcription: Text stored in the ``transcription`` column.
    """
    # NOTE(review): hash_md5 returns the error message instead of raising
    # when a file cannot be read; that text is forwarded to update_csv
    # unchecked and would be stored as if it were a hash.
    hash_old = hash_md5(audio_old_path)
    hash_new = hash_md5(audio_16000_path)
    update_csv(
        csv_file_path, hash_old, hash_new, audio_old_path, new_transcription
    )
|
|
|
def update_csv(csv_file_path, search_hash, hash_16000, new_path, new_transcription):
    """Update the CSV row matching ``search_hash``, or append a new row.

    The first row whose ``hash`` or ``hash_16000`` column equals
    ``search_hash`` gets its ``hash_16000`` and ``transcription`` columns
    overwritten, and the whole file is rewritten. If no row matches, one
    row is appended with ``hash``, ``hash_16000``, ``filepath`` and
    ``transcription`` set from the arguments.

    Args:
        csv_file_path: Path of an existing CSV file with a header row.
        search_hash: Value compared against the ``hash`` and
            ``hash_16000`` columns; also stored as ``hash`` when a row is
            appended.
        hash_16000: Value stored in the ``hash_16000`` column.
        new_path: Value stored in the ``filepath`` column; only used when
            a row is appended.
        new_transcription: Value stored in the ``transcription`` column.

    Raises:
        KeyError: If a row lacks the ``hash`` or ``hash_16000`` column.
    """
    # Read everything and close the file before any writing starts.
    with open(csv_file_path, mode="r", newline="", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)

    for row in rows:
        if row["hash"] == search_hash or row["hash_16000"] == search_hash:
            row["hash_16000"] = hash_16000
            row["transcription"] = new_transcription
            break
    else:
        # No existing row matched: append a single new row and stop.
        with open(csv_file_path, mode="a", newline="", encoding="utf-8") as append_file:
            writer = csv.DictWriter(append_file, fieldnames=fieldnames)
            writer.writerow({
                "hash": search_hash,
                "hash_16000": hash_16000,
                "filepath": new_path,
                "transcription": new_transcription,
            })
        return

    # Serialize in memory first so a serialization error cannot leave the
    # file half-written.
    buffer = io.StringIO(newline="")
    writer = csv.DictWriter(buffer, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

    # Mode "w" truncates the file. The previous in-place rewrite
    # ("r+" and seek(0)) left stale trailing bytes whenever the new
    # content was shorter than the old.
    with open(csv_file_path, mode="w", newline="", encoding="utf-8") as file:
        file.write(buffer.getvalue())
|
|