Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Starting on T4

Update ppt_chunker.py

5458cd4 verified about 2 months ago

No virus

1.67 kB

	from io import StringIO
	from multiprocessing import cpu_count
	from concurrent.futures import ProcessPoolExecutor

	import spacy
	from unstructured.partition.pptx import partition_pptx
	from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars

	from ordered_multimap import OrderedMultiIndexMapWeakRef

	def process_text(text_1, text_2):
	    """Append ``text_2`` to ``text_1`` when ``text_1`` ends on a dangling word.

	    ``text_1`` is tokenized with the module-level ``nlp`` callable. If the
	    part-of-speech tag of its last token is one of the function-word tags
	    listed below (adposition, particle, pronoun, determiner, auxiliary or a
	    conjunction), the fragment is treated as unfinished and ``text_2`` is
	    joined onto it with a single space.

	    NOTE(review): ``nlp`` is not defined in the visible part of this file;
	    it is presumably a loaded spaCy pipeline (the file imports ``spacy`` and
	    the tokens expose ``pos_``) -- confirm it is set before this is called.

	    Args:
	        text_1: The leading text fragment.
	        text_2: The fragment that follows ``text_1``.

	    Returns:
	        ``text_1 + ' ' + text_2`` when ``text_1`` ends on a dangling word,
	        otherwise ``text_1`` unchanged.
	    """
	    tokens = nlp(text_1)

	    # Empty or whitespace-only input can yield no tokens; indexing [-1]
	    # would then raise IndexError, so return the fragment untouched.
	    if len(tokens) == 0:
	        return text_1

	    dangling_tags = {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
	    # The original indexed an undefined name ``token`` here (NameError).
	    if tokens[-1].pos_ in dangling_tags:
	        return text_1 + ' ' + text_2
	    return text_1

	def ppt_chunk(file_like, model, from_page=0, to_page=None):
	    """Split a PPTX file into per-slide text chunks keyed to a section title.

	    The file is partitioned with ``partition_pptx``; every element's text is
	    stripped of a leading bullet and surrounding whitespace, then the
	    non-empty texts are grouped by the slide they came from. A slide that
	    carries exactly one text element is treated as a section-title slide:
	    its text becomes the current main title and no chunk is emitted for it.
	    Every other slide is inserted into the result map as
	    ``(slide text, current main title)``.

	    NOTE(review): the previous body referenced names that were never defined
	    (``texts``, ``clean``, ``text``, ``ppt``, ``from_page``, ``to_page``,
	    ``slide_text``) and so could not run. The slide pass is rebuilt on top of
	    the ``partition_pptx`` elements; the single-text-element rule mirrors the
	    earlier "exactly one text shape" rule -- confirm this matches the
	    intended chunking.

	    Args:
	        file_like: Binary file-like object holding the PPTX document.
	        model: Currently unused; kept so existing callers keep working.
	        from_page: Zero-based index of the first slide to include.
	        to_page: Zero-based index at which to stop (exclusive). ``None``
	            means no upper limit.

	    Returns:
	        An ``OrderedMultiIndexMapWeakRef`` holding one entry per emitted
	        slide, inserted as ``insert(slide_text, main_title)``.
	    """
	    elements = partition_pptx(file=file_like)

	    # NOTE(review): a ProcessPoolExecutor pass over ``process_text`` used to
	    # sit here. It mapped a two-argument function over a single, undefined
	    # iterable and its result was never read, so it has been dropped until
	    # the fragment-merging step is actually wired into the output.

	    # Slide key -> cleaned, non-empty texts, in document order (dicts keep
	    # insertion order).
	    texts_by_slide = {}
	    for elem in elements:
	        # ``clean`` is not imported in this file; ``clean_bullets`` plus
	        # ``strip`` is what ``clean(text, bullets=True)`` amounts to.
	        elem.text = clean_bullets(elem.text).strip()
	        elem_type = elem.to_dict()['type']
	        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

	        # Skips page-break markers and elements emptied by cleaning.
	        if not elem.text:
	            continue
	        texts_by_slide.setdefault(elem.metadata.page_number, []).append(elem.text)

	    chunks = OrderedMultiIndexMapWeakRef()
	    metadata_main_title = ''

	    for order, (page_number, slide_texts) in enumerate(texts_by_slide.items()):
	        # presumably page_number is the 1-based slide number -- TODO confirm;
	        # fall back to encounter order when the partitioner leaves it unset.
	        pn = order if page_number is None else page_number - 1
	        if pn < from_page:
	            continue
	        if to_page is not None and pn >= to_page:
	            break

	        # A lone text element marks a section-title slide: remember it and
	        # emit nothing for the slide itself.
	        if len(slide_texts) == 1:
	            metadata_main_title = slide_texts[0]
	            continue

	        # NOTE(review): newline-joining is an assumption; the original never
	        # defined how ``slide_text`` was assembled.
	        slide_text = '\n'.join(slide_texts)
	        print(f'SLIDE TEXT: {slide_text}')
	        chunks.insert(slide_text, metadata_main_title)

	    return chunks