|
from io import StringIO |
|
from typing import List |
|
|
|
import pathos.multiprocessing as mp |
|
from unstructured.partition.pptx import partition_pptx |
|
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars |
|
|
|
from ordered_multimap import OrderedMultiIndexMapWeakRef |
|
|
|
# Uppercase function words that a POS tagger may mislabel as PROPN in
# all-caps titles; process_chunk consults this set (against the token's
# text) when deciding whether a title line continues onto the next one.
COMMON_WORDS = {"BY", "IN", "ON", "AT", "WITH", "FROM", "TO", "AND", "OR", "BUT"}
|
|
|
def process_chunk(chunk, nlp):
    """Merge consecutive ALL-CAPS 'Title' elements that were split in two.

    A heading such as "RESULTS BY" / "REGION" is often partitioned into two
    Title elements. When an all-caps Title ends in a function word (adposition,
    conjunction, determiner, ...) and the next element is also an all-caps
    Title, the earlier text is prepended to the later element and the earlier
    element is removed.

    Args:
        chunk: pair ``(page_id, elements)`` where ``elements`` is a list of
            ``[category, text]`` lists; mutated in place.
        nlp: callable (e.g. a spaCy pipeline) that returns a sequence of
            tokens exposing ``.pos_`` and ``.text``.

    Returns:
        The same ``chunk`` object, with merged titles.
    """
    function_word_tags = ("ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX", 'SCONJ', 'CONJ', "CCONJ")
    merged_indices = []

    for i, current in enumerate(chunk[1]):
        # Only all-caps Title elements are merge candidates.
        if not (isinstance(current, list) and current[0] == 'Title'):
            continue
        current_text = current[1]
        if not current_text.isupper():
            continue

        # EAFP: the last element has no successor to merge into.
        try:
            nxt = chunk[1][i + 1]  # renamed: `next` shadows the builtin
        except IndexError:
            continue

        if not (isinstance(nxt, list) and nxt[0] == 'Title' and nxt[1].isupper()):
            continue

        # Tokenize only once a merge is actually possible (nlp calls are costly).
        tokens = nlp(current_text)
        if not tokens:  # guard: whitespace-only text yields no last token
            continue
        last = tokens[-1]
        print(f'TOKEN: {current_text}, {last}, {last.pos_}')

        # Merge when the title ends in a function word, or in a word the
        # tagger labels PROPN (all-caps confuses taggers) that we know is a
        # common function word.  BUGFIX: compare the token *text* against
        # COMMON_WORDS — a token object never equals a str, so the original
        # `tokens[-1] in COMMON_WORDS` was always False.
        if last.pos_ in function_word_tags or (last.pos_ == 'PROPN' and last.text in COMMON_WORDS):
            chunk[1][i + 1][1] = current_text + ' ' + nxt[1]
            merged_indices.append(i)

    # BUGFIX: delete from the end so earlier deletions don't shift the
    # positions of indices still pending deletion.
    for i in reversed(merged_indices):
        del chunk[1][i]

    return chunk
|
|
|
def ppt_chunk(file_like, model):
    """Partition a .pptx stream into per-slide chunks and merge split titles.

    Reads ``file_like`` with unstructured's ``partition_pptx``, groups the
    resulting elements into one chunk per slide (split on ``PageBreak``),
    folds consecutive ``ListItem`` elements into a single newline-joined
    entry, then runs ``process_chunk`` (with ``model`` as the NLP pipeline)
    over each chunk, printing debug output along the way.

    NOTE(review): the final loop references ``ppt``, ``from_page``,
    ``to_page`` and ``slide_text``, none of which are defined in this scope
    or visibly in this module — reaching that code raises ``NameError``.
    It looks like an unfinished python-pptx-based variant; confirm before
    relying on the returned ``weakDict``.
    """
    import time  # local import: used only for the debug timing below
    elements = partition_pptx(file=file_like)

    chunks = []           # finished (page_id, elements) pairs
    current_chunk = []    # elements accumulated for the slide in progress
    list_items = []       # consecutive ListItem texts, joined on flush

    for elem in elements:
        if elem.category == "PageBreak":
            # Flush the slide in progress; empty slides produce no chunk.
            if current_chunk or list_items:
                if list_items:
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        else:
            if elem.category == "ListItem":
                list_items.append(elem.text)
            else:
                current_chunk.append([elem.category, elem.text])

    # Merge split all-caps titles in place; the timing is debug-only.
    # (process_chunk mutates each chunk, so the rebinding is redundant.)
    sr = time.time()
    for chunk in chunks:
        chunk = process_chunk(chunk, model)
    er = time.time()
    fr = er - sr
    print(f'TIME {fr}')

    print('PASSED AFTER')

    # Debug dump of the (possibly merged) chunk contents.
    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if type(sub_chunk) is tuple:
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    weakDict = OrderedMultiIndexMapWeakRef()

    metadata_main_title = ''

    # NOTE(review): `ppt`, `from_page` and `to_page` are undefined here —
    # this loop cannot run as written (NameError at `ppt.slides`).
    for pn, slide in enumerate(ppt.slides):
        if pn < from_page:
            continue
        if pn >= to_page:
            break

        # Skip slides that contain no shapes at all.
        try:
            _ = slide.shapes[0]
        except IndexError:
            continue

        text_shapes = [shape for shape in slide.shapes if shape.has_text_frame]

        # A lone text shape is treated as the slide's main title and carried
        # forward as metadata for subsequent slides.
        if len(text_shapes) == 1:
            metadata_main_title = text_shapes[0].text_frame.text
            continue

        # NOTE(review): `slide_text` is never assigned — undefined name.
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict
|
|
|
# NOTE(review): this `raise` follows `ppt_chunk`'s `return weakDict` and, as
# laid out here, is dead code at module top level (importing the module would
# raise). It was presumably meant to sit in an enclosing if/else dispatch on
# file type — confirm the intended placement.
raise NotImplementedError(
    "file type not supported yet(pptx)")