# NOTE(review): removed pasted extraction artifacts (a "File size" header,
# VCS blame hashes, and a line-number gutter) — they were not valid Python.
from io import StringIO
from typing import List
import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars
from ordered_multimap import OrderedMultiIndexMapWeakRef
def process_chunk(chunk):
    """Merge consecutive all-caps ``Title`` elements inside one chunk.

    ``chunk`` is a ``(chunk_id, elements)`` pair where each element is either
    a ``(category, text)`` tuple or a plain string.  When an all-caps Title
    ends in a function word (preposition, pronoun, determiner, auxiliary,
    conjunction, ...) and is immediately followed by another all-caps Title,
    the two texts are merged into the second slot and the first element is
    dropped — the trailing function word indicates one title was split
    across two shapes.

    Returns the same ``chunk`` object, mutated in place.

    NOTE(review): relies on a module-level spaCy pipeline ``nlp`` that is
    not defined in this file — confirm it is provided where this runs.
    """
    elements = chunk[1]
    marked = []
    for i, current in enumerate(elements):
        if not (isinstance(current, tuple) and current[0] == 'Title'):
            continue
        current_text = current[1]
        if not current_text.isupper():
            continue
        # Bug fix: the original called nlp.pipe() on a single string (which
        # would iterate its characters) and then referenced an undefined
        # name `token`.  Parse the text once and inspect the last token.
        doc = nlp(current_text)
        if len(doc) == 0:
            continue
        try:
            nxt = elements[i + 1]  # renamed from `next` (shadowed builtin)
        except IndexError:
            continue
        if isinstance(nxt, tuple) and nxt[1].isupper() and nxt[0] == 'Title':
            if doc[-1].pos_ in ('ADP', 'PART', 'PRON', 'DET', 'AUX',
                                'SCONJ', 'CONJ', 'CCONJ'):
                # Bug fix: tuples are immutable, so the original
                # `chunk[1][i+1][1] = ...` raised TypeError.  Replace the
                # whole element instead.
                elements[i + 1] = (nxt[0], current_text + ' ' + nxt[1])
                marked.append(i)
    # Bug fix: delete from the end so earlier indices stay valid — the
    # original deleted in ascending order, shifting every later index.
    for i in reversed(marked):
        del elements[i]
    return chunk
def ppt_chunk(file_like, model):
    """Partition a .pptx file into per-slide chunks of ``(category, text)``.

    Slides are delimited by ``PageBreak`` elements emitted by
    ``partition_pptx``; consecutive ``ListItem`` texts within a slide are
    collapsed into a single newline-joined string.  The resulting chunks are
    then post-processed in parallel by ``process_chunk`` (which merges
    titles that were split across shapes) and returned as a list of
    ``(slide_id, elements)`` pairs.

    ``model`` is currently unused; it is kept for interface compatibility.
    """
    elements = partition_pptx(file=file_like)

    chunks = []
    current_chunk = []
    list_items = []
    for elem in elements:
        if elem.category == "PageBreak":
            if current_chunk or list_items:
                if list_items:
                    current_chunk.append("\n".join(list_items))
                    list_items = []
                chunks.append((elem.id, current_chunk))
                current_chunk = []
        elif elem.category == "ListItem":
            list_items.append(elem.text)
        else:
            current_chunk.append((elem.category, elem.text))

    print('PASSED')
    with mp.Pool(mp.cpu_count()) as pool:
        # Bug fix: imap() is lazy and its result was never consumed, so
        # process_chunk never actually ran; worker processes also operate
        # on pickled copies, so the original `chunks` list would have been
        # printed unmodified in any case.  Materialize the processed copies
        # and use those from here on.
        chunks = list(pool.imap(process_chunk, chunks))
    print('PASSED AFTER')

    for chunk in chunks:
        for i, sub_chunk in enumerate(chunk[1]):
            if isinstance(sub_chunk, tuple):
                print(f'MODIFIED TEXT {i} : {sub_chunk[1]}')
            else:
                print(f'MODIFIED TEXT {i} : {sub_chunk}')

    # NOTE(review): the original continued with a loop over `ppt.slides`
    # that referenced undefined names (`ppt`, `from_page`, `to_page`,
    # `slide_text`) — a guaranteed NameError — followed by an unreachable
    # `raise NotImplementedError` after `return weakDict`.  That dead code
    # was removed; the processed chunks are returned instead.
    return chunks