Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Running on T4

File size: 6,162 Bytes

f6b288c
e3f2cf0
56374e1
388d88a
230b5de
30dee92
43fb5f8
0c73db6
3b7e1ca
def8b51
e26d78f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209bd42
36df6c5
3b7e1ca
 
0672b29
c213948
3b7e1ca
 
 
 
e3f2cf0
 
b329558
 
e3f2cf0
3b7e1ca
e3f2cf0
 
3b7e1ca
 
 
e3f2cf0
3b7e1ca
 
 
273b8a5
1d92d6e
bca2f2c
3b7e1ca
ad1ff1a
bca2f2c
e3f2cf0
e26d78f
3b7e1ca
952b8a3
b0871bc
3b7e1ca
952b8a3
3b7e1ca
 
7e2e235
3b7e1ca
e3f2cf0
e276989
e3f2cf0
36df6c5
 
e26d78f
f519f21
e26d78f
8e86a37
e4f1905
 
 
e26d78f
 
0c73db6
 
33e94b3
0c73db6
 
 
 
 
 
1bf9065
0c73db6
1bf9065
e26d78f
 
 
0c73db6
e26d78f
e4f1905
e26d78f
1bf9065
 
209bd42
36df6c5
e26d78f
744f6e9
836e4af
 
7575807
 
8e86a37
 
744f6e9
 
 
 
 
 
 
 
 
 
 
7575807
 
 
 
331d96b
 
 
7575807
01d064a
7575807
01d064a
331d96b
 
c3ec8ff
 
7575807
c3ec8ff
 
7575807
331d96b
 
c3ec8ff
 
7575807
c3ec8ff
 
01d064a
40e07b6
28de8cf
 
7575807
8a06fe4
744f6e9
 
 
 
 
 
a58d6a9
744f6e9
8e86a37
56374e1
 
b17fe4d

from io import StringIO
from typing import List

from unstructured.partition.pptx import partition_pptx

from ordered_multimap import OrderedMultiIndexMapWeakRef

# Upper-case tokens that process_chunk treats as "dangling" heading endings
# even when the tagger labels them PROPN/NOUN/VERB: a heading whose last
# token is one of these is merged with the heading that follows it.
WRONG_NOUNS = {
    "BY",
    "IN",
    "ON",
    "INTO",
    "A",
    "AT",
    "WITH",
    "FROM",
    "TO",
    "AND",
    "OR",
    "BUT",
    "OWN",
    "BEHIND",
    "THROUGH",
    "FIERCE",
    "'S",
    "&",
}

# Trailing characters that signal an element's text continues in the next
# element; ppt_chunk prepends such text to the following element.
NON_ENDING_PUNCT = {
    ",",
    ":",
    ";",
    "'",
    "/",
    "-",
}

def process_chunk(chunk, nlp):
    """Merge all-caps headings that were split across consecutive entries.

    A heading is a ``[category, text]`` list whose category is ``'Title'``
    or ``'UncategorizedText'`` and whose text is upper-case.  When such a
    heading is directly followed by another one and its last token looks
    like it cannot end a phrase (a function-word part of speech, or a word
    listed in ``WRONG_NOUNS``), its text is prepended to the following
    heading and the first entry is removed.

    Args:
        chunk: Two-item list ``[chunk_id, entries]``.  ``entries`` holds
            ``[category, text]`` lists and may also hold plain strings,
            which are never merged.  Modified in place.
        nlp: Callable taking a string and returning an indexable sequence
            of tokens exposing ``pos_`` and a ``str()`` form (presumably a
            spaCy pipeline -- confirm against the caller).

    Returns:
        The same ``chunk`` object, after merging.
    """
    heading_categories = {'Title', 'UncategorizedText'}
    # Parts of speech that should never be the last word of a heading.
    dangling_pos = {'SYM', "ADP", 'ADV', 'PART', 'PRON', 'DET', "AUX",
                    'SCONJ', 'CONJ', "CCONJ"}
    # Content-word tags that the tagger may wrongly assign to upper-case
    # function words; those are double-checked against WRONG_NOUNS.
    content_pos = {'PROPN', 'NOUN', 'VERB'}

    def is_caps_heading(entry):
        # The original compared against ('Title' or 'UncategorizedText'),
        # which is just 'Title'; a membership test covers both categories.
        return (
            isinstance(entry, list)
            and entry[1].isupper()
            and entry[0] in heading_categories
        )

    entries = chunk[1]
    merged = []

    # The last entry has no successor, so stop one short of the end.
    for i in range(len(entries) - 1):
        current = entries[i]
        following = entries[i + 1]

        if not (is_caps_heading(current) and is_caps_heading(following)):
            continue

        last_token = nlp(current[1])[-1]
        ends_dangling = (last_token.pos_ in dangling_pos) or (
            (last_token.pos_ in content_pos)
            and (str(last_token) in WRONG_NOUNS)
        )

        if ends_dangling:
            # Mutating ``following`` in place lets chains of fragments
            # accumulate: the next iteration sees the merged text.
            following[1] = current[1] + ' ' + following[1]
            merged.append(i)

    # Delete from the back so earlier deletions do not shift the indices
    # that are still pending.
    for i in reversed(merged):
        del entries[i]

    return chunk

def _flush_slide(chunks, chunk_id, current_chunk, list_items, marked):
    """Append one slide's accumulated content to ``chunks``.

    Entries whose text was merged into a following element (``marked``)
    are dropped.  Bullet/narrative texts are joined, in first-seen order,
    into a single newline-separated string placed last.  Nothing is
    appended when the slide accumulated no content at all.
    """
    if not (current_chunk or list_items):
        return

    entries = [entry for entry in current_chunk if entry[1] not in marked]
    bullets = [text for text in list_items if text not in marked]

    # Skip the joined string when every bullet was merged away; an empty
    # string entry would break later code that indexes into entries.
    if bullets:
        entries.append("\n".join(bullets))

    chunks.append([chunk_id, entries])


def ppt_chunk(file_like, nlp):
    """Partition a .pptx file into per-slide text chunks and tables.

    Steps:
      1. Partition the file with ``partition_pptx`` and group elements by
         slide (``PageBreak`` elements delimit slides).  Text ending in a
         ``NON_ENDING_PUNCT`` character is prepended to the next element.
      2. Merge split all-caps headings in each chunk via ``process_chunk``.
      3. Move ``Table`` entries out of the chunks into a separate list.
      4. Insert each chunk's text into an ``OrderedMultiIndexMapWeakRef``
         together with the main/sub titles found just before it.

    Timing and debug information is printed to stdout.

    Args:
        file_like: File-like object passed to ``partition_pptx(file=...)``.
        nlp: Tokenizer/tagger callable forwarded to ``process_chunk``.

    Returns:
        Tuple ``(index, tables)``: the populated
        ``OrderedMultiIndexMapWeakRef`` and a list of
        ``[chunk_id, title, table_entry]`` or ``[chunk_id, table_entry]``
        lists (the latter when the slide does not start with a heading).
    """
    import time

    start = time.time()
    elements = partition_pptx(file=file_like)
    print(f'TIME {time.time() - start}')

    chunks = []
    current_chunk = []
    # dict used as an insertion-ordered set: de-duplicates bullet texts
    # while keeping them in slide order (a plain set scrambles them).
    list_items = {}
    marked = set()

    for i, elem in enumerate(elements):
        if elem.category == "PageBreak":
            _flush_slide(chunks, elem.id, current_chunk, list_items, marked)
            current_chunk = []
            list_items = {}
            continue

        text = elem.text
        has_next = i + 1 < len(elements)

        # Only merge forward when there is a real successor on the same
        # slide; empty text has no last character to inspect.
        if (
            text
            and (text[-1] in NON_ENDING_PUNCT)
            and (elem.category != 'Table')
            and has_next
            and (elements[i + 1].category != "PageBreak")
        ):
            elements[i + 1].text = text + ' ' + elements[i + 1].text
            marked.add(text)

        if elem.category in ("ListItem", "NarrativeText"):
            list_items[text] = None
        else:
            current_chunk.append([elem.category, text])

    # Content after the final PageBreak (or a deck with no PageBreak at
    # all) would otherwise be lost; there is no PageBreak id to use, so the
    # last element's id identifies that chunk.
    if elements:
        _flush_slide(chunks, elements[-1].id, current_chunk, list_items, marked)

    # process_chunk mutates each chunk in place.
    for chunk in chunks:
        process_chunk(chunk, nlp)

    # Separate tables from text; chunks left with no text are discarded.
    tables = []
    text_chunks = []

    for chunk_id, sub_chunks in chunks:
        remaining = []
        title = ''

        for i, sub_chunk in enumerate(sub_chunks):
            print(f'TEST : {sub_chunk}')
            is_entry = isinstance(sub_chunk, list)

            if (i == 0) and is_entry and (sub_chunk[0] in ('Title', 'UncategorizedText')):
                title = sub_chunk[1]

            if is_entry and sub_chunk[0] == 'Table':
                if title != '':
                    tables.append([chunk_id, title, sub_chunk])
                else:
                    tables.append([chunk_id, sub_chunk])
            else:
                remaining.append(sub_chunk)

        if remaining:
            text_chunks.append([chunk_id, remaining])

    chunks = text_chunks

    print(f'TIME INTERMEDIATE {time.time() - start}')

    index = OrderedMultiIndexMapWeakRef()
    # Titles persist across chunks, as in the original implementation.
    metadata_main_title = None
    metadata_sub_title = None

    for chunk_id, sub_chunks in chunks:
        nb_titles = 0
        nb_sub_titles = 0
        last_index = len(sub_chunks) - 1

        for i, sub_chunk in enumerate(sub_chunks):
            if isinstance(sub_chunk, list):
                if sub_chunk[0] == 'Title':
                    nb_titles += 1
                elif sub_chunk[0] == 'UncategorizedText':
                    nb_sub_titles += 1
            elif (nb_titles <= 1) and (nb_sub_titles <= 1):
                # Explicit bounds checks: negative indices wrap around in
                # Python, so "no previous entry" must be tested by index.
                if i == 0:
                    index.insert(chunk_id, sub_chunk)
                    break

                previous = sub_chunks[i - 1]

                if previous[0] == 'UncategorizedText':
                    metadata_sub_title = previous[1]

                    if i == 1:
                        index.insert(chunk_id, sub_chunk, metadata_sub_title)
                        break

                    before = sub_chunks[i - 2]
                    if before[0] == 'Title':
                        metadata_main_title = before[1]
                        index.insert(chunk_id, sub_chunk, metadata_main_title, metadata_sub_title)
                        break
                elif previous[0] == 'Title':
                    metadata_main_title = previous[1]
                    index.insert(chunk_id, sub_chunk, metadata_main_title)
                    break

            # Fallback: reached the end of the slide without a titled
            # insert, so index all of the slide's text as one entry.
            if i == last_index:
                index.insert(
                    chunk_id,
                    "\n".join(
                        part[1] if isinstance(part, list) else part
                        for part in sub_chunks
                    ),
                )

    print(f'TIME FINAL {time.time() - start}')

    return index, tables