from io import StringIO
from typing import List

import pathos.multiprocessing as mp
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars

from ordered_multimap import OrderedMultiIndexMapWeakRef

# Part-of-speech tags that, when they end a title, mark it as unfinished
# (the title is assumed to continue in the next block).
_DANGLING_POS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def _is_upper_title(block):
    """Return True for a ``['Title', text]`` block whose text is upper-case."""
    return isinstance(block, list) and block[0] == 'Title' and block[1].isupper()


def process_chunk(chunk, nlp):
    """Merge upper-case titles that were split across consecutive blocks.

    ``chunk`` is a pair whose second item is a list of blocks. A block is
    either a ``[category, text]`` list or a plain string; strings are never
    merged. When two adjacent blocks are both upper-case ``'Title'`` blocks
    and the last token of the first has a part-of-speech tag in
    ``_DANGLING_POS``, the first title's text is prepended to the second and
    the first block is removed. Merges chain, so three split fragments
    collapse into a single title.

    Args:
        chunk: ``(chunk_id, blocks)``; ``blocks`` is modified in place.
        nlp: Callable taking a string and returning an indexable sequence of
            tokens that expose a ``pos_`` attribute.

    Returns:
        The same ``chunk`` object, after merging.
    """
    blocks = chunk[1]
    absorbed = []

    # ``blocks[1:]`` is a shallow copy: the inner lists are shared, so text
    # written into ``following`` is what the next iteration sees as
    # ``current`` -- this is what lets merges chain.
    for index, (current, following) in enumerate(zip(blocks, blocks[1:])):
        if not (_is_upper_title(current) and _is_upper_title(following)):
            continue

        tokens = nlp(current[1])
        if tokens[-1].pos_ in _DANGLING_POS:
            following[1] = current[1] + ' ' + following[1]
            absorbed.append(index)

    # Delete from the end so earlier deletions do not shift the positions of
    # the indices that are still pending.
    for index in reversed(absorbed):
        del blocks[index]

    return chunk

def _block_text(block):
    """Return the text of a slide block (``[category, text]`` list or string)."""
    return block[1] if isinstance(block, list) else block


def _group_by_slide(elements):
    """Split partitioned elements into ``(slide_index, (chunk_id, blocks))``.

    A ``PageBreak`` element closes the current slide. ``ListItem`` texts of a
    slide are collected and appended as one newline-joined string after the
    slide's other blocks; every other element becomes ``[category, text]``.
    Slides without any text produce no entry but still advance
    ``slide_index``. Content after the last ``PageBreak`` is flushed as a
    final slide, using the id of the last element seen as its chunk id.
    """
    slides = []
    blocks = []
    list_items = []
    slide_index = 0
    last_id = None

    def close_slide(chunk_id):
        nonlocal blocks, list_items
        if list_items:
            blocks.append("\n".join(list_items))
        if blocks:
            slides.append((slide_index, (chunk_id, blocks)))
        blocks, list_items = [], []

    for elem in elements:
        if elem.category == "PageBreak":
            close_slide(elem.id)
            slide_index += 1
        elif elem.category == "ListItem":
            list_items.append(elem.text)
            last_id = elem.id
        else:
            blocks.append([elem.category, elem.text])
            last_id = elem.id

    # The final slide is not necessarily followed by a PageBreak; without
    # this flush its content would be silently dropped.
    close_slide(last_id)
    return slides


def ppt_chunk(file_like, model, from_page=0, to_page=None):
    """Partition a PPTX file and index each slide's text under a main title.

    The file is partitioned with ``partition_pptx``, grouped into slides,
    and each slide in the requested range is passed through
    ``process_chunk`` to merge split upper-case titles. A slide holding
    exactly one text block updates the running main title and is not
    inserted; every other slide is inserted into the result as
    ``insert(slide_text, main_title)``.

    NOTE(review): the previous body iterated an undefined python-pptx ``ppt``
    object and read undefined ``from_page`` / ``to_page`` / ``slide_text``
    names, so it always raised ``NameError``. The single-text-block rule
    mirrors the old ``len(text_shapes) == 1`` check and ``slide_text`` is
    taken to be the slide's blocks joined by newlines -- confirm both
    against the intended behaviour.

    Args:
        file_like: File object forwarded to ``partition_pptx(file=...)``.
        model: Callable forwarded to ``process_chunk`` as its ``nlp``
            argument.
        from_page: Zero-based index of the first slide to include.
        to_page: Zero-based index one past the last slide to include;
            ``None`` means no upper bound.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` populated with one entry per
        non-title slide in the range.
    """
    import time

    elements = partition_pptx(file=file_like)
    slides = [
        (slide_index, chunk)
        for slide_index, chunk in _group_by_slide(elements)
        if slide_index >= from_page
        and (to_page is None or slide_index < to_page)
    ]

    started = time.perf_counter()
    for _, chunk in slides:
        # process_chunk edits the block list in place.
        process_chunk(chunk, model)
    print(f'TIME {time.perf_counter() - started}')

    print('PASSED AFTER')

    for _, (_, blocks) in slides:
        for i, block in enumerate(blocks):
            print(f'MODIFIED TEXT {i} : {_block_text(block)}')

    weakDict = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for _, (_, blocks) in slides:
        if len(blocks) == 1:
            # A slide with a single text block is treated as a section
            # title for the slides that follow it.
            metadata_main_title = _block_text(blocks[0])
            continue

        slide_text = "\n".join(_block_text(block) for block in blocks)
        print(f'SLIDE TEXT: {slide_text}')
        weakDict.insert(slide_text, metadata_main_title)

    return weakDict