from io import StringIO
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor

import spacy
from unstructured.partition.pptx import partition_pptx
from unstructured.cleaners.core import clean_ordered_bullets, clean_bullets, clean_non_ascii_chars

from ordered_multimap import OrderedMultiIndexMapWeakRef

# Part-of-speech tags that mark a fragment as ending mid-phrase.
_DANGLING_POS = frozenset(
    {"ADP", "PART", "PRON", "DET", "AUX", "SCONJ", "CONJ", "CCONJ"}
)


def process_text(text_1, text_2):
    """Join *text_2* onto *text_1* when *text_1* ends on a dangling word.

    The last token of *text_1* is inspected; if its coarse part-of-speech
    tag is one of ``_DANGLING_POS`` the two fragments are joined with a
    single space, otherwise *text_1* is returned unchanged.

    NOTE(review): ``nlp`` is looked up at module scope and is not defined
    in the visible part of this file -- presumably a loaded spaCy pipeline
    (the file imports ``spacy``); confirm it is created before this runs.

    Args:
        text_1: Leading text fragment.
        text_2: Fragment that follows *text_1*.

    Returns:
        ``text_1 + ' ' + text_2`` when *text_1* ends on a dangling word,
        otherwise *text_1*.
    """
    tokens = nlp(text_1)
    # An empty or whitespace-only fragment yields no tokens, so there is
    # no last token to inspect.
    if len(tokens) == 0:
        return text_1
    if tokens[-1].pos_ in _DANGLING_POS:
        return text_1 + ' ' + text_2
    return text_1

def _slide_texts(elements):
    """Group non-empty element texts by slide number, in document order."""
    slides = {}
    current_page = 1
    for elem in elements:
        # NOTE(review): assumes unstructured fills ``metadata.page_number``
        # with the 1-based slide number for PPTX input -- confirm. Elements
        # without a page number stay on the slide seen most recently.
        page = getattr(getattr(elem, 'metadata', None), 'page_number', None)
        if page is not None:
            current_page = page
        if elem.text:
            slides.setdefault(current_page, []).append(elem.text)
    return slides


def ppt_chunk(file_like, model, from_page=0, to_page=None):
    """Split a PPTX file into per-slide text chunks keyed to a section title.

    The presentation is partitioned with ``partition_pptx`` and the text of
    every element is cleaned (bullets stripped). Elements are then grouped
    by slide. A slide with exactly one text element is treated as a section
    title: its text becomes the title attached to the slides that follow
    and the slide itself is not emitted. Every other slide with text is
    inserted into the result as ``(slide_text, current_title)``, where
    ``slide_text`` is the slide's element texts joined with newlines.

    TODO: merging fragments that were split mid-phrase (``process_text``)
    is not applied here yet.

    Args:
        file_like: Binary file-like object holding the ``.pptx`` content.
        model: Currently unused; kept so existing callers keep working.
        from_page: Zero-based index of the first slide to include.
        to_page: Zero-based index one past the last slide to include, or
            ``None`` for no upper bound.

    Returns:
        An ``OrderedMultiIndexMapWeakRef`` with one entry per emitted slide.
    """
    # Imported locally: ``clean`` lives in the same module as the cleaners
    # this file already imports, but is not in the top-level import list.
    from unstructured.cleaners.core import clean

    elements = partition_pptx(file=file_like)

    for elem in elements:
        elem.text = clean(elem.text, bullets=True)
        elem_type = elem.to_dict()['type']
        print(f'UNSTRUCTURED TEXT: {elem_type} , {elem.text}')

    chunks = OrderedMultiIndexMapWeakRef()
    metadata_main_title = ''

    for page_number, texts in _slide_texts(elements).items():
        pn = page_number - 1  # page numbers are 1-based, the window is 0-based
        if pn < from_page:
            continue
        if to_page is not None and pn >= to_page:
            continue

        # A lone text element is taken to be a section-title slide.
        if len(texts) == 1:
            metadata_main_title = texts[0]
            continue

        slide_text = '\n'.join(texts)
        print(f'SLIDE TEXT: {slide_text}')
        chunks.insert(slide_text, metadata_main_title)
    return chunks