File size: 1,107 Bytes
4fcfa11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy as np


def query_pinecone(dense_vec, top_k, index, indices):
    """Run a filtered dense-vector query against ``index`` and return its matches.

    The query is restricted by a metadata filter: the ``QA_Flag`` field must
    equal ``"Answer"`` and the ``index`` field must be one of ``indices``.

    Args:
        dense_vec: Query vector, forwarded to ``index.query`` as ``vector``.
        top_k: Forwarded to ``index.query`` as ``top_k``.
        index: Object exposing a ``query`` method (a Pinecone index, going by
            the function name).
        indices: Allowed values for the ``index`` metadata field.

    Returns:
        The ``"matches"`` entry of the query response.
    """
    metadata_filter = {
        "QA_Flag": {"$eq": "Answer"},
        "index": {"$in": indices},
    }
    response = index.query(
        vector=dense_vec,
        top_k=top_k,
        filter=metadata_filter,
        include_metadata=True,
    )
    matches = response["matches"]
    return matches


def format_query(query_results):
    """Reduce search hits to ``(text, score)`` pairs.

    Args:
        query_results: Iterable of hits; each must provide
            ``hit["metadata"]["Text"]`` and ``hit["score"]``.

    Returns:
        A list of ``(text, score)`` tuples in the same order as the input.
    """
    pairs = []
    for hit in query_results:
        passage = hit["metadata"]["Text"]
        pairs.append((passage, hit["score"]))
    return pairs


def format_context(context):
    """Render ``(text, score)`` pairs as display strings.

    Args:
        context: Iterable of two-item ``(text, score)`` pairs.

    Returns:
        A list with one string per pair, formatted as the text on a
        ``Text:`` line followed by the score on a ``Cosine Similarity:`` line.
    """
    return [
        f"Text: {passage}\nCosine Similarity: {similarity}"
        for passage, similarity in context
    ]


def get_bm25_search_hits(corpus, sparse_scores, top_n=50):
    """Return the first ``top_n`` entries of ``sparse_scores``.

    Args:
        corpus: Unused; kept so existing call sites keep working. The
            previous implementation looked up ``corpus[idx]`` for each hit
            but discarded the result.
        sparse_scores: Iterable of corpus indices, taken in iteration order.
            NOTE(review): presumably already sorted best-first by BM25
            score; confirm against the caller.
        top_n: Maximum number of indices to return. Values ``<= 0`` yield an
            empty list.

    Returns:
        A list holding at most ``top_n`` indices from ``sparse_scores``.
    """
    indices = []
    for idx in sparse_scores:
        # Stop at exactly top_n items; the earlier `<=` check admitted one
        # extra hit and kept scanning the rest of the input.
        if len(indices) >= top_n:
            break
        indices.append(idx)
    return indices


def retrieve_transcript(transcript_path="2020-Apr-28-AMD.txt"):
    """Read a transcript file and return its full contents as a string.

    Args:
        transcript_path: Path of the text file to read. Defaults to
            ``"2020-Apr-28-AMD.txt"``, resolved against the current working
            directory, which is the file the original implementation read.

    Returns:
        The entire file contents.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # The context manager closes the handle even if read() raises; the file
    # was previously left open. The platform default encoding is kept
    # unchanged so existing behavior is preserved.
    with open(transcript_path, "r") as transcript_file:
        return transcript_file.read()