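"""Run a RAGAS evaluation of a retrieval (RAG) pipeline against a LangSmith dataset.

Reads a YAML config, loads and chunks the documents under `data_dir`, indexes
the chunks into a Qdrant Cloud collection named after the experiment, builds a
retriever over that collection, and evaluates it with the configured metrics
via `eval_on_ls_dataset`.

Usage (invocation sketch; substitute the actual script path in this repo):

    python run_eval.py --config path/to/config.yml
"""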
import os
from dotenv import load_dotenv
load_dotenv()

import yaml
import argparse


from policy_rag.text_utils import DocLoader
from policy_rag.vectorstore_utils import QdrantVectorstoreHelper
from policy_rag.app_utils import (
    CHUNK_METHOD,
    EMBEDDING_MODEL_SOURCE,
    get_chunk_func,
    get_embedding_model
)
from policy_rag.eval_utils import eval_on_ls_dataset


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default=None, help='YAML config file to run')
    parser.add_argument('--config_dir', default=None, help='Directory of YAML config files to run')
    args = parser.parse_args()

    # Only a single --config file is handled below; fail fast with a clear
    # message instead of crashing on open(None) when it is missing.
    if args.config is None:
        parser.error('--config is required (per-directory runs via --config_dir are not implemented here)')

    with open(args.config, 'r') as file:
        config_yml = yaml.safe_load(file)

    data_dir = config_yml['data_dir']
    chunk_method = config_yml['chunk_method']
    ls_project = config_yml['ls_project']
    ls_dataset_name = config_yml['ls_dataset_name']
    ls_experiment_name = config_yml['ls_experiment_name']
    vectorstore_model = config_yml['vectorstore_model']
    metrics = config_yml['metrics']
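
    # Sketch of the expected YAML layout. The keys mirror the lookups above;
    # the values shown are illustrative placeholders, not taken from the repo:
    #
    #   data_dir: data/policies            # directory of raw documents (hypothetical path)
    #   chunk_method: ...                  # consumed by get_chunk_func (see CHUNK_METHOD in app_utils)
    #   ls_project: policy-rag-eval        # LangSmith project name
    #   ls_dataset_name: policy-qa         # LangSmith dataset to evaluate against
    #   ls_experiment_name: exp-001        # also used as the Qdrant collection name
    #   vectorstore_model:
    #     vector_size: 1536                # only key read directly here; get_embedding_model uses the rest
    #   metrics: [...]                     # RAGAS metrics passed to eval_on_ls_dataset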

    os.environ['LANGCHAIN_PROJECT'] = ls_project
    os.environ['LANGCHAIN_TRACING_V2'] = 'false'


    # Load Raw Data
    print('Loading Docs')
    loader = DocLoader()
    docs = loader.load_dir(data_dir)


    # Chunk Docs
    print('Chunking Docs')
    chunk_func, chunk_func_args = get_chunk_func(chunk_method)
    print(chunk_func_args)
    chunks = chunk_func(docs=docs, **chunk_func_args)
    print(f"number of chunks: {len(chunks)}")


    # Load chunks into vectorstore
    print('Creating Qdrant Collection and Getting Retriever')
    embedding_model = get_embedding_model(vectorstore_model)
    qdrant_vectorstore = QdrantVectorstoreHelper()
    qdrant_vectorstore.create_cloud_vectorstore(
        chunks=chunks,
        collection_name=ls_experiment_name,
        embedding_model=embedding_model,
        vector_size=vectorstore_model['vector_size']
    )
    retriever = qdrant_vectorstore.get_retriever(
        collection_name=ls_experiment_name,
        embedding_model=embedding_model,
        k=3  # number of chunks returned per query
    )
    
    # Run RAGAS Evaluation in LangSmith
    result = eval_on_ls_dataset(
        metrics=metrics,
        retriever=retriever,
        ls_dataset_name=ls_dataset_name,
        ls_project_name=ls_project,
        ls_experiment_name=ls_experiment_name
    )
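
    # `result` holds whatever eval_on_ls_dataset returns (assumed to be the
    # experiment/run summary); print it so command-line runs surface the scores.
    print(result)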